npm - @kevinrabun/judges-cli - Versions diffs - 3.128.3 → 3.129.1 - Mend

@kevinrabun/judges-cli 3.128.3 → 3.129.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/dist/api.d.ts +1 -0
package/dist/api.js +2 -0
package/dist/cli-dispatch.js +2 -0
package/dist/cli.js +2 -0
package/dist/commands/codify-amendments.js +28 -5
package/dist/commands/external-benchmarks.d.ts +118 -0
package/dist/commands/external-benchmarks.js +296 -0
package/dist/commands/martian-code-review-benchmark.d.ts +61 -0
package/dist/commands/martian-code-review-benchmark.js +689 -0
package/dist/commands/openssf-cve-benchmark.d.ts +96 -0
package/dist/commands/openssf-cve-benchmark.js +659 -0
package/package.json +1 -1

package/dist/api.d.ts CHANGED Viewed

@@ -74,6 +74,7 @@ export type { BenchmarkCase, BenchmarkResult, BenchmarkGateOptions, BenchmarkGat
 export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, selectJudgesForCategory, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
 export type { LlmBenchmarkSnapshot, LlmCaseResult } from "./commands/llm-benchmark.js";
 export type { LlmFinding, ValidationResult } from "./probabilistic/llm-response-validator.js";
+export { convertAllToBenchmarkCases as convertMartianToBenchmarkCases } from "./commands/martian-code-review-benchmark.js";
 export type { PromptAmendment, OptimizerInsight, OptimizationResult, AmendmentStore, } from "./commands/llm-benchmark-optimizer.js";
 export { optimizeBenchmark, mergeAmendments, createEmptyStore, formatAmendmentSection, } from "./commands/llm-benchmark-optimizer.js";
 export { runReviewAutopilot, dedupeComments, filterAlreadyPostedComments } from "./commands/review.js";

package/dist/api.js CHANGED Viewed

@@ -83,6 +83,8 @@ export { compareCapabilities, formatComparisonReport, formatFullComparisonMatrix
 export { runBenchmarkSuite, benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, analyzeL2Coverage, formatL2CoverageReport, ingestFindingsAsBenchmarkCases, deduplicateIngestCases, BENCHMARK_CASES, } from "./commands/benchmark.js";
 // ─── LLM Benchmark ──────────────────────────────────────────────────────────
 export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, selectJudgesForCategory, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
+// ─── External Benchmarks ────────────────────────────────────────────────────
+export { convertAllToBenchmarkCases as convertMartianToBenchmarkCases } from "./commands/martian-code-review-benchmark.js";
 export { optimizeBenchmark, mergeAmendments, createEmptyStore, formatAmendmentSection, } from "./commands/llm-benchmark-optimizer.js";
 // Review autopilot (GitHub App / scripts)
 export { runReviewAutopilot, dedupeComments, filterAlreadyPostedComments } from "./commands/review.js";

package/dist/cli-dispatch.js CHANGED Viewed

@@ -90,6 +90,7 @@ export const COMMAND_TABLE = {
     "event-leak": ["./commands/event-leak.js", "runEventLeak"],
     "evidence-chain": ["./commands/evidence-chain.js", "runEvidenceChain"],
     "example-leak": ["./commands/example-leak.js", "runExampleLeak"],
+    "external-benchmark": ["./commands/external-benchmarks.js", "runExternalBenchmark"],
     "exception-consistency": ["./commands/exception-consistency.js", "runExceptionConsistency"],
     "exec-report": ["./commands/exec-report.js", "runExecReport"],
     "explain-finding": ["./commands/explain-finding.js", "runExplainFinding"],
@@ -311,6 +312,7 @@ export const COMMAND_TABLE = {
     "null-safety-audit": ["./commands/null-safety-audit.js", "runNullSafetyAudit"],
     "observability-gap": ["./commands/observability-gap.js", "runObservabilityGap"],
     onboard: ["./commands/onboard.js", "runOnboard"],
+    "openssf-cve": ["./commands/openssf-cve-benchmark.js", "runOpenSSFCveBenchmark"],
     "org-metrics": ["./commands/org-metrics.js", "runOrgMetrics"],
     "org-policy": ["./commands/org-policy.js", "runOrgPolicy"],
     "over-abstraction": ["./commands/over-abstraction.js", "runOverAbstraction"],

package/dist/cli.js CHANGED Viewed

@@ -251,6 +251,8 @@ function printHelp() {
         ["judges feedback", "Track finding feedback (false positives)"],
         ["judges override", "Manage per-path rule overrides"],
         ["judges benchmark", "Run detection accuracy benchmarks"],
+        ["judges openssf-cve", "Run OpenSSF CVE Benchmark (real-world CVEs)"],
+        ["judges external-benchmark", "Run external benchmarks (OpenSSF, Martian, etc.)"],
         ["judges config", "Export/import shared team configs"],
         ["judges review", "Post inline review comments on a GitHub PR"],
         ["judges app serve", "Start GitHub App webhook server"],

package/dist/commands/codify-amendments.js CHANGED Viewed

@@ -47,17 +47,37 @@ function loadAmendments(filePath) {
         const store = JSON.parse(readFileSync(resolve(filePath), "utf8"));
         return store.amendments;
     }
-    // Try VS Code global storage
-    const appdata = process.env.APPDATA || process.env.HOME;
-    if (!appdata)
-        throw new Error("Cannot determine global storage path. Use --file to specify.");
-    const globalPath = join(appdata, "Code", "User", "globalStorage", "kevinrabun.judges-panel", "llm-benchmark-amendments.json");
+    const globalPath = getAmendmentStorePath();
     if (!existsSync(globalPath)) {
         throw new Error(`No amendments found at ${globalPath}. Run an LLM benchmark first, or use --file.`);
     }
     const store = JSON.parse(readFileSync(globalPath, "utf8"));
     return store.amendments;
 }
+/**
+ * Resolve the path to the VS Code global storage amendment file.
+ */
+function getAmendmentStorePath(filePath) {
+    if (filePath)
+        return resolve(filePath);
+    const appdata = process.env.APPDATA || process.env.HOME;
+    if (!appdata)
+        throw new Error("Cannot determine global storage path. Use --file to specify.");
+    return join(appdata, "Code", "User", "globalStorage", "kevinrabun.judges-panel", "llm-benchmark-amendments.json");
+}
+/**
+ * Clear the amendment store after codification to prevent double-application.
+ * Codified amendments live in the .judge.md files; keeping the runtime store
+ * causes them to be injected twice into LLM benchmark prompts.
+ */
+function clearAmendmentStore(filePath) {
+    const storePath = getAmendmentStorePath(filePath);
+    if (existsSync(storePath)) {
+        const emptyStore = { amendments: [], version: 1, history: [] };
+        writeFileSync(storePath, JSON.stringify(emptyStore, null, 2), "utf8");
+        console.log(`  🧹 Cleared amendment store at ${storePath}`);
+    }
+}
 /**
  * Codify a single amendment into a judge's .judge.md file by appending
  * to the FALSE POSITIVE AVOIDANCE section (or creating one if missing).
@@ -152,6 +172,9 @@ export function runCodifyAmendments(argv) {
     console.log("");
     console.log(`  ${dryRun ? "Would codify" : "Codified"} ${codified}/${amendments.length} amendment(s) into agent files.`);
     if (!dryRun && codified > 0) {
+        // Clear the amendment store so codified amendments aren't double-applied
+        // at runtime during the next LLM benchmark run
+        clearAmendmentStore(filePath);
         console.log("  Next steps:");
         console.log("    1. npm run generate:agents:force  — sync .ts files from .judge.md");
         console.log("    2. npm run build                  — rebuild");

package/dist/commands/external-benchmarks.d.ts ADDED Viewed

@@ -0,0 +1,118 @@
+/**
+ * External Benchmark Registry
+ *
+ * Provides a unified framework for running third-party benchmarks against
+ * Judges and producing comparable, per-suite scoring reports.
+ *
+ * Each benchmark registers as a named suite with its own adapter that knows
+ * how to load data, run evaluations, and produce a standardised result.
+ * Results are stored per-suite so they can be compared individually or
+ * aggregated into a composite scorecard.
+ *
+ * Usage:
+ *   judges external-benchmark run                       # Run all registered suites
+ *   judges external-benchmark run --suite openssf-cve   # Run one suite
+ *   judges external-benchmark list                      # List available suites
+ *   judges external-benchmark report                    # Composite report from saved results
+ */
+/**
+ * Standardised result format that every external benchmark adapter must produce.
+ * This makes results comparable across benchmarks and enables composite reports.
+ * Results are persisted as JSON by `saveResult` and read back by
+ * `loadLatestResult`.
+ */
+export interface ExternalBenchmarkResult {
+    /**
+     * Unique suite identifier (e.g. "openssf-cve", "martian-code-review").
+     * Also used as the file-name prefix when the result is saved.
+     */
+    suiteId: string;
+    /** Human-readable name */
+    suiteName: string;
+    /** URL to the benchmark's public repo / site */
+    suiteUrl: string;
+    /**
+     * ISO-8601 timestamp of this run. `saveResult` embeds it in the file name
+     * with `:` and `.` replaced by `-`.
+     */
+    timestamp: string;
+    /** Judges version used */
+    judgesVersion: string;
+    /** Total items evaluated (CVEs, PRs, test cases, etc.) */
+    totalItems: number;
+    /**
+     * Items successfully evaluated (excludes skipped/errored).
+     * This is the weight given to the suite's precision and recall in the
+     * composite report.
+     */
+    evaluatedItems: number;
+    /** Items that could not be evaluated */
+    skippedItems: number;
+    /** Precision (0–1) */
+    precision: number;
+    /** Recall (0–1) */
+    recall: number;
+    /** F1 score (0–1) */
+    f1Score: number;
+    /** Detection / match rate (0–1) */
+    detectionRate: number;
+    /** True positives count */
+    truePositives?: number;
+    /** False positives count */
+    falsePositives?: number;
+    /** False negatives count */
+    falseNegatives?: number;
+    /**
+     * Per-category breakdown (CWE, severity, language, etc.).
+     * The Markdown report lists categories sorted by `total`, descending, and
+     * prints `rate` as a whole-number percentage.
+     */
+    perCategory?: Record<string, {
+        total: number;
+        detected: number;
+        rate: number;
+    }>;
+    /** Suite-specific raw data (varies per benchmark) */
+    rawData?: unknown;
+}
+/**
+ * Configuration for a benchmark run.
+ * The CLI runner builds one of these per suite from its command-line options.
+ */
+export interface BenchmarkRunConfig {
+    /**
+     * Path to the benchmark repo / data directory.
+     * The CLI runner passes an absolute path, resolved from `--repo` or from
+     * the adapter's `defaultRepoPath`.
+     */
+    repoPath: string;
+    /** Restrict to a single item (CVE ID, PR URL, etc.) */
+    singleItem?: string;
+    /** Output format (the CLI runner defaults to "text") */
+    format?: "text" | "json" | "markdown";
+    /** Output file path */
+    outputPath?: string;
+}
+/**
+ * Adapter interface that every external benchmark must implement.
+ * Adapters become visible to the CLI once passed to `registerBenchmarkAdapter`.
+ */
+export interface ExternalBenchmarkAdapter {
+    /** Unique suite identifier (the key under which the adapter is registered) */
+    readonly suiteId: string;
+    /** Human-readable name */
+    readonly suiteName: string;
+    /** URL to the benchmark's public repo / site */
+    readonly suiteUrl: string;
+    /**
+     * Default path to look for the benchmark data.
+     * Used, resolved against the current working directory, when `--repo` is
+     * not given.
+     */
+    readonly defaultRepoPath: string;
+    /** Short description shown in `list` command */
+    readonly description: string;
+    /**
+     * Validate that the benchmark repo/data exists at the given path.
+     * Return an error message if not, or undefined if OK.
+     * When a message is returned the CLI runner prints it and skips the suite.
+     */
+    validate(repoPath: string): string | undefined;
+    /**
+     * Run the benchmark and return a standardised result.
+     * The CLI runner calls this synchronously, then saves and summarises the
+     * returned result.
+     */
+    run(config: BenchmarkRunConfig): ExternalBenchmarkResult;
+}
+/** Register an adapter under its `suiteId`; a later registration with the same id replaces the earlier one. */
+export declare function registerBenchmarkAdapter(adapter: ExternalBenchmarkAdapter): void;
+/** Look up a registered adapter by suite id; `undefined` when no adapter has that id. */
+export declare function getAdapter(suiteId: string): ExternalBenchmarkAdapter | undefined;
+/** Return all currently registered adapters as a new array. */
+export declare function listAdapters(): ExternalBenchmarkAdapter[];
+/**
+ * Scorecard combining the results of several suites, as produced by
+ * `computeCompositeReport`.
+ */
+export interface CompositeReport {
+    /** ISO-8601 time at which the composite report was computed */
+    timestamp: string;
+    /** The per-suite results the aggregate was computed from */
+    suites: ExternalBenchmarkResult[];
+    aggregate: {
+        /** Sum of `totalItems` over all suites */
+        totalItems: number;
+        /** Sum of `evaluatedItems` over all suites */
+        evaluatedItems: number;
+        /** Mean suite precision weighted by `evaluatedItems`; 0 when nothing was evaluated */
+        weightedPrecision: number;
+        /** Mean suite recall weighted by `evaluatedItems`; 0 when nothing was evaluated */
+        weightedRecall: number;
+        /** Harmonic mean of `weightedPrecision` and `weightedRecall`; 0 when their sum is not positive */
+        weightedF1: number;
+    };
+}
+/** Aggregate per-suite results into a composite report, weighting precision and recall by each suite's `evaluatedItems`. */
+export declare function computeCompositeReport(results: ExternalBenchmarkResult[]): CompositeReport;
+/** Render a composite report as Markdown: aggregate table, per-suite table, then one section per suite. */
+export declare function formatCompositeReport(report: CompositeReport): string;
+/**
+ * Write a result as JSON under `benchmarks/external` (relative to the current
+ * working directory), once under a timestamped name and once as
+ * `<suiteId>-latest.json`. Returns the path of the timestamped file.
+ */
+export declare function saveResult(result: ExternalBenchmarkResult): string;
+/** Read `<suiteId>-latest.json` from the results directory; `undefined` when the file does not exist. */
+export declare function loadLatestResult(suiteId: string): ExternalBenchmarkResult | undefined;
+/** Load the latest saved result of every registered adapter, omitting suites that have none. */
+export declare function loadAllLatestResults(): ExternalBenchmarkResult[];
+/**
+ * CLI entry point for `judges external-benchmark`. The subcommand is read
+ * from `argv[3]` ("run" when absent); "list" and "report" are the other
+ * recognised subcommands, and `--help` / `-h` prints usage.
+ */
+export declare function runExternalBenchmark(argv: string[]): Promise<void>;

package/dist/commands/external-benchmarks.js ADDED Viewed

@@ -0,0 +1,296 @@
+/**
+ * External Benchmark Registry
+ *
+ * Provides a unified framework for running third-party benchmarks against
+ * Judges and producing comparable, per-suite scoring reports.
+ *
+ * Each benchmark registers as a named suite with its own adapter that knows
+ * how to load data, run evaluations, and produce a standardised result.
+ * Results are stored per-suite so they can be compared individually or
+ * aggregated into a composite scorecard.
+ *
+ * Usage:
+ *   judges external-benchmark run                       # Run all registered suites
+ *   judges external-benchmark run --suite openssf-cve   # Run one suite
+ *   judges external-benchmark list                      # List available suites
+ *   judges external-benchmark report                    # Composite report from saved results
+ */
+import { existsSync, readFileSync, writeFileSync, mkdirSync } from "fs";
+import { resolve, join } from "path";
+// ─── Registry ───────────────────────────────────────────────────────────────
+const _adapters = new Map();
+export function registerBenchmarkAdapter(adapter) {
+    _adapters.set(adapter.suiteId, adapter);
+}
+export function getAdapter(suiteId) {
+    return _adapters.get(suiteId);
+}
+export function listAdapters() {
+    return [..._adapters.values()];
+}
+export function computeCompositeReport(results) {
+    let totalItems = 0;
+    let evaluatedItems = 0;
+    let weightedPrecSum = 0;
+    let weightedRecSum = 0;
+    for (const r of results) {
+        totalItems += r.totalItems;
+        evaluatedItems += r.evaluatedItems;
+        weightedPrecSum += r.precision * r.evaluatedItems;
+        weightedRecSum += r.recall * r.evaluatedItems;
+    }
+    const weightedPrecision = evaluatedItems > 0 ? weightedPrecSum / evaluatedItems : 0;
+    const weightedRecall = evaluatedItems > 0 ? weightedRecSum / evaluatedItems : 0;
+    const weightedF1 = weightedPrecision + weightedRecall > 0
+        ? (2 * weightedPrecision * weightedRecall) / (weightedPrecision + weightedRecall)
+        : 0;
+    return {
+        timestamp: new Date().toISOString(),
+        suites: results,
+        aggregate: { totalItems, evaluatedItems, weightedPrecision, weightedRecall, weightedF1 },
+    };
+}
+export function formatCompositeReport(report) {
+    const lines = [];
+    lines.push("# External Benchmark Scorecard");
+    lines.push("");
+    lines.push(`**Date:** ${report.timestamp}`);
+    lines.push("");
+    // Aggregate summary
+    lines.push("## Aggregate");
+    lines.push("");
+    lines.push("| Metric | Value |");
+    lines.push("|--------|-------|");
+    lines.push(`| Total Items | ${report.aggregate.totalItems} |`);
+    lines.push(`| Evaluated | ${report.aggregate.evaluatedItems} |`);
+    lines.push(`| Weighted Precision | ${(report.aggregate.weightedPrecision * 100).toFixed(1)}% |`);
+    lines.push(`| Weighted Recall | ${(report.aggregate.weightedRecall * 100).toFixed(1)}% |`);
+    lines.push(`| Weighted F1 | ${(report.aggregate.weightedF1 * 100).toFixed(1)}% |`);
+    lines.push("");
+    // Per-suite table
+    lines.push("## Per-Suite Results");
+    lines.push("");
+    lines.push("| Suite | Items | Detection Rate | Precision | Recall | F1 |");
+    lines.push("|-------|-------|---------------|-----------|--------|-----|");
+    for (const s of report.suites) {
+        lines.push(`| [${s.suiteName}](${s.suiteUrl}) | ${s.evaluatedItems}/${s.totalItems} ` +
+            `| ${(s.detectionRate * 100).toFixed(1)}% ` +
+            `| ${(s.precision * 100).toFixed(1)}% ` +
+            `| ${(s.recall * 100).toFixed(1)}% ` +
+            `| ${(s.f1Score * 100).toFixed(1)}% |`);
+    }
+    lines.push("");
+    // Per-suite detail sections
+    for (const s of report.suites) {
+        lines.push(`## ${s.suiteName}`);
+        lines.push("");
+        lines.push(`**Source:** ${s.suiteUrl}`);
+        lines.push(`**Items:** ${s.evaluatedItems} evaluated, ${s.skippedItems} skipped`);
+        lines.push("");
+        if (s.perCategory && Object.keys(s.perCategory).length > 0) {
+            lines.push("| Category | Total | Detected | Rate |");
+            lines.push("|----------|-------|----------|------|");
+            const entries = Object.entries(s.perCategory).sort((a, b) => b[1].total - a[1].total);
+            for (const [cat, data] of entries) {
+                lines.push(`| ${cat} | ${data.total} | ${data.detected} | ${(data.rate * 100).toFixed(0)}% |`);
+            }
+            lines.push("");
+        }
+    }
+    return lines.join("\n");
+}
+// ─── Result Persistence ─────────────────────────────────────────────────────
+const RESULTS_DIR = "benchmarks/external";
+function ensureResultsDir() {
+    const dir = resolve(RESULTS_DIR);
+    if (!existsSync(dir)) {
+        mkdirSync(dir, { recursive: true });
+    }
+    return dir;
+}
+export function saveResult(result) {
+    const dir = ensureResultsDir();
+    const fileName = `${result.suiteId}-${result.timestamp.replace(/[:.]/g, "-")}.json`;
+    const filePath = join(dir, fileName);
+    writeFileSync(filePath, JSON.stringify(result, null, 2), "utf-8");
+    // Also write a "latest" symlink-style file
+    const latestPath = join(dir, `${result.suiteId}-latest.json`);
+    writeFileSync(latestPath, JSON.stringify(result, null, 2), "utf-8");
+    return filePath;
+}
+export function loadLatestResult(suiteId) {
+    const latestPath = resolve(RESULTS_DIR, `${suiteId}-latest.json`);
+    if (!existsSync(latestPath))
+        return undefined;
+    return JSON.parse(readFileSync(latestPath, "utf-8"));
+}
+export function loadAllLatestResults() {
+    const results = [];
+    for (const adapter of listAdapters()) {
+        const r = loadLatestResult(adapter.suiteId);
+        if (r)
+            results.push(r);
+    }
+    return results;
+}
+// ─── CLI Entry Point ────────────────────────────────────────────────────────
+// Ensure adapters are registered when the CLI entry point is called.
+// Each adapter file calls registerBenchmarkAdapter() at module scope.
+let _adaptersLoaded = false;
+async function ensureAdaptersLoaded() {
+    if (_adaptersLoaded)
+        return;
+    _adaptersLoaded = true;
+    try {
+        await import("./openssf-cve-benchmark.js");
+    }
+    catch {
+        /* adapter unavailable */
+    }
+    try {
+        await import("./martian-code-review-benchmark.js");
+    }
+    catch {
+        /* adapter unavailable */
+    }
+}
+export async function runExternalBenchmark(argv) {
+    await ensureAdaptersLoaded();
+    const subcommand = argv[3] || "run";
+    if (subcommand === "--help" || subcommand === "-h") {
+        console.log(`
+Judges Panel — External Benchmark Runner
+Run third-party benchmarks to demonstrate Judges' capabilities and produce
+comparable, per-suite scoring reports.
+USAGE:
+  judges external-benchmark run [options]     Run benchmark suite(s)
+  judges external-benchmark list              List available suites
+  judges external-benchmark report [options]  Composite report from saved results
+OPTIONS:
+  --suite, -s <id>       Run a specific suite (default: all)
+  --repo, -r <path>      Override the benchmark repo path
+  --item <id>            Evaluate a single item (CVE ID, PR URL, etc.)
+  --output, -o <path>    Save results to file
+  --format <fmt>         Output: text, json, markdown (default: text)
+AVAILABLE SUITES:`);
+        for (const a of listAdapters()) {
+            console.log(`  ${a.suiteId.padEnd(24)} ${a.description}`);
+        }
+        console.log("");
+        process.exit(0);
+    }
+    if (subcommand === "list") {
+        console.log("\nAvailable external benchmark suites:\n");
+        for (const a of listAdapters()) {
+            console.log(`  ${a.suiteId.padEnd(24)} ${a.suiteName}`);
+            console.log(`  ${"".padEnd(24)} ${a.description}`);
+            console.log(`  ${"".padEnd(24)} ${a.suiteUrl}`);
+            console.log(`  ${"".padEnd(24)} Default path: ${a.defaultRepoPath}`);
+            console.log("");
+        }
+        return;
+    }
+    if (subcommand === "report") {
+        const results = loadAllLatestResults();
+        if (results.length === 0) {
+            console.error("No saved results found. Run benchmarks first with: judges external-benchmark run");
+            process.exit(1);
+        }
+        const report = computeCompositeReport(results);
+        let reportFormat = "markdown";
+        let outputPath;
+        for (let i = 4; i < argv.length; i++) {
+            if (argv[i] === "--format")
+                reportFormat = argv[++i];
+            else if (argv[i] === "--output" || argv[i] === "-o")
+                outputPath = argv[++i];
+        }
+        const output = reportFormat === "json" ? JSON.stringify(report, null, 2) : formatCompositeReport(report);
+        if (outputPath) {
+            writeFileSync(outputPath, output, "utf-8");
+            console.log(`Report saved to ${outputPath}`);
+        }
+        else {
+            console.log(output);
+        }
+        return;
+    }
+    // ── "run" subcommand ──
+    let suiteId;
+    let repoPath;
+    let singleItem;
+    let format = "text";
+    let outputPath;
+    for (let i = 4; i < argv.length; i++) {
+        const arg = argv[i];
+        if (arg === "--suite" || arg === "-s")
+            suiteId = argv[++i];
+        else if (arg === "--repo" || arg === "-r")
+            repoPath = argv[++i];
+        else if (arg === "--item")
+            singleItem = argv[++i];
+        else if (arg === "--output" || arg === "-o")
+            outputPath = argv[++i];
+        else if (arg === "--format")
+            format = argv[++i];
+    }
+    const adapters = suiteId ? [getAdapter(suiteId)].filter(Boolean) : listAdapters();
+    if (adapters.length === 0) {
+        if (suiteId) {
+            console.error(`Unknown suite: ${suiteId}. Use 'judges external-benchmark list' to see available suites.`);
+        }
+        else {
+            console.error("No benchmark adapters registered.");
+        }
+        process.exit(1);
+    }
+    const allResults = [];
+    for (const adapter of adapters) {
+        const effectiveRepo = repoPath ? resolve(repoPath) : resolve(adapter.defaultRepoPath);
+        console.log(`\n━━━ ${adapter.suiteName} ━━━`);
+        console.log(`Suite: ${adapter.suiteId}`);
+        console.log(`Repo:  ${effectiveRepo}`);
+        const validationError = adapter.validate(effectiveRepo);
+        if (validationError) {
+            console.error(`  ⚠️  ${validationError}`);
+            console.error(`  Skipping ${adapter.suiteId}.\n`);
+            continue;
+        }
+        const result = adapter.run({
+            repoPath: effectiveRepo,
+            singleItem,
+            format: format,
+            outputPath,
+        });
+        allResults.push(result);
+        // Save per-suite result
+        const savedPath = saveResult(result);
+        console.log(`\n  Results saved to ${savedPath}`);
+        // Print per-suite summary
+        console.log(`\n  Detection Rate: ${(result.detectionRate * 100).toFixed(1)}%`);
+        console.log(`  Precision:      ${(result.precision * 100).toFixed(1)}%`);
+        console.log(`  Recall:         ${(result.recall * 100).toFixed(1)}%`);
+        console.log(`  F1 Score:       ${(result.f1Score * 100).toFixed(1)}%`);
+    }
+    // Composite summary if multiple suites ran
+    if (allResults.length > 1) {
+        const report = computeCompositeReport(allResults);
+        console.log("\n━━━ Composite Scorecard ━━━");
+        console.log(`  Weighted Precision: ${(report.aggregate.weightedPrecision * 100).toFixed(1)}%`);
+        console.log(`  Weighted Recall:    ${(report.aggregate.weightedRecall * 100).toFixed(1)}%`);
+        console.log(`  Weighted F1:        ${(report.aggregate.weightedF1 * 100).toFixed(1)}%`);
+    }
+    if (outputPath && allResults.length > 0) {
+        const finalOutput = format === "json"
+            ? JSON.stringify(allResults.length === 1 ? allResults[0] : computeCompositeReport(allResults), null, 2)
+            : format === "markdown"
+                ? formatCompositeReport(computeCompositeReport(allResults))
+                : allResults.map((r) => `${r.suiteName}: F1=${(r.f1Score * 100).toFixed(1)}%`).join("\n");
+        writeFileSync(outputPath, finalOutput, "utf-8");
+        console.log(`\nResults saved to ${outputPath}`);
+    }
+}

package/dist/commands/martian-code-review-benchmark.d.ts ADDED Viewed

@@ -0,0 +1,61 @@
+/**
+ * Martian Code Review Benchmark Integration
+ *
+ * Adapter for the Martian Code Review Bench offline benchmark
+ * (https://github.com/withmartian/code-review-benchmark).
+ *
+ * 50 PRs from 5 major open-source projects (Sentry, Grafana, Cal.com,
+ * Discourse, Keycloak) with human-curated golden comments at severity
+ * levels Low/Medium/High/Critical.
+ *
+ * For each PR, Judges evaluates the diff and we match our findings
+ * against the golden comments using semantic similarity at the
+ * rule-prefix and description level.
+ */
+import type { Finding } from "../types.js";
+import type { BenchmarkCase } from "./benchmark.js";
+/** A human-curated golden review comment together with its severity level. */
+export interface MartianGoldenComment {
+    /** Text of the golden comment */
+    comment: string;
+    /** Severity level assigned to the comment */
+    severity: "Low" | "Medium" | "High" | "Critical";
+}
+/**
+ * One benchmark pull request with its golden comments.
+ * NOTE(review): the snake_case field names presumably mirror the benchmark's
+ * data files — confirm against the loader.
+ */
+export interface MartianPr {
+    pr_title: string;
+    url: string;
+    // NOTE(review): relationship between `url` and `original_url` is not visible here — confirm.
+    original_url?: string;
+    // NOTE(review): meaning of `az_comment` is not visible here — confirm.
+    az_comment?: string;
+    /** Golden comments recorded for this PR */
+    comments: MartianGoldenComment[];
+}
+/**
+ * Outcome of evaluating a single PR against its golden comments.
+ * NOTE(review): the notes below are inferred from field names and the file
+ * header only; confirm against the implementation.
+ */
+export interface MartianPrResult {
+    prTitle: string;
+    prUrl: string;
+    sourceRepo: string;
+    language: string;
+    // Presumably goldenComments = matchedComments + unmatchedComments — TODO confirm.
+    goldenComments: number;
+    matchedComments: number;
+    unmatchedComments: number;
+    falsePositives: number;
+    precision: number;
+    recall: number;
+    /** Findings reported for this PR */
+    findings: Finding[];
+    // Presumably one entry per golden comment that a finding was matched to — TODO confirm.
+    matches: Array<{
+        golden: string;
+        finding: string;
+        severity: string;
+    }>;
+    // Presumably the golden comments with no matching finding — TODO confirm.
+    missed: string[];
+}
+/**
+ * Load the golden-comment data from a benchmark checkout at `repoPath`.
+ * NOTE(review): the map key is presumably the source repository name —
+ * confirm against the implementation.
+ */
+export declare function loadGoldenComments(repoPath: string): Map<string, MartianPr[]>;
+/**
+ * Convert a Martian PR with golden comments into BenchmarkCase format
+ * for use in the LLM benchmark pipeline.
+ *
+ * Each golden comment becomes an expected finding. The PR diff provides
+ * the actual code to evaluate. The LLM judge determines if its review
+ * catches the same issues the human reviewer identified.
+ *
+ * NOTE(review): the conditions under which `undefined` is returned are not
+ * visible from this declaration — confirm against the implementation.
+ */
+export declare function convertPrToBenchmarkCase(pr: MartianPr, repoName: string, diff?: string): BenchmarkCase | undefined;
+/**
+ * Convert all Martian golden comments into BenchmarkCase[] for LLM evaluation.
+ * Fetches actual PR diffs from GitHub when possible.
+ */
+export declare function convertAllToBenchmarkCases(repoPath: string): BenchmarkCase[];