npm - @toolbaux/guardian - Versions diffs - 0.1.22 → 0.2.0 - Mend

@toolbaux/guardian 0.1.22 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

package/README.md +6 -4
package/dist/adapters/runner.js +72 -3
package/dist/adapters/typescript-adapter.js +24 -10
package/dist/benchmarking/metrics/context-coverage.js +82 -0
package/dist/benchmarking/metrics/drift-score.js +104 -0
package/dist/benchmarking/metrics/search-recall.js +207 -0
package/dist/benchmarking/metrics/token-efficiency.js +79 -0
package/dist/benchmarking/report.js +131 -0
package/dist/benchmarking/runner.js +175 -0
package/dist/benchmarking/types.js +13 -0
package/dist/cli.js +53 -10
package/dist/commands/benchmark.js +62 -0
package/dist/commands/context.js +87 -29
package/dist/commands/discrepancy.js +1 -1
package/dist/commands/doc-generate.js +1 -1
package/dist/commands/doc-html.js +1 -1
package/dist/commands/extract.js +4 -1
package/dist/commands/feature-context.js +1 -1
package/dist/commands/generate.js +83 -10
package/dist/commands/init.js +89 -56
package/dist/commands/intel.js +70 -1
package/dist/commands/mcp-serve.js +155 -316
package/dist/commands/search.js +642 -14
package/dist/config.js +1 -0
package/dist/db/embeddings.js +113 -0
package/dist/db/file-specs-store.js +174 -0
package/dist/db/fts-builder.js +390 -0
package/dist/db/index.js +55 -0
package/dist/db/specs-store.js +13 -0
package/dist/db/sqlite-specs-store.js +934 -0
package/dist/extract/codebase-intel.js +31 -2
package/dist/extract/compress.js +70 -3
package/dist/extract/context-block.js +11 -2
package/dist/extract/function-intel.js +5 -2
package/dist/extract/index.js +1 -23
package/dist/extract/writer.js +6 -0
package/package.json +4 -1

package/dist/benchmarking/report.js ADDED Viewed

@@ -0,0 +1,131 @@
+/**
+ * Guardian-Bench report formatter
+ *
+ * Renders BenchmarkSummary into human-readable text and JSON outputs.
+ * Designed for arXiv paper table extraction.
+ */
+export function renderReport(summary, format = "text") {
+    if (format === "json")
+        return JSON.stringify(summary, null, 2);
+    if (format === "markdown")
+        return renderMarkdown(summary);
+    return renderText(summary);
+}
+// ── Text ──────────────────────────────────────────────────────────────────────
+function renderText(summary) {
+    const { aggregate: agg, total_tasks, completed_tasks, failed_tasks } = summary;
+    const lines = [];
+    lines.push("Guardian-Bench Results");
+    lines.push("=".repeat(60));
+    lines.push(`Guardian version : ${summary.guardian_version}`);
+    lines.push(`Generated        : ${summary.generated_at}`);
+    lines.push(`Tasks            : ${completed_tasks}/${total_tasks} completed, ${failed_tasks} failed`);
+    lines.push("");
+    lines.push("Search Recall (k=5)");
+    lines.push("-".repeat(40));
+    lines.push(`  Mean precision@5  : ${pct(agg.search_recall.mean_precision_at_5)}`);
+    lines.push(`  Mean recall@5     : ${pct(agg.search_recall.mean_recall_at_5)}`);
+    lines.push(`  Mean F1@5         : ${pct(agg.search_recall.mean_f1_at_5)}`);
+    lines.push(`  Any-hit rate      : ${pct(agg.search_recall.any_hit_rate)}`);
+    lines.push("");
+    lines.push("Token Efficiency");
+    lines.push("-".repeat(40));
+    lines.push(`  Mean ratio        : ${agg.token_efficiency.mean_efficiency_ratio.toFixed(3)}`);
+    lines.push(`  Median ratio      : ${agg.token_efficiency.median_efficiency_ratio.toFixed(3)}`);
+    lines.push(`  Mean tokens saved : ${agg.token_efficiency.mean_tokens_saved.toLocaleString()}`);
+    lines.push(`  Total tokens saved: ${agg.token_efficiency.total_tokens_saved.toLocaleString()}`);
+    lines.push("");
+    lines.push("Drift Score");
+    lines.push("-".repeat(40));
+    lines.push(`  Mean drift increase    : ${agg.drift_score.mean_drift_increase.toFixed(3)}`);
+    lines.push(`  Tasks with patch       : ${agg.drift_score.tasks_with_patch}`);
+    lines.push(`  Stable post-patch      : ${agg.drift_score.tasks_with_stable_post_patch}`);
+    lines.push("");
+    lines.push("Context Coverage");
+    lines.push("-".repeat(40));
+    lines.push(`  Mean coverage     : ${pct(agg.context_coverage.mean_coverage)}`);
+    lines.push(`  Full coverage rate: ${pct(agg.context_coverage.full_coverage_rate)}`);
+    lines.push("");
+    if (summary.results.some(r => r.error)) {
+        lines.push("Failed Tasks");
+        lines.push("-".repeat(40));
+        for (const r of summary.results.filter(r => r.error)) {
+            lines.push(`  [${r.task_id}] ${r.error}`);
+        }
+        lines.push("");
+    }
+    return lines.join("\n");
+}
+// ── Markdown (paper table style) ─────────────────────────────────────────────
+function renderMarkdown(summary) {
+    const { aggregate: agg } = summary;
+    const lines = [];
+    lines.push(`# Guardian-Bench Results`);
+    lines.push(``);
+    lines.push(`**Guardian version:** ${summary.guardian_version} | **Tasks:** ${summary.completed_tasks}/${summary.total_tasks} | **Generated:** ${summary.generated_at}`);
+    lines.push(``);
+    lines.push(`## Aggregate Metrics`);
+    lines.push(``);
+    lines.push(`| Metric | Value |`);
+    lines.push(`|--------|-------|`);
+    lines.push(`| Search Recall — Precision@5 | ${pct(agg.search_recall.mean_precision_at_5)} |`);
+    lines.push(`| Search Recall — Recall@5 | ${pct(agg.search_recall.mean_recall_at_5)} |`);
+    lines.push(`| Search Recall — F1@5 | ${pct(agg.search_recall.mean_f1_at_5)} |`);
+    lines.push(`| Search Recall — Any-Hit Rate | ${pct(agg.search_recall.any_hit_rate)} |`);
+    lines.push(`| Token Efficiency — Mean Ratio | ${agg.token_efficiency.mean_efficiency_ratio.toFixed(3)}× |`);
+    lines.push(`| Token Efficiency — Median Ratio | ${agg.token_efficiency.median_efficiency_ratio.toFixed(3)}× |`);
+    lines.push(`| Token Efficiency — Mean Tokens Saved | ${agg.token_efficiency.mean_tokens_saved.toLocaleString()} |`);
+    lines.push(`| Drift Score — Mean Increase | ${agg.drift_score.mean_drift_increase.toFixed(3)} |`);
+    lines.push(`| Context Coverage — Mean | ${pct(agg.context_coverage.mean_coverage)} |`);
+    lines.push(`| Context Coverage — Full Coverage Rate | ${pct(agg.context_coverage.full_coverage_rate)} |`);
+    lines.push(``);
+    lines.push(`## Per-Task Results`);
+    lines.push(``);
+    lines.push(`| Task | Repo | P@5 | R@5 | F1@5 | Eff.Ratio | Coverage |`);
+    lines.push(`|------|------|-----|-----|------|-----------|----------|`);
+    for (const r of summary.results) {
+        const m = r.metrics;
+        lines.push(`| ${r.task_id} | ${r.repo} ` +
+            `| ${pct(m.search_recall.precision_at_k)} ` +
+            `| ${pct(m.search_recall.recall_at_k)} ` +
+            `| ${pct(m.search_recall.f1_at_k)} ` +
+            `| ${m.token_efficiency.efficiency_ratio.toFixed(3)}× ` +
+            `| ${pct(m.context_coverage.coverage)} |`);
+    }
+    lines.push(``);
+    return lines.join("\n");
+}
+// ── Helpers ───────────────────────────────────────────────────────────────────
+function pct(n) {
+    return `${(n * 100).toFixed(1)}%`;
+}
+/** Extract per-task rows suitable for pandas/CSV */
+export function toCSV(summary) {
+    const header = [
+        "task_id", "repo", "language", "source",
+        "precision_at_5", "recall_at_5", "f1_at_5", "any_hit",
+        "efficiency_ratio", "tokens_saved",
+        "drift_increase", "context_coverage",
+        "duration_ms", "error",
+    ].join(",");
+    const rows = summary.results.map(r => {
+        const m = r.metrics;
+        return [
+            r.task_id,
+            r.repo,
+            r.language ?? "",
+            r.source ?? "",
+            m.search_recall.precision_at_k,
+            m.search_recall.recall_at_k,
+            m.search_recall.f1_at_k,
+            m.search_recall.files_found.length > 0 ? 1 : 0,
+            m.token_efficiency.efficiency_ratio,
+            m.token_efficiency.tokens_saved,
+            m.drift_score.drift_increase ?? "",
+            m.context_coverage.coverage,
+            r.duration_ms,
+            r.error ? `"${r.error.replace(/"/g, "'")}"` : "",
+        ].join(",");
+    });
+    return [header, ...rows].join("\n");
+}

package/dist/benchmarking/runner.js ADDED Viewed

@@ -0,0 +1,175 @@
+/**
+ * Guardian-Bench runner
+ *
+ * Processes a JSONL file of BenchmarkTask entries, computes all 4 metrics
+ * for each task, and returns a BenchmarkSummary.
+ *
+ * Metrics are fully offline — no LLM API calls required.
+ */
+import fs from "node:fs/promises";
+import { measureSearchRecall } from "./metrics/search-recall.js";
+import { measureTokenEfficiency } from "./metrics/token-efficiency.js";
+import { measureDriftScore } from "./metrics/drift-score.js";
+import { measureContextCoverage } from "./metrics/context-coverage.js";
+export async function runBenchmark(options) {
+    const { tasksFile, specsDir, repoDir, k = 5, concurrency = 4 } = options;
+    // Load tasks from JSONL
+    const raw = await fs.readFile(tasksFile, "utf8");
+    const tasks = raw
+        .split("\n")
+        .map(l => l.trim())
+        .filter(l => l.length > 0 && !l.startsWith("//"))
+        .map(l => JSON.parse(l));
+    const results = [];
+    let completed = 0;
+    // Process tasks with limited concurrency
+    for (let i = 0; i < tasks.length; i += concurrency) {
+        const batch = tasks.slice(i, i + concurrency);
+        const batchResults = await Promise.all(batch.map(task => runTask(task, { specsDir, repoDir, k })));
+        for (const r of batchResults) {
+            results.push(r);
+            completed++;
+            options.onProgress?.(completed, tasks.length, r);
+        }
+    }
+    const guardianVersion = await readPackageVersion();
+    const summary = buildSummary(results, guardianVersion);
+    return summary;
+}
+async function runTask(task, opts) {
+    const start = Date.now();
+    const specsDir = opts.specsDir ?? task.specs_dir ?? ".specs";
+    const repoDir = opts.repoDir ?? task.repo_dir;
+    try {
+        const [searchRecall, tokenEfficiency, driftScore, contextCoverage] = await Promise.all([
+            measureSearchRecall({
+                specsDir,
+                query: task.query,
+                groundTruthFiles: task.ground_truth_files,
+                groundTruthSymbols: task.ground_truth_symbols,
+                k: opts.k,
+            }),
+            measureTokenEfficiency({
+                specsDir,
+                groundTruthFiles: task.ground_truth_files,
+                repoDir,
+            }),
+            measureDriftScore({
+                specsDir,
+                patch: task.patch,
+            }),
+            measureContextCoverage({
+                specsDir,
+                groundTruthFiles: task.ground_truth_files,
+                groundTruthSymbols: task.ground_truth_symbols,
+            }),
+        ]);
+        return {
+            task_id: task.id,
+            repo: task.repo,
+            language: task.language,
+            source: task.source,
+            specs_dir: specsDir,
+            metrics: { search_recall: searchRecall, token_efficiency: tokenEfficiency, drift_score: driftScore, context_coverage: contextCoverage },
+            duration_ms: Date.now() - start,
+        };
+    }
+    catch (err) {
+        const emptyEfficiency = {
+            mcp_tokens: 0, raw_file_tokens: 0, efficiency_ratio: 0,
+            tokens_saved: 0, raw_file_bytes: 0, mcp_response_bytes: 0,
+        };
+        const emptyDrift = {
+            baseline_delta: null, post_patch_delta: null, drift_increase: null,
+            baseline_status: "error", post_patch_status: "error", patch_applied: false,
+        };
+        return {
+            task_id: task.id,
+            repo: task.repo,
+            language: task.language,
+            source: task.source,
+            specs_dir: specsDir,
+            metrics: {
+                search_recall: { precision_at_k: 0, recall_at_k: 0, f1_at_k: 0, k: opts.k, files_found: [], files_missed: task.ground_truth_files, symbols_found: [], symbols_missed: task.ground_truth_symbols ?? [], result_files: [], result_symbols: [] },
+                token_efficiency: emptyEfficiency,
+                drift_score: emptyDrift,
+                context_coverage: { coverage: 0, modules_mentioned: [], modules_missing: [], files_mentioned: 0, files_total: task.ground_truth_files.length },
+            },
+            duration_ms: Date.now() - start,
+            error: err instanceof Error ? err.message : String(err),
+        };
+    }
+}
+function buildSummary(results, guardianVersion) {
+    const completed = results.filter(r => !r.error);
+    const failed = results.filter(r => r.error);
+    const aggregate = {
+        search_recall: {
+            mean_precision_at_5: mean(completed.map(r => r.metrics.search_recall.precision_at_k)),
+            mean_recall_at_5: mean(completed.map(r => r.metrics.search_recall.recall_at_k)),
+            mean_f1_at_5: mean(completed.map(r => r.metrics.search_recall.f1_at_k)),
+            any_hit_rate: completed.length > 0
+                ? completed.filter(r => r.metrics.search_recall.files_found.length > 0).length / completed.length
+                : 0,
+        },
+        token_efficiency: {
+            mean_efficiency_ratio: mean(completed.map(r => r.metrics.token_efficiency.efficiency_ratio)),
+            median_efficiency_ratio: median(completed.map(r => r.metrics.token_efficiency.efficiency_ratio)),
+            mean_tokens_saved: mean(completed.map(r => r.metrics.token_efficiency.tokens_saved)),
+            total_tokens_saved: sum(completed.map(r => r.metrics.token_efficiency.tokens_saved)),
+        },
+        drift_score: {
+            mean_drift_increase: mean(completed
+                .map(r => r.metrics.drift_score.drift_increase)
+                .filter((v) => v !== null)),
+            tasks_with_stable_post_patch: completed.filter(r => r.metrics.drift_score.post_patch_status === "stable").length,
+            tasks_with_patch: completed.filter(r => r.metrics.drift_score.patch_applied).length,
+        },
+        context_coverage: {
+            mean_coverage: mean(completed.map(r => r.metrics.context_coverage.coverage)),
+            full_coverage_rate: completed.length > 0
+                ? completed.filter(r => r.metrics.context_coverage.coverage >= 1.0).length / completed.length
+                : 0,
+        },
+    };
+    return {
+        generated_at: new Date().toISOString(),
+        guardian_version: guardianVersion,
+        total_tasks: results.length,
+        completed_tasks: completed.length,
+        failed_tasks: failed.length,
+        aggregate,
+        results,
+    };
+}
+async function readPackageVersion() {
+    try {
+        const pkgPath = new URL("../../package.json", import.meta.url).pathname;
+        const raw = await fs.readFile(pkgPath, "utf8");
+        return JSON.parse(raw).version;
+    }
+    catch {
+        return "unknown";
+    }
+}
+function mean(values) {
+    if (values.length === 0)
+        return 0;
+    return round(values.reduce((a, b) => a + b, 0) / values.length);
+}
+function median(values) {
+    if (values.length === 0)
+        return 0;
+    const sorted = [...values].sort((a, b) => a - b);
+    const mid = Math.floor(sorted.length / 2);
+    const val = sorted.length % 2 === 0
+        ? (sorted[mid - 1] + sorted[mid]) / 2
+        : sorted[mid];
+    return round(val);
+}
+function sum(values) {
+    return values.reduce((a, b) => a + b, 0);
+}
+function round(n) {
+    return Math.round(n * 1000) / 1000;
+}

package/dist/benchmarking/types.js ADDED Viewed

@@ -0,0 +1,13 @@
+/**
+ * Guardian-Bench types
+ *
+ * Task format is JSONL, one task per line — compatible with HuggingFace datasets.
+ * Results are structured for direct inclusion in paper tables.
+ *
+ * Benchmark dimensions (all offline, no LLM API required):
+ *   1. Search Recall    — precision/recall of guardian_search vs ground-truth files
+ *   2. Token Efficiency — MCP response tokens vs reading ground-truth files directly
+ *   3. Drift Score      — architectural drift increase after applying a patch
+ *   4. Context Coverage — how much of architecture-context.md covers the task's modules
+ */
+export {};

package/dist/cli.js CHANGED Viewed

@@ -15,13 +15,14 @@ import { runContext } from "./commands/context.js";
 import { runGenerate } from "./commands/generate.js";
 import { runVerifyDrift } from "./commands/verify-drift.js";
 import { runAnalyzeDepth } from "./commands/analyze-depth.js";
-import { runIntel } from "./commands/intel.js";
 import { runFeatureContext } from "./commands/feature-context.js";
 import { runDocGenerate } from "./commands/doc-generate.js";
 import { runDiscrepancy } from "./commands/discrepancy.js";
 import { runDocHtml } from "./commands/doc-html.js";
 import { runInit } from "./commands/init.js";
+import { runIntel } from "./commands/intel.js";
 import { runMcpServe } from "./commands/mcp-serve.js";
+import { runBenchmarkCommand } from "./commands/benchmark.js";
 import { DEFAULT_SPECS_DIR } from "./config.js";
 const program = new Command();
 program
@@ -58,18 +59,20 @@ program
     .option("--backend-root <path>", "Path to backend root")
     .option("--frontend-root <path>", "Path to frontend root")
     .option("--output <path>", "Output directory", DEFAULT_SPECS_DIR)
-    .option("--include-file-graph", "Include file-level dependency graph", false)
+    .option("--no-file-graph", "Exclude file-level dependency graph")
     .option("--config <path>", "Path to guardian.config.json")
     .option("--docs-mode <mode>", "Docs mode (lean|full)")
+    .option("--backend <backend>", "Storage backend: 'sqlite' (default, builds guardian.db + FTS index) or 'file'")
     .action(async (projectRoot, options) => {
     await runExtract({
         projectRoot,
         backendRoot: options.backendRoot,
         frontendRoot: options.frontendRoot,
         output: options.output ?? DEFAULT_SPECS_DIR,
-        includeFileGraph: options.includeFileGraph ?? false,
+        includeFileGraph: options.fileGraph !== false,
         configPath: options.config,
-        docsMode: options.docsMode
+        docsMode: options.docsMode,
+        backend: options.backend,
     });
 });
 program
@@ -209,17 +212,31 @@ program
 });
 program
     .command("search")
-    .description("Search existing snapshots for models, endpoints, components, modules, and tasks")
+    .description("Search snapshots and intelligence files. Use --query for semantic search or a mode flag for targeted lookups.")
     .option("--input <path>", "Snapshot output directory", DEFAULT_SPECS_DIR)
-    .requiredOption("--query <text>", "Search query")
+    .option("--query <text>", "Semantic search query")
     .option("--output <path>", "Write search results to a file")
     .option("--types <items>", "Comma-separated filters: models,endpoints,components,modules,tasks")
+    .option("--verbose", "Show full grouped output instead of compact file-first format")
+    .option("--format <fmt>", "Output format for --query: text (default) or json (categorical)")
+    .option("--orient", "Return architecture-context.md as compact JSON (project map)")
+    .option("--file <path>", "Return context for a file path or endpoint (e.g. 'POST /api/auth/login')")
+    .option("--model <name>", "Return model fields, relationships, and usage (e.g. 'User')")
+    .option("--impact <path>", "Return impact analysis: what breaks if you change this file")
+    .option("--backend <backend>", "Storage backend: 'file' (default linear scan) or 'sqlite' (FTS5/BM25)")
     .action(async (options) => {
     await runSearch({
         input: options.input ?? DEFAULT_SPECS_DIR,
         query: options.query,
         output: options.output,
-        types: options.types ? [options.types] : undefined
+        types: options.types ? [options.types] : undefined,
+        verbose: options.verbose ?? false,
+        format: options.format,
+        orient: options.orient ?? false,
+        file: options.file,
+        model: options.model,
+        impact: options.impact,
+        backend: options.backend,
     });
 });
 program
@@ -262,13 +279,16 @@ program
 });
 program
     .command("intel")
-    .description("Build codebase-intelligence.json from existing snapshots")
+    .description("[deprecated] Use `guardian extract` instead")
     .option("--specs <dir>", "Snapshot output directory", DEFAULT_SPECS_DIR)
-    .option("--output <path>", "Output path for codebase-intelligence.json")
+    .option("--output <path>", "Output path for codebase-intelligence.json (file backend only)")
+    .option("--backend <backend>", "Storage backend: 'file' (default) or 'sqlite'")
     .action(async (options) => {
+    console.warn("⚠ `guardian intel` is deprecated — use `guardian extract` instead.");
     await runIntel({
         specs: options.specs,
-        output: options.output
+        output: options.output,
+        backend: options.backend,
     });
 });
 program
@@ -333,6 +353,7 @@ program
     .option("--frontend-root <path>", "Path to frontend root")
     .option("--output <path>", "Output directory", DEFAULT_SPECS_DIR)
     .option("--skip-hook", "Skip pre-commit hook installation", false)
+    .option("--backend <backend>", "Storage backend: 'file' (default) or 'sqlite' (builds guardian.db + FTS index)")
     .action(async (projectRoot, options) => {
     await runInit({
         projectRoot,
@@ -340,6 +361,28 @@ program
         frontendRoot: options.frontendRoot,
         output: options.output,
         skipHook: options.skipHook ?? false,
+        backend: options.backend,
+    });
+});
+program
+    .command("benchmark")
+    .description("Run Guardian-Bench offline evaluation suite (4 metrics, no LLM required)")
+    .requiredOption("--tasks <file>", "Path to JSONL tasks file")
+    .option("--specs <dir>", "Specs directory override for all tasks")
+    .option("--repo-dir <dir>", "Repo root directory override for all tasks")
+    .option("--output <path>", "Write report to file (in addition to stdout)")
+    .option("--format <fmt>", "Output format: text, json, markdown, csv (default: text)", "text")
+    .option("--k <n>", "k for precision/recall (default: 5)", "5")
+    .option("--concurrency <n>", "Max parallel tasks (default: 4)", "4")
+    .action(async (options) => {
+    await runBenchmarkCommand({
+        tasks: options.tasks,
+        specs: options.specs,
+        repoDir: options.repoDir,
+        output: options.output,
+        format: options.format,
+        k: options.k,
+        concurrency: options.concurrency,
     });
 });
 program

package/dist/commands/benchmark.js ADDED Viewed

@@ -0,0 +1,62 @@
+/**
+ * `guardian benchmark` — run Guardian-Bench offline evaluation suite
+ *
+ * Reads a JSONL file of tasks, computes 4 metrics per task (search recall,
+ * token efficiency, drift score, context coverage), and writes a report.
+ *
+ * Usage:
+ *   guardian benchmark --tasks tasks.jsonl --specs .specs
+ *   guardian benchmark --tasks tasks.jsonl --output results.json --format json
+ */
+import fs from "node:fs/promises";
+import path from "node:path";
+import { runBenchmark } from "../benchmarking/runner.js";
+import { renderReport, toCSV } from "../benchmarking/report.js";
+export async function runBenchmarkCommand(options) {
+    const tasksFile = path.resolve(options.tasks);
+    const specsDir = options.specs ? path.resolve(options.specs) : undefined;
+    const repoDir = options.repoDir ? path.resolve(options.repoDir) : undefined;
+    const format = (options.format ?? "text");
+    const k = typeof options.k === "string" ? parseInt(options.k, 10) : (options.k ?? 5);
+    const concurrency = typeof options.concurrency === "string"
+        ? parseInt(options.concurrency, 10)
+        : (options.concurrency ?? 4);
+    // Validate tasks file
+    try {
+        await fs.access(tasksFile);
+    }
+    catch {
+        console.error(`Error: tasks file not found: ${tasksFile}`);
+        process.exit(1);
+    }
+    console.error(`Guardian-Bench: running tasks from ${tasksFile}`);
+    const summary = await runBenchmark({
+        tasksFile,
+        specsDir,
+        repoDir,
+        k,
+        concurrency,
+        onProgress(completed, total, result) {
+            const status = result.error ? "FAIL" : "OK";
+            const f1 = result.metrics.search_recall.f1_at_k.toFixed(3);
+            const cov = result.metrics.context_coverage.coverage.toFixed(3);
+            console.error(`  [${completed}/${total}] ${status} ${result.task_id} | F1@${k}=${f1} | coverage=${cov}`);
+        },
+    });
+    // Render output
+    let output;
+    if (format === "csv") {
+        output = toCSV(summary);
+    }
+    else {
+        output = renderReport(summary, format === "json" || format === "markdown" ? format : "text");
+    }
+    if (options.output) {
+        const outputPath = path.resolve(options.output);
+        await fs.mkdir(path.dirname(outputPath), { recursive: true });
+        await fs.writeFile(outputPath, output, "utf8");
+        console.error(`Wrote results to ${outputPath}`);
+    }
+    // Always print to stdout
+    console.log(output);
+}