@toolbaux/guardian 0.1.22 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/README.md +6 -4
  2. package/dist/adapters/runner.js +72 -3
  3. package/dist/adapters/typescript-adapter.js +24 -10
  4. package/dist/benchmarking/metrics/context-coverage.js +82 -0
  5. package/dist/benchmarking/metrics/drift-score.js +104 -0
  6. package/dist/benchmarking/metrics/search-recall.js +207 -0
  7. package/dist/benchmarking/metrics/token-efficiency.js +79 -0
  8. package/dist/benchmarking/report.js +131 -0
  9. package/dist/benchmarking/runner.js +175 -0
  10. package/dist/benchmarking/types.js +13 -0
  11. package/dist/cli.js +53 -10
  12. package/dist/commands/benchmark.js +62 -0
  13. package/dist/commands/context.js +87 -29
  14. package/dist/commands/discrepancy.js +1 -1
  15. package/dist/commands/doc-generate.js +1 -1
  16. package/dist/commands/doc-html.js +1 -1
  17. package/dist/commands/extract.js +4 -1
  18. package/dist/commands/feature-context.js +1 -1
  19. package/dist/commands/generate.js +83 -10
  20. package/dist/commands/init.js +89 -56
  21. package/dist/commands/intel.js +70 -1
  22. package/dist/commands/mcp-serve.js +155 -316
  23. package/dist/commands/search.js +642 -14
  24. package/dist/config.js +1 -0
  25. package/dist/db/embeddings.js +113 -0
  26. package/dist/db/file-specs-store.js +174 -0
  27. package/dist/db/fts-builder.js +390 -0
  28. package/dist/db/index.js +55 -0
  29. package/dist/db/specs-store.js +13 -0
  30. package/dist/db/sqlite-specs-store.js +934 -0
  31. package/dist/extract/codebase-intel.js +31 -2
  32. package/dist/extract/compress.js +70 -3
  33. package/dist/extract/context-block.js +11 -2
  34. package/dist/extract/function-intel.js +5 -2
  35. package/dist/extract/index.js +1 -23
  36. package/dist/extract/writer.js +6 -0
  37. package/package.json +4 -1
@@ -0,0 +1,131 @@
1
+ /**
2
+ * Guardian-Bench report formatter
3
+ *
4
+ * Renders BenchmarkSummary into human-readable text and JSON outputs.
5
+ * Designed for arXiv paper table extraction.
6
+ */
7
/**
 * Serialise a benchmark summary in the requested output format.
 *
 * @param {object} summary - Benchmark summary to render.
 * @param {string} [format="text"] - "json" for pretty-printed JSON, "markdown"
 *   for the Markdown report; any other value produces the plain-text report.
 * @returns {string} The rendered report.
 */
export function renderReport(summary, format = "text") {
    switch (format) {
        case "json":
            return JSON.stringify(summary, null, 2);
        case "markdown":
            return renderMarkdown(summary);
        default:
            return renderText(summary);
    }
}
14
+ // ── Text ──────────────────────────────────────────────────────────────────────
15
/**
 * Format a benchmark summary as a plain-text report.
 *
 * The report has a header followed by one section per metric family (search
 * recall, token efficiency, drift score, context coverage). A "Failed Tasks"
 * section is appended only when at least one result carries an `error`.
 *
 * @param {object} summary - Benchmark summary (aggregate metrics, task counts, results).
 * @returns {string} Newline-joined report text.
 */
function renderText(summary) {
    const { aggregate, results } = summary;
    const recall = aggregate.search_recall;
    const efficiency = aggregate.token_efficiency;
    const drift = aggregate.drift_score;
    const coverage = aggregate.context_coverage;
    const rule = "-".repeat(40);
    // One pass over the results decides both whether the section exists and what it lists.
    const failures = results.filter(entry => entry.error);
    const failureSection = failures.length > 0
        ? [
            "Failed Tasks",
            rule,
            ...failures.map(entry => ` [${entry.task_id}] ${entry.error}`),
            "",
        ]
        : [];
    const report = [
        "Guardian-Bench Results",
        "=".repeat(60),
        `Guardian version : ${summary.guardian_version}`,
        `Generated : ${summary.generated_at}`,
        `Tasks : ${summary.completed_tasks}/${summary.total_tasks} completed, ${summary.failed_tasks} failed`,
        "",
        "Search Recall (k=5)",
        rule,
        ` Mean precision@5 : ${pct(recall.mean_precision_at_5)}`,
        ` Mean recall@5 : ${pct(recall.mean_recall_at_5)}`,
        ` Mean F1@5 : ${pct(recall.mean_f1_at_5)}`,
        ` Any-hit rate : ${pct(recall.any_hit_rate)}`,
        "",
        "Token Efficiency",
        rule,
        ` Mean ratio : ${efficiency.mean_efficiency_ratio.toFixed(3)}`,
        ` Median ratio : ${efficiency.median_efficiency_ratio.toFixed(3)}`,
        ` Mean tokens saved : ${efficiency.mean_tokens_saved.toLocaleString()}`,
        ` Total tokens saved: ${efficiency.total_tokens_saved.toLocaleString()}`,
        "",
        "Drift Score",
        rule,
        ` Mean drift increase : ${drift.mean_drift_increase.toFixed(3)}`,
        ` Tasks with patch : ${drift.tasks_with_patch}`,
        ` Stable post-patch : ${drift.tasks_with_stable_post_patch}`,
        "",
        "Context Coverage",
        rule,
        ` Mean coverage : ${pct(coverage.mean_coverage)}`,
        ` Full coverage rate: ${pct(coverage.full_coverage_rate)}`,
        "",
        ...failureSection,
    ];
    return report.join("\n");
}
59
+ // ── Markdown (paper table style) ─────────────────────────────────────────────
60
/**
 * Format a benchmark summary as Markdown: a title, a one-line run header, an
 * "Aggregate Metrics" table and a "Per-Task Results" table with one row per
 * entry in `summary.results`.
 *
 * @param {object} summary - Benchmark summary (aggregate metrics, task counts, results).
 * @returns {string} Newline-joined Markdown document.
 */
function renderMarkdown(summary) {
    const { search_recall: recall, token_efficiency: efficiency, drift_score: drift, context_coverage: coverage, } = summary.aggregate;
    // Label/value pairs, in the order they appear in the aggregate table.
    const aggregateRows = [
        ["Search Recall — Precision@5", pct(recall.mean_precision_at_5)],
        ["Search Recall — Recall@5", pct(recall.mean_recall_at_5)],
        ["Search Recall — F1@5", pct(recall.mean_f1_at_5)],
        ["Search Recall — Any-Hit Rate", pct(recall.any_hit_rate)],
        ["Token Efficiency — Mean Ratio", `${efficiency.mean_efficiency_ratio.toFixed(3)}×`],
        ["Token Efficiency — Median Ratio", `${efficiency.median_efficiency_ratio.toFixed(3)}×`],
        ["Token Efficiency — Mean Tokens Saved", efficiency.mean_tokens_saved.toLocaleString()],
        ["Drift Score — Mean Increase", drift.mean_drift_increase.toFixed(3)],
        ["Context Coverage — Mean", pct(coverage.mean_coverage)],
        ["Context Coverage — Full Coverage Rate", pct(coverage.full_coverage_rate)],
    ].map(([label, value]) => `| ${label} | ${value} |`);
    const taskRows = summary.results.map((result) => {
        const { search_recall: search, token_efficiency: tokens, context_coverage: context } = result.metrics;
        // Template literals keep the same string coercion as direct interpolation.
        const cells = [
            `${result.task_id}`,
            `${result.repo}`,
            pct(search.precision_at_k),
            pct(search.recall_at_k),
            pct(search.f1_at_k),
            `${tokens.efficiency_ratio.toFixed(3)}×`,
            pct(context.coverage),
        ];
        return `| ${cells.join(" | ")} |`;
    });
    return [
        `# Guardian-Bench Results`,
        ``,
        `**Guardian version:** ${summary.guardian_version} | **Tasks:** ${summary.completed_tasks}/${summary.total_tasks} | **Generated:** ${summary.generated_at}`,
        ``,
        `## Aggregate Metrics`,
        ``,
        `| Metric | Value |`,
        `|--------|-------|`,
        ...aggregateRows,
        ``,
        `## Per-Task Results`,
        ``,
        `| Task | Repo | P@5 | R@5 | F1@5 | Eff.Ratio | Coverage |`,
        `|------|------|-----|-----|------|-----------|----------|`,
        ...taskRows,
        ``,
    ].join("\n");
}
98
+ // ── Helpers ───────────────────────────────────────────────────────────────────
99
/**
 * Format a fraction as a percentage with one decimal place.
 *
 * @param {number} n - Fraction to format (0.5 becomes "50.0%").
 * @returns {string} The percentage followed by "%".
 */
function pct(n) {
    const percentage = (n * 100).toFixed(1);
    return `${percentage}%`;
}
102
/**
 * Extract per-task rows suitable for pandas/CSV.
 *
 * Emits a header line followed by one line per entry in `summary.results`.
 * Text columns (task_id, repo, language, source) are quoted per RFC 4180 when
 * they contain a comma, double quote, or line break, so such values cannot
 * shift the columns. A present `error` is always quoted, with embedded double
 * quotes doubled so the message survives a round trip unchanged.
 *
 * @param {object} summary - Benchmark summary whose `results` are exported.
 * @returns {string} CSV text, rows joined with "\n".
 */
export function toCSV(summary) {
    const doubleQuotes = (text) => text.replace(/"/g, '""');
    // Quote only when needed so ordinary values render exactly as before.
    const textField = (value) => {
        const text = value == null ? "" : String(value);
        return /[",\r\n]/.test(text) ? `"${doubleQuotes(text)}"` : text;
    };
    const header = [
        "task_id", "repo", "language", "source",
        "precision_at_5", "recall_at_5", "f1_at_5", "any_hit",
        "efficiency_ratio", "tokens_saved",
        "drift_increase", "context_coverage",
        "duration_ms", "error",
    ].join(",");
    const rows = summary.results.map((r) => {
        const m = r.metrics;
        return [
            textField(r.task_id),
            textField(r.repo),
            textField(r.language),
            textField(r.source),
            m.search_recall.precision_at_k,
            m.search_recall.recall_at_k,
            m.search_recall.f1_at_k,
            // any_hit: 1 when at least one ground-truth file was found.
            m.search_recall.files_found.length > 0 ? 1 : 0,
            m.token_efficiency.efficiency_ratio,
            m.token_efficiency.tokens_saved,
            m.drift_score.drift_increase ?? "",
            m.context_coverage.coverage,
            r.duration_ms,
            r.error ? `"${doubleQuotes(String(r.error))}"` : "",
        ].join(",");
    });
    return [header, ...rows].join("\n");
}
@@ -0,0 +1,175 @@
1
+ /**
2
+ * Guardian-Bench runner
3
+ *
4
+ * Processes a JSONL file of BenchmarkTask entries, computes all 4 metrics
5
+ * for each task, and returns a BenchmarkSummary.
6
+ *
7
+ * Metrics are fully offline — no LLM API calls required.
8
+ */
9
+ import fs from "node:fs/promises";
10
+ import { measureSearchRecall } from "./metrics/search-recall.js";
11
+ import { measureTokenEfficiency } from "./metrics/token-efficiency.js";
12
+ import { measureDriftScore } from "./metrics/drift-score.js";
13
+ import { measureContextCoverage } from "./metrics/context-coverage.js";
14
/**
 * Run every task in a JSONL file and aggregate the results into a summary.
 *
 * Blank lines and lines starting with "//" are skipped. Tasks are processed in
 * batches; each batch runs in parallel and `onProgress` is called once per
 * finished task, in file order.
 *
 * @param {object} options
 * @param {string} options.tasksFile - Path to the JSONL tasks file.
 * @param {string} [options.specsDir] - Specs directory passed to every task.
 * @param {string} [options.repoDir] - Repo directory passed to every task.
 * @param {number} [options.k=5] - k passed to every task.
 * @param {number} [options.concurrency=4] - Batch size. Values that are not a
 *   finite number >= 1 fall back to 1 (sequential).
 * @param {Function} [options.onProgress] - Called as (completed, total, result).
 * @returns {Promise<object>} The summary produced by buildSummary.
 * @throws {SyntaxError} When a task line is not valid JSON; the message names the line.
 */
export async function runBenchmark(options) {
    const { tasksFile, specsDir, repoDir, k = 5, concurrency = 4 } = options;
    // A zero, negative or NaN step would either never advance the loop below
    // or skip every task, so normalise to a usable batch size first.
    const batchSize = Number.isFinite(concurrency) && concurrency >= 1
        ? Math.floor(concurrency)
        : 1;
    // Load tasks from JSONL
    const raw = await fs.readFile(tasksFile, "utf8");
    const tasks = [];
    for (const [index, line] of raw.split("\n").entries()) {
        const text = line.trim();
        if (text.length === 0 || text.startsWith("//")) {
            continue;
        }
        try {
            tasks.push(JSON.parse(text));
        }
        catch (err) {
            // Same error type as before, but pointing at the offending line.
            throw new SyntaxError(`Invalid JSON on line ${index + 1} of ${tasksFile}: ${err.message}`, { cause: err });
        }
    }
    const results = [];
    let completed = 0;
    // Process tasks with limited concurrency
    for (let i = 0; i < tasks.length; i += batchSize) {
        const batch = tasks.slice(i, i + batchSize);
        const batchResults = await Promise.all(batch.map(task => runTask(task, { specsDir, repoDir, k })));
        for (const r of batchResults) {
            results.push(r);
            completed++;
            options.onProgress?.(completed, tasks.length, r);
        }
    }
    const guardianVersion = await readPackageVersion();
    return buildSummary(results, guardianVersion);
}
39
/**
 * Compute the four metrics for a single task.
 *
 * The four measurements are started together and awaited as a group. If any
 * of them fails, the task is reported with zeroed metrics and an `error`
 * message instead of the failure being thrown.
 *
 * @param {object} task - Task entry parsed from the JSONL file.
 * @param {object} opts
 * @param {string} [opts.specsDir] - Takes precedence over `task.specs_dir`; ".specs" when neither is set.
 * @param {string} [opts.repoDir] - Takes precedence over `task.repo_dir`.
 * @param {number} opts.k - k passed to the search-recall measurement.
 * @returns {Promise<object>} Task result with `metrics`, `duration_ms` and, on failure, `error`.
 */
async function runTask(task, opts) {
    const start = Date.now();
    const specsDir = opts.specsDir ?? task.specs_dir ?? ".specs";
    const repoDir = opts.repoDir ?? task.repo_dir;
    // Identification fields shared by the success and failure results.
    const identity = {
        task_id: task.id,
        repo: task.repo,
        language: task.language,
        source: task.source,
        specs_dir: specsDir,
    };
    try {
        const [searchRecall, tokenEfficiency, driftScore, contextCoverage] = await Promise.all([
            measureSearchRecall({
                specsDir,
                query: task.query,
                groundTruthFiles: task.ground_truth_files,
                groundTruthSymbols: task.ground_truth_symbols,
                k: opts.k,
            }),
            measureTokenEfficiency({
                specsDir,
                groundTruthFiles: task.ground_truth_files,
                repoDir,
            }),
            measureDriftScore({
                specsDir,
                patch: task.patch,
            }),
            measureContextCoverage({
                specsDir,
                groundTruthFiles: task.ground_truth_files,
                groundTruthSymbols: task.ground_truth_symbols,
            }),
        ]);
        return {
            ...identity,
            metrics: { search_recall: searchRecall, token_efficiency: tokenEfficiency, drift_score: driftScore, context_coverage: contextCoverage },
            duration_ms: Date.now() - start,
        };
    }
    catch (err) {
        // A task missing ground_truth_files is a likely cause of the failure
        // being handled here, so the fallback must not dereference it blindly:
        // throwing from this handler would abort the whole benchmark run.
        const groundTruthFiles = task.ground_truth_files ?? [];
        const emptyEfficiency = {
            mcp_tokens: 0, raw_file_tokens: 0, efficiency_ratio: 0,
            tokens_saved: 0, raw_file_bytes: 0, mcp_response_bytes: 0,
        };
        const emptyDrift = {
            baseline_delta: null, post_patch_delta: null, drift_increase: null,
            baseline_status: "error", post_patch_status: "error", patch_applied: false,
        };
        return {
            ...identity,
            metrics: {
                search_recall: { precision_at_k: 0, recall_at_k: 0, f1_at_k: 0, k: opts.k, files_found: [], files_missed: groundTruthFiles, symbols_found: [], symbols_missed: task.ground_truth_symbols ?? [], result_files: [], result_symbols: [] },
                token_efficiency: emptyEfficiency,
                drift_score: emptyDrift,
                context_coverage: { coverage: 0, modules_mentioned: [], modules_missing: [], files_mentioned: 0, files_total: groundTruthFiles.length },
            },
            duration_ms: Date.now() - start,
            error: err instanceof Error ? err.message : String(err),
        };
    }
}
103
/**
 * Aggregate per-task results into a benchmark summary.
 *
 * Only results without an `error` contribute to the aggregate metrics; the
 * `results` array itself is returned unfiltered.
 *
 * @param {object[]} results - Per-task results.
 * @param {string} guardianVersion - Version string recorded in the summary.
 * @returns {object} Summary with timestamp, version, task counts, aggregate and results.
 */
function buildSummary(results, guardianVersion) {
    const succeeded = [];
    const errored = [];
    for (const result of results) {
        (result.error ? errored : succeeded).push(result);
    }
    // Collect one value per successful task.
    const collect = (select) => succeeded.map(select);
    // Number of successful tasks satisfying a predicate.
    const countWhere = (predicate) => succeeded.filter(predicate).length;
    // Fraction of successful tasks satisfying a predicate; 0 when there are none.
    const shareWhere = (predicate) => succeeded.length > 0
        ? countWhere(predicate) / succeeded.length
        : 0;
    const driftIncreases = collect(r => r.metrics.drift_score.drift_increase)
        .filter((value) => value !== null);
    const search_recall = {
        mean_precision_at_5: mean(collect(r => r.metrics.search_recall.precision_at_k)),
        mean_recall_at_5: mean(collect(r => r.metrics.search_recall.recall_at_k)),
        mean_f1_at_5: mean(collect(r => r.metrics.search_recall.f1_at_k)),
        any_hit_rate: shareWhere(r => r.metrics.search_recall.files_found.length > 0),
    };
    const token_efficiency = {
        mean_efficiency_ratio: mean(collect(r => r.metrics.token_efficiency.efficiency_ratio)),
        median_efficiency_ratio: median(collect(r => r.metrics.token_efficiency.efficiency_ratio)),
        mean_tokens_saved: mean(collect(r => r.metrics.token_efficiency.tokens_saved)),
        total_tokens_saved: sum(collect(r => r.metrics.token_efficiency.tokens_saved)),
    };
    const drift_score = {
        mean_drift_increase: mean(driftIncreases),
        tasks_with_stable_post_patch: countWhere(r => r.metrics.drift_score.post_patch_status === "stable"),
        tasks_with_patch: countWhere(r => r.metrics.drift_score.patch_applied),
    };
    const context_coverage = {
        mean_coverage: mean(collect(r => r.metrics.context_coverage.coverage)),
        full_coverage_rate: shareWhere(r => r.metrics.context_coverage.coverage >= 1.0),
    };
    return {
        generated_at: new Date().toISOString(),
        guardian_version: guardianVersion,
        total_tasks: results.length,
        completed_tasks: succeeded.length,
        failed_tasks: errored.length,
        aggregate: { search_recall, token_efficiency, drift_score, context_coverage },
        results,
    };
}
145
/**
 * Read the `version` field from the package.json two directories above this
 * module.
 *
 * @returns {Promise<string>} The version string, or "unknown" when the file
 *   cannot be read or parsed, or holds no string `version`.
 */
async function readPackageVersion() {
    try {
        // Hand the URL object to fs directly: URL#pathname is not a filesystem
        // path (it stays percent-encoded and gains a leading slash before a
        // Windows drive letter), so reading from it can fail on valid installs.
        const pkgUrl = new URL("../../package.json", import.meta.url);
        const raw = await fs.readFile(pkgUrl, "utf8");
        const { version } = JSON.parse(raw);
        return typeof version === "string" ? version : "unknown";
    }
    catch {
        // Best effort: the version is only metadata in the report.
        return "unknown";
    }
}
155
/**
 * Arithmetic mean of a list of numbers, rounded via `round`.
 *
 * @param {number[]} values - Numbers to average.
 * @returns {number} The rounded mean, or 0 for an empty list.
 */
function mean(values) {
    const count = values.length;
    return count === 0
        ? 0
        : round(values.reduce((total, value) => total + value, 0) / count);
}
160
/**
 * Median of a list of numbers, rounded via `round`. The input array is left
 * untouched; a copy is sorted numerically.
 *
 * @param {number[]} values - Numbers to take the median of.
 * @returns {number} The rounded median (mean of the two middle values for an
 *   even count), or 0 for an empty list.
 */
function median(values) {
    const count = values.length;
    if (count === 0) {
        return 0;
    }
    const ordered = Array.from(values).sort((left, right) => left - right);
    const upper = Math.floor(count / 2);
    if (count % 2 !== 0) {
        return round(ordered[upper]);
    }
    return round((ordered[upper - 1] + ordered[upper]) / 2);
}
170
/**
 * Total of a list of numbers.
 *
 * @param {number[]} values - Numbers to add up.
 * @returns {number} The sum, or 0 for an empty list.
 */
function sum(values) {
    const add = (total, value) => total + value;
    return values.reduce(add, 0);
}
173
/**
 * Round a number to three decimal places.
 *
 * @param {number} n - Value to round.
 * @returns {number} `n` rounded to the nearest thousandth.
 */
function round(n) {
    const scaled = Math.round(n * 1000);
    return scaled / 1000;
}
@@ -0,0 +1,13 @@
1
+ /**
2
+ * Guardian-Bench types
3
+ *
4
+ * Task format is JSONL, one task per line — compatible with HuggingFace datasets.
5
+ * Results are structured for direct inclusion in paper tables.
6
+ *
7
+ * Benchmark dimensions (all offline, no LLM API required):
8
+ * 1. Search Recall — precision/recall of guardian_search vs ground-truth files
9
+ * 2. Token Efficiency — MCP response tokens vs reading ground-truth files directly
10
+ * 3. Drift Score — architectural drift increase after applying a patch
11
+ * 4. Context Coverage — how much of architecture-context.md covers the task's modules
12
+ */
13
+ export {};
package/dist/cli.js CHANGED
@@ -15,13 +15,14 @@ import { runContext } from "./commands/context.js";
15
15
  import { runGenerate } from "./commands/generate.js";
16
16
  import { runVerifyDrift } from "./commands/verify-drift.js";
17
17
  import { runAnalyzeDepth } from "./commands/analyze-depth.js";
18
- import { runIntel } from "./commands/intel.js";
19
18
  import { runFeatureContext } from "./commands/feature-context.js";
20
19
  import { runDocGenerate } from "./commands/doc-generate.js";
21
20
  import { runDiscrepancy } from "./commands/discrepancy.js";
22
21
  import { runDocHtml } from "./commands/doc-html.js";
23
22
  import { runInit } from "./commands/init.js";
23
+ import { runIntel } from "./commands/intel.js";
24
24
  import { runMcpServe } from "./commands/mcp-serve.js";
25
+ import { runBenchmarkCommand } from "./commands/benchmark.js";
25
26
  import { DEFAULT_SPECS_DIR } from "./config.js";
26
27
  const program = new Command();
27
28
  program
@@ -58,18 +59,20 @@ program
58
59
  .option("--backend-root <path>", "Path to backend root")
59
60
  .option("--frontend-root <path>", "Path to frontend root")
60
61
  .option("--output <path>", "Output directory", DEFAULT_SPECS_DIR)
61
- .option("--include-file-graph", "Include file-level dependency graph", false)
62
+ .option("--no-file-graph", "Exclude file-level dependency graph")
62
63
  .option("--config <path>", "Path to guardian.config.json")
63
64
  .option("--docs-mode <mode>", "Docs mode (lean|full)")
65
+ .option("--backend <backend>", "Storage backend: 'sqlite' (default, builds guardian.db + FTS index) or 'file'")
64
66
  .action(async (projectRoot, options) => {
65
67
  await runExtract({
66
68
  projectRoot,
67
69
  backendRoot: options.backendRoot,
68
70
  frontendRoot: options.frontendRoot,
69
71
  output: options.output ?? DEFAULT_SPECS_DIR,
70
- includeFileGraph: options.includeFileGraph ?? false,
72
+ includeFileGraph: options.fileGraph !== false,
71
73
  configPath: options.config,
72
- docsMode: options.docsMode
74
+ docsMode: options.docsMode,
75
+ backend: options.backend,
73
76
  });
74
77
  });
75
78
  program
@@ -209,17 +212,31 @@ program
209
212
  });
210
213
  program
211
214
  .command("search")
212
- .description("Search existing snapshots for models, endpoints, components, modules, and tasks")
215
+ .description("Search snapshots and intelligence files. Use --query for semantic search or a mode flag for targeted lookups.")
213
216
  .option("--input <path>", "Snapshot output directory", DEFAULT_SPECS_DIR)
214
- .requiredOption("--query <text>", "Search query")
217
+ .option("--query <text>", "Semantic search query")
215
218
  .option("--output <path>", "Write search results to a file")
216
219
  .option("--types <items>", "Comma-separated filters: models,endpoints,components,modules,tasks")
220
+ .option("--verbose", "Show full grouped output instead of compact file-first format")
221
+ .option("--format <fmt>", "Output format for --query: text (default) or json (categorical)")
222
+ .option("--orient", "Return architecture-context.md as compact JSON (project map)")
223
+ .option("--file <path>", "Return context for a file path or endpoint (e.g. 'POST /api/auth/login')")
224
+ .option("--model <name>", "Return model fields, relationships, and usage (e.g. 'User')")
225
+ .option("--impact <path>", "Return impact analysis: what breaks if you change this file")
226
+ .option("--backend <backend>", "Storage backend: 'file' (default linear scan) or 'sqlite' (FTS5/BM25)")
217
227
  .action(async (options) => {
218
228
  await runSearch({
219
229
  input: options.input ?? DEFAULT_SPECS_DIR,
220
230
  query: options.query,
221
231
  output: options.output,
222
- types: options.types ? [options.types] : undefined
232
+ types: options.types ? [options.types] : undefined,
233
+ verbose: options.verbose ?? false,
234
+ format: options.format,
235
+ orient: options.orient ?? false,
236
+ file: options.file,
237
+ model: options.model,
238
+ impact: options.impact,
239
+ backend: options.backend,
223
240
  });
224
241
  });
225
242
  program
@@ -262,13 +279,16 @@ program
262
279
  });
263
280
  program
264
281
  .command("intel")
265
- .description("Build codebase-intelligence.json from existing snapshots")
282
+ .description("[deprecated] Use `guardian extract` instead")
266
283
  .option("--specs <dir>", "Snapshot output directory", DEFAULT_SPECS_DIR)
267
- .option("--output <path>", "Output path for codebase-intelligence.json")
284
+ .option("--output <path>", "Output path for codebase-intelligence.json (file backend only)")
285
+ .option("--backend <backend>", "Storage backend: 'file' (default) or 'sqlite'")
268
286
  .action(async (options) => {
287
+ console.warn("⚠ `guardian intel` is deprecated — use `guardian extract` instead.");
269
288
  await runIntel({
270
289
  specs: options.specs,
271
- output: options.output
290
+ output: options.output,
291
+ backend: options.backend,
272
292
  });
273
293
  });
274
294
  program
@@ -333,6 +353,7 @@ program
333
353
  .option("--frontend-root <path>", "Path to frontend root")
334
354
  .option("--output <path>", "Output directory", DEFAULT_SPECS_DIR)
335
355
  .option("--skip-hook", "Skip pre-commit hook installation", false)
356
+ .option("--backend <backend>", "Storage backend: 'file' (default) or 'sqlite' (builds guardian.db + FTS index)")
336
357
  .action(async (projectRoot, options) => {
337
358
  await runInit({
338
359
  projectRoot,
@@ -340,6 +361,28 @@ program
340
361
  frontendRoot: options.frontendRoot,
341
362
  output: options.output,
342
363
  skipHook: options.skipHook ?? false,
364
+ backend: options.backend,
365
+ });
366
+ });
367
+ program
368
+ .command("benchmark")
369
+ .description("Run Guardian-Bench offline evaluation suite (4 metrics, no LLM required)")
370
+ .requiredOption("--tasks <file>", "Path to JSONL tasks file")
371
+ .option("--specs <dir>", "Specs directory override for all tasks")
372
+ .option("--repo-dir <dir>", "Repo root directory override for all tasks")
373
+ .option("--output <path>", "Write report to file (in addition to stdout)")
374
+ .option("--format <fmt>", "Output format: text, json, markdown, csv (default: text)", "text")
375
+ .option("--k <n>", "k for precision/recall (default: 5)", "5")
376
+ .option("--concurrency <n>", "Max parallel tasks (default: 4)", "4")
377
+ .action(async (options) => {
378
+ await runBenchmarkCommand({
379
+ tasks: options.tasks,
380
+ specs: options.specs,
381
+ repoDir: options.repoDir,
382
+ output: options.output,
383
+ format: options.format,
384
+ k: options.k,
385
+ concurrency: options.concurrency,
343
386
  });
344
387
  });
345
388
  program
@@ -0,0 +1,62 @@
1
+ /**
2
+ * `guardian benchmark` — run Guardian-Bench offline evaluation suite
3
+ *
4
+ * Reads a JSONL file of tasks, computes 4 metrics per task (search recall,
5
+ * token efficiency, drift score, context coverage), and writes a report.
6
+ *
7
+ * Usage:
8
+ * guardian benchmark --tasks tasks.jsonl --specs .specs
9
+ * guardian benchmark --tasks tasks.jsonl --output results.json --format json
10
+ */
11
+ import fs from "node:fs/promises";
12
+ import path from "node:path";
13
+ import { runBenchmark } from "../benchmarking/runner.js";
14
+ import { renderReport, toCSV } from "../benchmarking/report.js";
15
/**
 * Entry point for `guardian benchmark`: validates the options, runs the
 * benchmark, and prints the rendered report.
 *
 * Progress and status messages go to stderr; the report itself goes to stdout
 * and, when `options.output` is set, to that file as well (parent directories
 * are created as needed). The process exits with code 1 when `--k` or
 * `--concurrency` is not a positive integer or when the tasks file cannot be
 * accessed.
 *
 * @param {object} options
 * @param {string} options.tasks - Path to the JSONL tasks file.
 * @param {string} [options.specs] - Specs directory override.
 * @param {string} [options.repoDir] - Repo directory override.
 * @param {string} [options.output] - File to write the report to.
 * @param {string} [options.format="text"] - "csv", "json" or "markdown"; anything else renders text.
 * @param {string|number} [options.k=5] - k for precision/recall.
 * @param {string|number} [options.concurrency=4] - Max parallel tasks.
 * @returns {Promise<void>}
 */
export async function runBenchmarkCommand(options) {
    const tasksFile = path.resolve(options.tasks);
    const specsDir = options.specs ? path.resolve(options.specs) : undefined;
    const repoDir = options.repoDir ? path.resolve(options.repoDir) : undefined;
    const format = (options.format ?? "text");
    const k = toPositiveInteger(options.k, 5);
    const concurrency = toPositiveInteger(options.concurrency, 4);
    // Reject unusable numbers up front instead of handing NaN, zero or a
    // negative value to the runner.
    const numericOptions = [
        ["--k", k, options.k],
        ["--concurrency", concurrency, options.concurrency],
    ];
    for (const [flag, parsed, given] of numericOptions) {
        if (Number.isNaN(parsed)) {
            console.error(`Error: ${flag} must be a positive integer, got: ${given}`);
            process.exit(1);
        }
    }
    // Validate tasks file
    try {
        await fs.access(tasksFile);
    }
    catch {
        console.error(`Error: tasks file not found: ${tasksFile}`);
        process.exit(1);
    }
    console.error(`Guardian-Bench: running tasks from ${tasksFile}`);
    const summary = await runBenchmark({
        tasksFile,
        specsDir,
        repoDir,
        k,
        concurrency,
        onProgress(completed, total, result) {
            const status = result.error ? "FAIL" : "OK";
            const f1 = result.metrics.search_recall.f1_at_k.toFixed(3);
            const cov = result.metrics.context_coverage.coverage.toFixed(3);
            console.error(` [${completed}/${total}] ${status} ${result.task_id} | F1@${k}=${f1} | coverage=${cov}`);
        },
    });
    // Render output
    let output;
    if (format === "csv") {
        output = toCSV(summary);
    }
    else {
        output = renderReport(summary, format === "json" || format === "markdown" ? format : "text");
    }
    if (options.output) {
        const outputPath = path.resolve(options.output);
        await fs.mkdir(path.dirname(outputPath), { recursive: true });
        await fs.writeFile(outputPath, output, "utf8");
        console.error(`Wrote results to ${outputPath}`);
    }
    // Always print to stdout
    console.log(output);
}
/**
 * Coerce a CLI option to a positive integer.
 *
 * @param {string|number|null|undefined} value - Raw option value; strings are parsed base 10.
 * @param {number} fallback - Returned when the option was not supplied.
 * @returns {number} The integer, `fallback` when `value` is null/undefined,
 *   or NaN when the value is not an integer >= 1.
 */
function toPositiveInteger(value, fallback) {
    if (value == null) {
        return fallback;
    }
    const parsed = typeof value === "string" ? Number.parseInt(value, 10) : value;
    return Number.isInteger(parsed) && parsed >= 1 ? parsed : Number.NaN;
}