@gmickel/gno 1.4.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -880,11 +880,13 @@ bun run lint && bun run typecheck
  Use retrieval benchmark commands to track quality and latency over time:

  ```bash
+ gno bench docs/examples/bench-fixture.json
  bun run eval:hybrid
  bun run eval:hybrid:baseline
  bun run eval:hybrid:delta
  ```

+ - Public fixture runner: `gno bench <fixture.json>` reports Precision@K, Recall@K, F1@K, MRR, nDCG@K, and latency across BM25/vector/hybrid modes.
  - Benchmark guide: [evals/README.md](./evals/README.md)
  - Latest baseline snapshot: [evals/fixtures/hybrid-baseline/latest.json](./evals/fixtures/hybrid-baseline/latest.json)

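The new README bullet and the `gno bench docs/examples/bench-fixture.json` example above refer to a fixture file. As a rough sketch of its shape — inferred from the Zod `fixtureSchema` added in this release, with placeholder document ids and names; the shipped `docs/examples/bench-fixture.json` may look different — a fixture could contain:

```ts
// Minimal bench fixture, shown as a TypeScript literal for illustration.
// Serializing it with JSON.stringify gives the file `gno bench <fixture.json>` reads.
// All ids, paths, and collection names below are placeholders.
const exampleFixture = {
  version: 1,
  metadata: { name: "smoke-bench", description: "tiny retrieval quality check" },
  collection: "docs",        // optional: run every query against one collection
  topK: 5,                   // metric cutoff for Precision@K, Recall@K, nDCG@K
  modes: ["bm25", "hybrid"], // aliases: bm25, vector, hybrid, fast, no-rerank, thorough
  queries: [
    {
      id: "install-steps",
      query: "how do I install the CLI",
      expected: ["guides/install.md"], // expected docs listed directly...
    },
    {
      id: "hybrid-ranking",
      query: "how does hybrid search rank results",
      judgments: [
        // ...or graded relevance judgments, which also feed nDCG@K
        { doc: "concepts/hybrid.md", relevance: 2 },
        { doc: "reference/query.md", relevance: 1 },
      ],
    },
  ],
};
```

Per the fixture parser added below, every query must resolve to at least one expected document or judgment, and the `--collection`, `--top-k`, and `--mode` flags override the corresponding fixture-level values.
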
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@gmickel/gno",
-   "version": "1.4.0",
+   "version": "1.4.1",
    "description": "Local semantic search for your documents. Index Markdown, PDF, and Office files with hybrid BM25 + vector search.",
    "keywords": [
      "embeddings",
@@ -0,0 +1,247 @@
+ import { z } from "zod";
+
+ import type { BenchFixture, BenchMode, BenchOptions } from "./types";
+
+ const MODE_ALIASES = [
+   "bm25",
+   "vector",
+   "hybrid",
+   "fast",
+   "no-rerank",
+   "thorough",
+ ] as const;
+
+ type BenchModeAlias = (typeof MODE_ALIASES)[number];
+
+ const queryModeInputSchema = z.object({
+   mode: z.enum(["term", "intent", "hyde"]),
+   text: z.string().trim().min(1),
+ });
+
+ const modeObjectSchema = z.object({
+   name: z.string().trim().min(1).optional(),
+   type: z.enum(["bm25", "vector", "hybrid"]).optional(),
+   mode: z.enum(MODE_ALIASES).optional(),
+   noExpand: z.boolean().optional(),
+   noRerank: z.boolean().optional(),
+   candidateLimit: z.number().int().positive().optional(),
+   limit: z.number().int().positive().optional(),
+   queryModes: z.array(queryModeInputSchema).optional(),
+ });
+
+ const fixtureSchema = z.object({
+   version: z.literal(1),
+   metadata: z
+     .object({
+       name: z.string().optional(),
+       description: z.string().optional(),
+       tags: z.array(z.string()).optional(),
+     })
+     .optional(),
+   collection: z.string().trim().min(1).optional(),
+   topK: z.number().int().positive().optional(),
+   candidateLimit: z.number().int().positive().optional(),
+   modes: z.array(z.union([z.enum(MODE_ALIASES), modeObjectSchema])).optional(),
+   queries: z
+     .array(
+       z.object({
+         id: z.string().trim().min(1),
+         query: z.string().trim().min(1),
+         expected: z.array(z.string().trim().min(1)).optional(),
+         expectedDocuments: z.array(z.string().trim().min(1)).optional(),
+         expectedUris: z.array(z.string().trim().min(1)).optional(),
+         judgments: z
+           .array(
+             z.object({
+               docid: z.string().trim().min(1).optional(),
+               doc: z.string().trim().min(1).optional(),
+               uri: z.string().trim().min(1).optional(),
+               relevance: z.number().min(0),
+             })
+           )
+           .optional(),
+         collection: z.string().trim().min(1).optional(),
+         topK: z.number().int().positive().optional(),
+         queryModes: z.array(queryModeInputSchema).optional(),
+       })
+     )
+     .min(1),
+ });
+
+ type FixtureModeInput = NonNullable<
+   z.infer<typeof fixtureSchema>["modes"]
+ >[number];
+
+ export function normalizeBenchRef(value: string): string {
+   const trimmed = value.trim();
+   const queryIndex = trimmed.indexOf("?");
+   return queryIndex === -1 ? trimmed : trimmed.slice(0, queryIndex);
+ }
+
+ function normalizeMode(alias: BenchModeAlias): BenchMode {
+   switch (alias) {
+     case "bm25":
+       return { name: "bm25", type: "bm25" };
+     case "vector":
+       return { name: "vector", type: "vector" };
+     case "fast":
+       return {
+         name: "fast",
+         type: "hybrid",
+         noExpand: true,
+         noRerank: true,
+       };
+     case "no-rerank":
+       return { name: "no-rerank", type: "hybrid", noRerank: true };
+     case "thorough":
+       return { name: "thorough", type: "hybrid", depth: "thorough" };
+     case "hybrid":
+       return { name: "hybrid", type: "hybrid" };
+   }
+ }
+
+ function normalizeModeInput(input: FixtureModeInput): BenchMode {
+   if (typeof input === "string") {
+     return normalizeMode(input as BenchModeAlias);
+   }
+
+   const base = input.mode ? normalizeMode(input.mode) : undefined;
+   const type = input.type ?? base?.type ?? "hybrid";
+   const name = input.name ?? input.mode ?? type;
+   return {
+     ...base,
+     name,
+     type,
+     depth: base?.depth,
+     noExpand: input.noExpand ?? base?.noExpand,
+     noRerank: input.noRerank ?? base?.noRerank,
+     candidateLimit: input.candidateLimit,
+     limit: input.limit,
+     queryModes: input.queryModes,
+   };
+ }
+
+ function parseModeFlag(
+   mode: string
+ ): { ok: true; value: BenchMode } | { ok: false; error: string } {
+   const normalized = mode.trim() as BenchModeAlias;
+   if (!MODE_ALIASES.includes(normalized)) {
+     return {
+       ok: false,
+       error: `Unsupported bench mode: ${mode}. Supported: ${MODE_ALIASES.join(", ")}`,
+     };
+   }
+   return { ok: true, value: normalizeMode(normalized) };
+ }
+
+ function normalizeModes(
+   fixtureModes: z.infer<typeof fixtureSchema>["modes"],
+   optionModes?: string[]
+ ): BenchMode[] {
+   if (optionModes?.length) {
+     return optionModes.map((mode) => {
+       const parsed = parseModeFlag(mode);
+       if (!parsed.ok) {
+         throw new Error(parsed.error);
+       }
+       return parsed.value;
+     });
+   }
+
+   return (fixtureModes ?? ["bm25"]).map(normalizeModeInput);
+ }
+
+ function normalizeFixture(
+   parsed: z.infer<typeof fixtureSchema>,
+   options: BenchOptions
+ ): BenchFixture {
+   const modes = normalizeModes(parsed.modes, options.modes);
+   const topK = options.topK ?? parsed.topK ?? 10;
+   const candidateLimit = options.candidateLimit ?? parsed.candidateLimit;
+
+   return {
+     version: parsed.version,
+     metadata: parsed.metadata,
+     collection: options.collection ?? parsed.collection,
+     topK,
+     candidateLimit,
+     modes,
+     queries: parsed.queries.map((entry) => {
+       const explicitExpected = [
+         ...(entry.expected ?? []),
+         ...(entry.expectedDocuments ?? []),
+         ...(entry.expectedUris ?? []),
+       ].map(normalizeBenchRef);
+       const judgments =
+         entry.judgments?.flatMap((judgment) => {
+           const docid = judgment.docid ?? judgment.doc ?? judgment.uri;
+           return docid
+             ? [
+                 {
+                   docid: normalizeBenchRef(docid),
+                   relevance: judgment.relevance,
+                 },
+               ]
+             : [];
+         }) ?? [];
+       const expected =
+         explicitExpected.length > 0
+           ? explicitExpected
+           : judgments.map((judgment) => judgment.docid);
+
+       return {
+         id: entry.id,
+         query: entry.query,
+         expected,
+         judgments,
+         collection: options.collection ?? entry.collection ?? parsed.collection,
+         topK: entry.topK,
+         queryModes: entry.queryModes,
+       };
+     }),
+   };
+ }
+
+ export async function loadBenchFixture(
+   fixturePath: string,
+   options: BenchOptions
+ ): Promise<{ ok: true; fixture: BenchFixture } | { ok: false; error: string }> {
+   const file = Bun.file(fixturePath);
+   if (!(await file.exists())) {
+     return { ok: false, error: `Fixture not found: ${fixturePath}` };
+   }
+
+   let raw: unknown;
+   try {
+     raw = JSON.parse(await file.text());
+   } catch (error) {
+     return {
+       ok: false,
+       error: `Invalid JSON fixture: ${error instanceof Error ? error.message : String(error)}`,
+     };
+   }
+
+   const parsed = fixtureSchema.safeParse(raw);
+   if (!parsed.success) {
+     return { ok: false, error: z.prettifyError(parsed.error) };
+   }
+
+   try {
+     const fixture = normalizeFixture(parsed.data, options);
+     const missingExpected = fixture.queries.find(
+       (entry) => entry.expected.length === 0
+     );
+     if (missingExpected) {
+       return {
+         ok: false,
+         error: `Bench query "${missingExpected.id}" must define expected documents, expected URIs, or judgments`,
+       };
+     }
+     return { ok: true, fixture };
+   } catch (error) {
+     return {
+       ok: false,
+       error: error instanceof Error ? error.message : String(error),
+     };
+   }
+ }
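
A short usage sketch for the loader above, assuming a Bun runtime (the loader itself relies on `Bun.file`); the temp path, collection name, and document id are illustrative:

```ts
import { loadBenchFixture } from "./fixture";

// Illustrative only: write a tiny fixture to disk, then run it through the same
// validation path `gno bench` uses (Bun.file read, Zod parse, normalization).
await Bun.write(
  "/tmp/bench-fixture.json",
  JSON.stringify({
    version: 1,
    topK: 3,
    queries: [{ id: "q1", query: "hybrid search", expected: ["concepts/hybrid.md"] }],
  })
);

const loaded = await loadBenchFixture("/tmp/bench-fixture.json", {
  collection: "docs", // optional overrides mirror the CLI flags
  topK: 3,
});

if (!loaded.ok) {
  // Missing file, malformed JSON, schema violations, and queries without any
  // expected docs or judgments all surface here as a plain error string.
  console.error(loaded.error);
} else {
  // Normalization has applied defaults, e.g. modes falls back to ["bm25"] when omitted.
  console.log(loaded.fixture.modes.map((mode) => mode.name)); // ["bm25"]
}
```
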
@@ -0,0 +1,137 @@
+ /**
+  * Retrieval benchmark metric helpers.
+  *
+  * @module src/bench/metrics
+  */
+
+ export interface RelevanceJudgment {
+   docid: string;
+   relevance: number;
+ }
+
+ export interface RetrievalMetrics {
+   precisionAtK: number;
+   recallAtK: number;
+   f1AtK: number;
+   mrr: number;
+   ndcgAtK: number;
+ }
+
+ function round(value: number, places = 4): number {
+   return Number(value.toFixed(places));
+ }
+
+ /**
+  * Compute Precision@K: fraction of retrieved top-K docs that are relevant.
+  */
+ export function computePrecision(
+   output: string[],
+   expected: string[],
+   k: number
+ ): number {
+   if (k <= 0) {
+     return 0;
+   }
+   const expectedSet = new Set(expected);
+   const hits = output.slice(0, k).filter((docid) => expectedSet.has(docid));
+   return hits.length / k;
+ }
+
+ /**
+  * Compute Recall@K: fraction of relevant docs in top K results.
+  */
+ export function computeRecall(
+   output: string[],
+   expected: string[],
+   k: number
+ ): number {
+   if (expected.length === 0) return 1;
+   const topK = output.slice(0, k);
+   const hits = expected.filter((docid) => topK.includes(docid)).length;
+   return hits / expected.length;
+ }
+
+ /**
+  * Compute F1@K from precision and recall.
+  */
+ export function computeF1(precision: number, recall: number): number {
+   if (precision === 0 && recall === 0) {
+     return 0;
+   }
+   return (2 * precision * recall) / (precision + recall);
+ }
+
+ /**
+  * Compute nDCG@K: normalized discounted cumulative gain.
+  */
+ export function computeNdcg(
+   output: string[],
+   judgments: RelevanceJudgment[],
+   k: number
+ ): number {
+   if (judgments.length === 0) return 1;
+   const relMap = new Map(judgments.map((j) => [j.docid, j.relevance]));
+   const dcg = output.slice(0, k).reduce((sum, docid, i) => {
+     const rel = relMap.get(docid) ?? 0;
+     return sum + (2 ** rel - 1) / Math.log2(i + 2);
+   }, 0);
+   const idcg = [...judgments]
+     .sort((a, b) => b.relevance - a.relevance)
+     .slice(0, k)
+     .reduce((sum, j, i) => sum + (2 ** j.relevance - 1) / Math.log2(i + 2), 0);
+   return idcg > 0 ? dcg / idcg : 1;
+ }
+
+ /**
+  * Compute Mean Reciprocal Rank (single-query form).
+  * Returns reciprocal rank of first relevant hit in output.
+  */
+ export function computeMrr(output: string[], expected: string[]): number {
+   if (expected.length === 0) {
+     return 1;
+   }
+   const expectedSet = new Set(expected);
+   for (const [index, docid] of output.entries()) {
+     if (expectedSet.has(docid)) {
+       return 1 / (index + 1);
+     }
+   }
+   return 0;
+ }
+
+ export function computeRetrievalMetrics(input: {
+   output: string[];
+   expected: string[];
+   judgments: RelevanceJudgment[];
+   k: number;
+ }): RetrievalMetrics {
+   const precision = computePrecision(input.output, input.expected, input.k);
+   const recall = computeRecall(input.output, input.expected, input.k);
+   const judgmentSource =
+     input.judgments.length > 0
+       ? input.judgments
+       : input.expected.map((docid) => ({ docid, relevance: 1 }));
+
+   return {
+     precisionAtK: round(precision),
+     recallAtK: round(recall),
+     f1AtK: round(computeF1(precision, recall)),
+     mrr: round(computeMrr(input.output, input.expected)),
+     ndcgAtK: round(computeNdcg(input.output, judgmentSource, input.k)),
+   };
+ }
+
+ export function averageMetrics(metrics: RetrievalMetrics[]): RetrievalMetrics {
+   const average = (values: number[]): number =>
+     values.length === 0
+       ? 0
+       : values.reduce((sum, value) => sum + value, 0) / values.length;
+
+   return {
+     precisionAtK: round(average(metrics.map((m) => m.precisionAtK))),
+     recallAtK: round(average(metrics.map((m) => m.recallAtK))),
+     f1AtK: round(average(metrics.map((m) => m.f1AtK))),
+     mrr: round(average(metrics.map((m) => m.mrr))),
+     ndcgAtK: round(average(metrics.map((m) => m.ndcgAtK))),
+   };
+ }
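
A worked example of the helpers above; the document ids are placeholders and the numbers follow directly from the formulas in this module:

```ts
import { computeRetrievalMetrics } from "./metrics";

// Three retrieved docs, two of them relevant, evaluated at k = 3.
const metrics = computeRetrievalMetrics({
  output: ["a", "b", "c"], // ranked results
  expected: ["a", "c"],    // relevant docs
  judgments: [],           // empty => expected docs are treated as relevance 1
  k: 3,
});

// precisionAtK: 2 hits / k=3                 => 0.6667
// recallAtK:    2 hits / 2 relevant          => 1
// f1AtK:        2 * (2/3 * 1) / (2/3 + 1)    => 0.8
// mrr:          first relevant doc at rank 1 => 1
// ndcgAtK:      DCG 1.5 / IDCG ≈ 1.631       => ≈ 0.9197
console.log(metrics);
```
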
@@ -0,0 +1,96 @@
+ import type { QueryModeInput } from "../pipeline/types";
+ import type { RelevanceJudgment, RetrievalMetrics } from "./metrics";
+
+ export type BenchModeType = "bm25" | "vector" | "hybrid";
+
+ export interface BenchMode {
+   name: string;
+   type: BenchModeType;
+   depth?: "thorough";
+   noExpand?: boolean;
+   noRerank?: boolean;
+   candidateLimit?: number;
+   limit?: number;
+   queryModes?: QueryModeInput[];
+ }
+
+ export interface BenchCase {
+   id: string;
+   query: string;
+   expected: string[];
+   judgments: RelevanceJudgment[];
+   collection?: string;
+   topK?: number;
+   queryModes?: QueryModeInput[];
+ }
+
+ export interface BenchFixture {
+   version: 1;
+   metadata?: {
+     name?: string;
+     description?: string;
+     tags?: string[];
+   };
+   collection?: string;
+   topK: number;
+   candidateLimit?: number;
+   modes: BenchMode[];
+   queries: BenchCase[];
+ }
+
+ export interface BenchOptions {
+   configPath?: string;
+   indexName?: string;
+   collection?: string;
+   topK?: number;
+   candidateLimit?: number;
+   modes?: string[];
+   json?: boolean;
+ }
+
+ export interface BenchCaseResult {
+   id: string;
+   query: string;
+   topK: number;
+   expected: string[];
+   hits: string[];
+   topDocs: string[];
+   metrics: RetrievalMetrics;
+   latencyMs: number;
+   error?: string;
+ }
+
+ export interface BenchModeResult {
+   name: string;
+   type: BenchModeType;
+   status: "ok" | "failed";
+   queryCount: number;
+   failures: number;
+   metrics: RetrievalMetrics;
+   latency: {
+     p50Ms: number;
+     p95Ms: number;
+     meanMs: number;
+   };
+   cases: BenchCaseResult[];
+ }
+
+ export interface BenchOutput {
+   fixture: {
+     path: string;
+     name?: string;
+     version: 1;
+     queryCount: number;
+     topK: number;
+   };
+   generatedAt: string;
+   modes: BenchModeResult[];
+   meta: {
+     indexName: string;
+     collection?: string;
+   };
+ }
+
+ export type BenchResult =
+   | { success: true; data: BenchOutput }
+   | { success: false; error: string; isValidation?: boolean };
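
To make `BenchMode` concrete: these literals mirror what the string aliases in a fixture's `modes` array normalize into (see `normalizeMode` in the fixture parser above), so they are descriptive rather than new behavior:

```ts
import type { BenchMode } from "./types";

// How the fixture aliases expand (per normalizeMode in the fixture parser):
export const aliasExamples: Record<string, BenchMode> = {
  fast: { name: "fast", type: "hybrid", noExpand: true, noRerank: true },
  "no-rerank": { name: "no-rerank", type: "hybrid", noRerank: true },
  thorough: { name: "thorough", type: "hybrid", depth: "thorough" },
};
```
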
@@ -0,0 +1,280 @@
+ /**
+  * gno bench command implementation.
+  * Runs retrieval benchmarks from user fixtures.
+  *
+  * @module src/cli/commands/bench
+  */
+
+ import type {
+   BenchCase,
+   BenchCaseResult,
+   BenchMode,
+   BenchModeResult,
+   BenchOptions,
+   BenchResult,
+ } from "../../bench/types";
+ import type { SearchResult } from "../../pipeline/types";
+
+ import { loadBenchFixture, normalizeBenchRef } from "../../bench/fixture";
+ import { averageMetrics, computeRetrievalMetrics } from "../../bench/metrics";
+ import { DEFAULT_THOROUGH_CANDIDATE_LIMIT } from "../../core/depth-policy";
+ import { query } from "./query";
+ import { search } from "./search";
+ import { vsearch } from "./vsearch";
+
+ function round(value: number, places = 2): number {
+   return Number(value.toFixed(places));
+ }
+
+ function summarizeLatency(values: number[]): BenchModeResult["latency"] {
+   if (values.length === 0) {
+     return { p50Ms: 0, p95Ms: 0, meanMs: 0 };
+   }
+   const sorted = [...values].sort((a, b) => a - b);
+   const percentile = (p: number): number => {
+     const index = Math.ceil((p / 100) * sorted.length) - 1;
+     return sorted[Math.max(0, Math.min(sorted.length - 1, index))] ?? 0;
+   };
+   return {
+     p50Ms: round(percentile(50)),
+     p95Ms: round(percentile(95)),
+     meanMs: round(
+       values.reduce((sum, value) => sum + value, 0) / values.length
+     ),
+   };
+ }
+
+ function resultRefs(result: SearchResult): Set<string> {
+   return new Set(
+     [
+       result.docid,
+       result.uri,
+       normalizeBenchRef(result.uri),
+       result.source.relPath,
+       result.title,
+     ].filter((value): value is string => Boolean(value))
+   );
+ }
+
+ function findHits(
+   results: SearchResult[],
+   expected: string[],
+   k: number
+ ): string[] {
+   const hits: string[] = [];
+   const expectedSet = new Set(expected.map(normalizeBenchRef));
+
+   for (const result of results.slice(0, k)) {
+     const refs = resultRefs(result);
+     const hit = [...expectedSet].find((expectedRef) => refs.has(expectedRef));
+     if (hit && !hits.includes(hit)) {
+       hits.push(hit);
+     }
+   }
+
+   return hits;
+ }
+
+ function topDocs(results: SearchResult[]): string[] {
+   return results.map((result) => result.source.relPath);
+ }
+
+ function rankedMetricDocs(
+   results: SearchResult[],
+   expected: string[]
+ ): string[] {
+   const expectedSet = new Set(expected.map(normalizeBenchRef));
+   return results.map((result) => {
+     const refs = resultRefs(result);
+     return (
+       [...expectedSet].find((expectedRef) => refs.has(expectedRef)) ??
+       result.source.relPath
+     );
+   });
+ }
+
+ async function runModeCase(input: {
+   mode: BenchMode;
+   benchCase: BenchCase;
+   topK: number;
+   candidateLimit?: number;
+   options: BenchOptions;
+ }): Promise<BenchCaseResult> {
+   const { mode, benchCase, topK, options } = input;
+   const limit = mode.limit ?? topK;
+   const candidateLimit =
+     mode.candidateLimit ??
+     input.candidateLimit ??
+     (mode.depth === "thorough" ? DEFAULT_THOROUGH_CANDIDATE_LIMIT : undefined);
+   const startedAt = performance.now();
+   const queryModes = benchCase.queryModes ?? mode.queryModes;
+   let result:
+     | Awaited<ReturnType<typeof search>>
+     | Awaited<ReturnType<typeof vsearch>>
+     | Awaited<ReturnType<typeof query>>;
+
+   if (mode.type === "bm25") {
+     result = await search(benchCase.query, {
+       configPath: options.configPath,
+       indexName: options.indexName,
+       collection: benchCase.collection,
+       limit,
+       json: true,
+     });
+   } else if (mode.type === "vector") {
+     result = await vsearch(benchCase.query, {
+       configPath: options.configPath,
+       indexName: options.indexName,
+       collection: benchCase.collection,
+       limit,
+       json: true,
+     });
+   } else {
+     result = await query(benchCase.query, {
+       configPath: options.configPath,
+       indexName: options.indexName,
+       collection: benchCase.collection,
+       limit,
+       candidateLimit,
+       noExpand: mode.noExpand,
+       noRerank: mode.noRerank,
+       queryModes,
+       json: true,
+     });
+   }
+
+   const latencyMs = round(performance.now() - startedAt);
+   if (!result.success) {
+     return {
+       id: benchCase.id,
+       query: benchCase.query,
+       topK,
+       expected: benchCase.expected,
+       hits: [],
+       topDocs: [],
+       metrics: computeRetrievalMetrics({
+         output: [],
+         expected: benchCase.expected,
+         judgments: benchCase.judgments,
+         k: topK,
+       }),
+       latencyMs,
+       error: result.error,
+     };
+   }
+
+   const docs = topDocs(result.data.results);
+   const metricDocs = rankedMetricDocs(result.data.results, benchCase.expected);
+   const hits = findHits(result.data.results, benchCase.expected, topK);
+   return {
+     id: benchCase.id,
+     query: benchCase.query,
+     topK,
+     expected: benchCase.expected,
+     hits,
+     topDocs: docs.slice(0, topK),
+     metrics: computeRetrievalMetrics({
+       output: metricDocs,
+       expected: benchCase.expected,
+       judgments: benchCase.judgments,
+       k: topK,
+     }),
+     latencyMs,
+   };
+ }
+
+ /**
+  * Execute gno bench command.
+  */
+ export async function bench(
+   fixturePath: string,
+   options: BenchOptions = {}
+ ): Promise<BenchResult> {
+   const loaded = await loadBenchFixture(fixturePath, options);
+   if (!loaded.ok) {
+     return { success: false, error: loaded.error, isValidation: true };
+   }
+
+   const { fixture } = loaded;
+   const modeResults: BenchModeResult[] = [];
+
+   for (const mode of fixture.modes) {
+     const cases: BenchCaseResult[] = [];
+     for (const benchCase of fixture.queries) {
+       const topK = benchCase.topK ?? fixture.topK;
+       cases.push(
+         await runModeCase({
+           mode,
+           benchCase,
+           topK,
+           candidateLimit: fixture.candidateLimit,
+           options,
+         })
+       );
+     }
+
+     const failures = cases.filter((entry) => entry.error).length;
+     modeResults.push({
+       name: mode.name,
+       type: mode.type,
+       status: failures === cases.length ? "failed" : "ok",
+       queryCount: cases.length,
+       failures,
+       metrics: averageMetrics(cases.map((entry) => entry.metrics)),
+       latency: summarizeLatency(cases.map((entry) => entry.latencyMs)),
+       cases,
+     });
+   }
+
+   return {
+     success: true,
+     data: {
+       fixture: {
+         path: fixturePath,
+         name: fixture.metadata?.name,
+         version: fixture.version,
+         queryCount: fixture.queries.length,
+         topK: fixture.topK,
+       },
+       generatedAt: new Date().toISOString(),
+       modes: modeResults,
+       meta: {
+         indexName: options.indexName ?? "default",
+         collection: fixture.collection,
+       },
+     },
+   };
+ }
+
+ export function formatBench(
+   result: BenchResult,
+   options: { json?: boolean }
+ ): string {
+   if (!result.success) {
+     return options.json
+       ? JSON.stringify({
+           error: { code: "BENCH_FAILED", message: result.error },
+         })
+       : `Error: ${result.error}`;
+   }
+
+   if (options.json) {
+     return JSON.stringify(result.data, null, 2);
+   }
+
+   const lines = [
+     `Bench: ${result.data.fixture.name ?? result.data.fixture.path}`,
+     `Queries: ${result.data.fixture.queryCount} Top K: ${result.data.fixture.topK}`,
+     "",
+     "| Mode | Status | Precision@K | Recall@K | F1@K | MRR | nDCG@K | p95 ms | Failures |",
+     "| ---- | ------ | ----------- | -------- | ---- | --- | ------ | ------ | -------- |",
+   ];
+
+   for (const mode of result.data.modes) {
+     lines.push(
+       `| ${mode.name} | ${mode.status} | ${mode.metrics.precisionAtK.toFixed(3)} | ${mode.metrics.recallAtK.toFixed(3)} | ${mode.metrics.f1AtK.toFixed(3)} | ${mode.metrics.mrr.toFixed(3)} | ${mode.metrics.ndcgAtK.toFixed(3)} | ${mode.latency.p95Ms.toFixed(2)} | ${mode.failures} |`
+     );
+   }
+
+   return lines.join("\n");
+ }
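
A minimal sketch of driving the command programmatically with the exports above; the fixture path comes from the README example and the option values are placeholders:

```ts
import { bench, formatBench } from "./bench";

// Run the fixture in bm25 and hybrid modes and print the terminal summary table.
const result = await bench("docs/examples/bench-fixture.json", {
  modes: ["bm25", "hybrid"], // same aliases the --mode flag accepts
  topK: 5,
});

if (!result.success) {
  // Fixture and validation problems carry isValidation: true; runtime failures do not.
  console.error(result.error);
} else {
  console.log(formatBench(result, { json: false }));
  // Per-query detail (hits, topDocs, latencyMs, errors) lives in result.data.modes[].cases.
}
```
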
@@ -22,6 +22,7 @@ export const CMD = {
  search: "search",
  vsearch: "vsearch",
  query: "query",
+ bench: "bench",
  ask: "ask",
  get: "get",
  multiGet: "multi-get",
@@ -45,6 +46,7 @@ const FORMAT_SUPPORT: Record<CommandId, OutputFormat[]> = {
  [CMD.search]: ["terminal", "json", "files", "csv", "md", "xml"],
  [CMD.vsearch]: ["terminal", "json", "files", "csv", "md", "xml"],
  [CMD.query]: ["terminal", "json", "files", "csv", "md", "xml"],
+ [CMD.bench]: ["terminal", "json"],
  [CMD.ask]: ["terminal", "json", "md"],
  [CMD.get]: ["terminal", "json", "md"],
  [CMD.multiGet]: ["terminal", "json", "files", "md"],
@@ -677,6 +677,58 @@ function wireSearchCommands(program: Command): void {
      await writeOutput(output, format);
    });

+   // bench - Retrieval benchmark fixture runner
+   program
+     .command("bench <fixture>")
+     .description("Run retrieval quality benchmarks from a fixture")
+     .option("-c, --collection <name>", "override fixture collection")
+     .option("-k, --top-k <num>", "override top-k metric cutoff")
+     .option(
+       "--mode <name>",
+       "benchmark mode (repeatable): bm25, vector, hybrid, fast, no-rerank, thorough",
+       (value: string, previous: string[] = []) => [...previous, value],
+       []
+     )
+     .option("-C, --candidate-limit <num>", "max candidates passed to reranking")
+     .option("--json", "JSON output")
+     .action(async (fixture: string, cmdOpts: Record<string, unknown>) => {
+       const format = getFormat(cmdOpts);
+       assertFormatSupported(CMD.bench, format);
+       const globals = getGlobals();
+       const topK = cmdOpts.topK
+         ? parsePositiveInt("top-k", cmdOpts.topK)
+         : undefined;
+       const candidateLimit = cmdOpts.candidateLimit
+         ? parsePositiveInt("candidate-limit", cmdOpts.candidateLimit)
+         : undefined;
+
+       const { bench, formatBench } = await import("./commands/bench");
+       const result = await bench(fixture, {
+         configPath: globals.config,
+         indexName: globals.index,
+         collection: cmdOpts.collection as string | undefined,
+         topK,
+         candidateLimit,
+         modes:
+           Array.isArray(cmdOpts.mode) && cmdOpts.mode.length > 0
+             ? (cmdOpts.mode as string[])
+             : undefined,
+         json: format === "json",
+       });
+
+       if (!result.success) {
+         throw new CliError(
+           result.isValidation ? "VALIDATION" : "RUNTIME",
+           result.error
+         );
+       }
+
+       await writeOutput(
+         formatBench(result, { json: format === "json" }),
+         format
+       );
+     });
+
    // ask - Human-friendly query with grounded answer
    program
      .command("ask <query>")