@gmickel/gno 1.4.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -880,11 +880,13 @@ bun run lint && bun run typecheck
  Use retrieval benchmark commands to track quality and latency over time:

  ```bash
+ gno bench docs/examples/bench-fixture.json
  bun run eval:hybrid
  bun run eval:hybrid:baseline
  bun run eval:hybrid:delta
  ```

+ - Public fixture runner: `gno bench <fixture.json>` reports Precision@K, Recall@K, F1@K, MRR, nDCG@K, and latency across BM25/vector/hybrid modes.
  - Benchmark guide: [evals/README.md](./evals/README.md)
  - Latest baseline snapshot: [evals/fixtures/hybrid-baseline/latest.json](./evals/fixtures/hybrid-baseline/latest.json)

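The new README bullet and the `gno bench docs/examples/bench-fixture.json` example above refer to a fixture file. As a rough sketch of its shape — inferred from the Zod `fixtureSchema` added in this release, with placeholder document ids and names; the shipped `docs/examples/bench-fixture.json` may look different — a fixture could contain:

```ts
// Minimal bench fixture, shown as a TypeScript literal for illustration.
// Serializing it with JSON.stringify gives the file `gno bench <fixture.json>` reads.
// All ids, paths, and collection names below are placeholders.
const exampleFixture = {
  version: 1,
  metadata: { name: "smoke-bench", description: "tiny retrieval quality check" },
  collection: "docs",        // optional: run every query against one collection
  topK: 5,                   // metric cutoff for Precision@K, Recall@K, nDCG@K
  modes: ["bm25", "hybrid"], // aliases: bm25, vector, hybrid, fast, no-rerank, thorough
  queries: [
    {
      id: "install-steps",
      query: "how do I install the CLI",
      expected: ["guides/install.md"], // expected docs listed directly...
    },
    {
      id: "hybrid-ranking",
      query: "how does hybrid search rank results",
      judgments: [
        // ...or graded relevance judgments, which also feed nDCG@K
        { doc: "concepts/hybrid.md", relevance: 2 },
        { doc: "reference/query.md", relevance: 1 },
      ],
    },
  ],
};
```

Per the fixture parser added below, every query must resolve to at least one expected document or judgment, and the `--collection`, `--top-k`, and `--mode` flags override the corresponding fixture-level values.
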
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@gmickel/gno",
-   "version": "1.4.0",
+   "version": "1.4.1",
    "description": "Local semantic search for your documents. Index Markdown, PDF, and Office files with hybrid BM25 + vector search.",
    "keywords": [
      "embeddings",
@@ -0,0 +1,247 @@
+ import { z } from "zod";
+
+ import type { BenchFixture, BenchMode, BenchOptions } from "./types";
+
+ const MODE_ALIASES = [
+   "bm25",
+   "vector",
+   "hybrid",
+   "fast",
+   "no-rerank",
+   "thorough",
+ ] as const;
+
+ type BenchModeAlias = (typeof MODE_ALIASES)[number];
+
+ const queryModeInputSchema = z.object({
+   mode: z.enum(["term", "intent", "hyde"]),
+   text: z.string().trim().min(1),
+ });
+
+ const modeObjectSchema = z.object({
+   name: z.string().trim().min(1).optional(),
+   type: z.enum(["bm25", "vector", "hybrid"]).optional(),
+   mode: z.enum(MODE_ALIASES).optional(),
+   noExpand: z.boolean().optional(),
+   noRerank: z.boolean().optional(),
+   candidateLimit: z.number().int().positive().optional(),
+   limit: z.number().int().positive().optional(),
+   queryModes: z.array(queryModeInputSchema).optional(),
+ });
+
+ const fixtureSchema = z.object({
+   version: z.literal(1),
+   metadata: z
+     .object({
+       name: z.string().optional(),
+       description: z.string().optional(),
+       tags: z.array(z.string()).optional(),
+     })
+     .optional(),
+   collection: z.string().trim().min(1).optional(),
+   topK: z.number().int().positive().optional(),
+   candidateLimit: z.number().int().positive().optional(),
+   modes: z.array(z.union([z.enum(MODE_ALIASES), modeObjectSchema])).optional(),
+   queries: z
+     .array(
+       z.object({
+         id: z.string().trim().min(1),
+         query: z.string().trim().min(1),
+         expected: z.array(z.string().trim().min(1)).optional(),
+         expectedDocuments: z.array(z.string().trim().min(1)).optional(),
+         expectedUris: z.array(z.string().trim().min(1)).optional(),
+         judgments: z
+           .array(
+             z.object({
+               docid: z.string().trim().min(1).optional(),
+               doc: z.string().trim().min(1).optional(),
+               uri: z.string().trim().min(1).optional(),
+               relevance: z.number().min(0),
+             })
+           )
+           .optional(),
+         collection: z.string().trim().min(1).optional(),
+         topK: z.number().int().positive().optional(),
+         queryModes: z.array(queryModeInputSchema).optional(),
+       })
+     )
+     .min(1),
+ });
+
+ type FixtureModeInput = NonNullable<
+   z.infer<typeof fixtureSchema>["modes"]
+ >[number];
+
+ export function normalizeBenchRef(value: string): string {
+   const trimmed = value.trim();
+   const queryIndex = trimmed.indexOf("?");
+   return queryIndex === -1 ? trimmed : trimmed.slice(0, queryIndex);
+ }
+
+ function normalizeMode(alias: BenchModeAlias): BenchMode {
+   switch (alias) {
+     case "bm25":
+       return { name: "bm25", type: "bm25" };
+     case "vector":
+       return { name: "vector", type: "vector" };
+     case "fast":
+       return {
+         name: "fast",
+         type: "hybrid",
+         noExpand: true,
+         noRerank: true,
+       };
+     case "no-rerank":
+       return { name: "no-rerank", type: "hybrid", noRerank: true };
+     case "thorough":
+       return { name: "thorough", type: "hybrid", depth: "thorough" };
+     case "hybrid":
+       return { name: "hybrid", type: "hybrid" };
+   }
+ }
+
+ function normalizeModeInput(input: FixtureModeInput): BenchMode {
+   if (typeof input === "string") {
+     return normalizeMode(input as BenchModeAlias);
+   }
+
+   const base = input.mode ? normalizeMode(input.mode) : undefined;
+   const type = input.type ?? base?.type ?? "hybrid";
+   const name = input.name ?? input.mode ?? type;
+   return {
+     ...base,
+     name,
+     type,
+     depth: base?.depth,
+     noExpand: input.noExpand ?? base?.noExpand,
+     noRerank: input.noRerank ?? base?.noRerank,
+     candidateLimit: input.candidateLimit,
+     limit: input.limit,
+     queryModes: input.queryModes,
+   };
+ }
+
+ function parseModeFlag(
+   mode: string
+ ): { ok: true; value: BenchMode } | { ok: false; error: string } {
+   const normalized = mode.trim() as BenchModeAlias;
+   if (!MODE_ALIASES.includes(normalized)) {
+     return {
+       ok: false,
+       error: `Unsupported bench mode: ${mode}. Supported: ${MODE_ALIASES.join(", ")}`,
+     };
+   }
+   return { ok: true, value: normalizeMode(normalized) };
+ }
+
+ function normalizeModes(
+   fixtureModes: z.infer<typeof fixtureSchema>["modes"],
+   optionModes?: string[]
+ ): BenchMode[] {
+   if (optionModes?.length) {
+     return optionModes.map((mode) => {
+       const parsed = parseModeFlag(mode);
+       if (!parsed.ok) {
+         throw new Error(parsed.error);
+       }
+       return parsed.value;
+     });
+   }
+
+   return (fixtureModes ?? ["bm25"]).map(normalizeModeInput);
+ }
+
+ function normalizeFixture(
+   parsed: z.infer<typeof fixtureSchema>,
+   options: BenchOptions
+ ): BenchFixture {
+   const modes = normalizeModes(parsed.modes, options.modes);
+   const topK = options.topK ?? parsed.topK ?? 10;
+   const candidateLimit = options.candidateLimit ?? parsed.candidateLimit;
+
+   return {
+     version: parsed.version,
+     metadata: parsed.metadata,
+     collection: options.collection ?? parsed.collection,
+     topK,
+     candidateLimit,
+     modes,
+     queries: parsed.queries.map((entry) => {
+       const explicitExpected = [
+         ...(entry.expected ?? []),
+         ...(entry.expectedDocuments ?? []),
+         ...(entry.expectedUris ?? []),
+       ].map(normalizeBenchRef);
+       const judgments =
+         entry.judgments?.flatMap((judgment) => {
+           const docid = judgment.docid ?? judgment.doc ?? judgment.uri;
+           return docid
+             ? [
+                 {
+                   docid: normalizeBenchRef(docid),
+                   relevance: judgment.relevance,
+                 },
+               ]
+             : [];
+         }) ?? [];
+       const expected =
+         explicitExpected.length > 0
+           ? explicitExpected
+           : judgments.map((judgment) => judgment.docid);
+
+       return {
+         id: entry.id,
+         query: entry.query,
+         expected,
+         judgments,
+         collection: options.collection ?? entry.collection ?? parsed.collection,
+         topK: entry.topK,
+         queryModes: entry.queryModes,
+       };
+     }),
+   };
+ }
+
+ export async function loadBenchFixture(
+   fixturePath: string,
+   options: BenchOptions
+ ): Promise<{ ok: true; fixture: BenchFixture } | { ok: false; error: string }> {
+   const file = Bun.file(fixturePath);
+   if (!(await file.exists())) {
+     return { ok: false, error: `Fixture not found: ${fixturePath}` };
+   }
+
+   let raw: unknown;
+   try {
+     raw = JSON.parse(await file.text());
+   } catch (error) {
+     return {
+       ok: false,
+       error: `Invalid JSON fixture: ${error instanceof Error ? error.message : String(error)}`,
+     };
+   }
+
+   const parsed = fixtureSchema.safeParse(raw);
+   if (!parsed.success) {
+     return { ok: false, error: z.prettifyError(parsed.error) };
+   }
+
+   try {
+     const fixture = normalizeFixture(parsed.data, options);
+     const missingExpected = fixture.queries.find(
+       (entry) => entry.expected.length === 0
+     );
+     if (missingExpected) {
+       return {
+         ok: false,
+         error: `Bench query "${missingExpected.id}" must define expected documents, expected URIs, or judgments`,
+       };
+     }
+     return { ok: true, fixture };
+   } catch (error) {
+     return {
+       ok: false,
+       error: error instanceof Error ? error.message : String(error),
+     };
+   }
+ }
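
A short usage sketch for the loader above, assuming a Bun runtime (the loader itself relies on `Bun.file`); the temp path, collection name, and document id are illustrative:

```ts
import { loadBenchFixture } from "./fixture";

// Illustrative only: write a tiny fixture to disk, then run it through the same
// validation path `gno bench` uses (Bun.file read, Zod parse, normalization).
await Bun.write(
  "/tmp/bench-fixture.json",
  JSON.stringify({
    version: 1,
    topK: 3,
    queries: [{ id: "q1", query: "hybrid search", expected: ["concepts/hybrid.md"] }],
  })
);

const loaded = await loadBenchFixture("/tmp/bench-fixture.json", {
  collection: "docs", // optional overrides mirror the CLI flags
  topK: 3,
});

if (!loaded.ok) {
  // Missing file, malformed JSON, schema violations, and queries without any
  // expected docs or judgments all surface here as a plain error string.
  console.error(loaded.error);
} else {
  // Normalization has applied defaults, e.g. modes falls back to ["bm25"] when omitted.
  console.log(loaded.fixture.modes.map((mode) => mode.name)); // ["bm25"]
}
```
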
@@ -0,0 +1,137 @@
+ /**
+  * Retrieval benchmark metric helpers.
+  *
+  * @module src/bench/metrics
+  */
+
+ export interface RelevanceJudgment {
+   docid: string;
+   relevance: number;
+ }
+
+ export interface RetrievalMetrics {
+   precisionAtK: number;
+   recallAtK: number;
+   f1AtK: number;
+   mrr: number;
+   ndcgAtK: number;
+ }
+
+ function round(value: number, places = 4): number {
+   return Number(value.toFixed(places));
+ }
+
+ /**
+  * Compute Precision@K: fraction of retrieved top-K docs that are relevant.
+  */
+ export function computePrecision(
+   output: string[],
+   expected: string[],
+   k: number
+ ): number {
+   if (k <= 0) {
+     return 0;
+   }
+   const expectedSet = new Set(expected);
+   const hits = output.slice(0, k).filter((docid) => expectedSet.has(docid));
+   return hits.length / k;
+ }
+
+ /**
+  * Compute Recall@K: fraction of relevant docs in top K results.
+  */
+ export function computeRecall(
+   output: string[],
+   expected: string[],
+   k: number
+ ): number {
+   if (expected.length === 0) return 1;
+   const topK = output.slice(0, k);
+   const hits = expected.filter((docid) => topK.includes(docid)).length;
+   return hits / expected.length;
+ }
+
+ /**
+  * Compute F1@K from precision and recall.
+  */
+ export function computeF1(precision: number, recall: number): number {
+   if (precision === 0 && recall === 0) {
+     return 0;
+   }
+   return (2 * precision * recall) / (precision + recall);
+ }
+
+ /**
+  * Compute nDCG@K: normalized discounted cumulative gain.
+  */
+ export function computeNdcg(
+   output: string[],
+   judgments: RelevanceJudgment[],
+   k: number
+ ): number {
+   if (judgments.length === 0) return 1;
+   const relMap = new Map(judgments.map((j) => [j.docid, j.relevance]));
+   const dcg = output.slice(0, k).reduce((sum, docid, i) => {
+     const rel = relMap.get(docid) ?? 0;
+     return sum + (2 ** rel - 1) / Math.log2(i + 2);
+   }, 0);
+   const idcg = [...judgments]
+     .sort((a, b) => b.relevance - a.relevance)
+     .slice(0, k)
+     .reduce((sum, j, i) => sum + (2 ** j.relevance - 1) / Math.log2(i + 2), 0);
+   return idcg > 0 ? dcg / idcg : 1;
+ }
+
+ /**
+  * Compute Mean Reciprocal Rank (single-query form).
+  * Returns reciprocal rank of first relevant hit in output.
+  */
+ export function computeMrr(output: string[], expected: string[]): number {
+   if (expected.length === 0) {
+     return 1;
+   }
+   const expectedSet = new Set(expected);
+   for (const [index, docid] of output.entries()) {
+     if (expectedSet.has(docid)) {
+       return 1 / (index + 1);
+     }
+   }
+   return 0;
+ }
+
+ export function computeRetrievalMetrics(input: {
+   output: string[];
+   expected: string[];
+   judgments: RelevanceJudgment[];
+   k: number;
+ }): RetrievalMetrics {
+   const precision = computePrecision(input.output, input.expected, input.k);
+   const recall = computeRecall(input.output, input.expected, input.k);
+   const judgmentSource =
+     input.judgments.length > 0
+       ? input.judgments
+       : input.expected.map((docid) => ({ docid, relevance: 1 }));
+
+   return {
+     precisionAtK: round(precision),
+     recallAtK: round(recall),
+     f1AtK: round(computeF1(precision, recall)),
+     mrr: round(computeMrr(input.output, input.expected)),
+     ndcgAtK: round(computeNdcg(input.output, judgmentSource, input.k)),
+   };
+ }
+
+ export function averageMetrics(metrics: RetrievalMetrics[]): RetrievalMetrics {
+   const average = (values: number[]): number =>
+     values.length === 0
+       ? 0
+       : values.reduce((sum, value) => sum + value, 0) / values.length;
+
+   return {
+     precisionAtK: round(average(metrics.map((m) => m.precisionAtK))),
+     recallAtK: round(average(metrics.map((m) => m.recallAtK))),
+     f1AtK: round(average(metrics.map((m) => m.f1AtK))),
+     mrr: round(average(metrics.map((m) => m.mrr))),
+     ndcgAtK: round(average(metrics.map((m) => m.ndcgAtK))),
+   };
+ }
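
A worked example of the helpers above; the document ids are placeholders and the numbers follow directly from the formulas in this module:

```ts
import { computeRetrievalMetrics } from "./metrics";

// Three retrieved docs, two of them relevant, evaluated at k = 3.
const metrics = computeRetrievalMetrics({
  output: ["a", "b", "c"], // ranked results
  expected: ["a", "c"],    // relevant docs
  judgments: [],           // empty => expected docs are treated as relevance 1
  k: 3,
});

// precisionAtK: 2 hits / k=3                 => 0.6667
// recallAtK:    2 hits / 2 relevant          => 1
// f1AtK:        2 * (2/3 * 1) / (2/3 + 1)    => 0.8
// mrr:          first relevant doc at rank 1 => 1
// ndcgAtK:      DCG 1.5 / IDCG ≈ 1.631       => ≈ 0.9197
console.log(metrics);
```
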
@@ -0,0 +1,96 @@
+ import type { QueryModeInput } from "../pipeline/types";
+ import type { RelevanceJudgment, RetrievalMetrics } from "./metrics";
+
+ export type BenchModeType = "bm25" | "vector" | "hybrid";
+
+ export interface BenchMode {
+   name: string;
+   type: BenchModeType;
+   depth?: "thorough";
+   noExpand?: boolean;
+   noRerank?: boolean;
+   candidateLimit?: number;
+   limit?: number;
+   queryModes?: QueryModeInput[];
+ }
+
+ export interface BenchCase {
+   id: string;
+   query: string;
+   expected: string[];
+   judgments: RelevanceJudgment[];
+   collection?: string;
+   topK?: number;
+   queryModes?: QueryModeInput[];
+ }
+
+ export interface BenchFixture {
+   version: 1;
+   metadata?: {
+     name?: string;
+     description?: string;
+     tags?: string[];
+   };
+   collection?: string;
+   topK: number;
+   candidateLimit?: number;
+   modes: BenchMode[];
+   queries: BenchCase[];
+ }
+
+ export interface BenchOptions {
+   configPath?: string;
+   indexName?: string;
+   collection?: string;
+   topK?: number;
+   candidateLimit?: number;
+   modes?: string[];
+   json?: boolean;
+ }
+
+ export interface BenchCaseResult {
+   id: string;
+   query: string;
+   topK: number;
+   expected: string[];
+   hits: string[];
+   topDocs: string[];
+   metrics: RetrievalMetrics;
+   latencyMs: number;
+   error?: string;
+ }
+
+ export interface BenchModeResult {
+   name: string;
+   type: BenchModeType;
+   status: "ok" | "failed";
+   queryCount: number;
+   failures: number;
+   metrics: RetrievalMetrics;
+   latency: {
+     p50Ms: number;
+     p95Ms: number;
+     meanMs: number;
+   };
+   cases: BenchCaseResult[];
+ }
+
+ export interface BenchOutput {
+   fixture: {
+     path: string;
+     name?: string;
+     version: 1;
+     queryCount: number;
+     topK: number;
+   };
+   generatedAt: string;
+   modes: BenchModeResult[];
+   meta: {
+     indexName: string;
+     collection?: string;
+   };
+ }
+
+ export type BenchResult =
+   | { success: true; data: BenchOutput }
+   | { success: false; error: string; isValidation?: boolean };
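
To make `BenchMode` concrete: these literals mirror what the string aliases in a fixture's `modes` array normalize into (see `normalizeMode` in the fixture parser above), so they are descriptive rather than new behavior:

```ts
import type { BenchMode } from "./types";

// How the fixture aliases expand (per normalizeMode in the fixture parser):
export const aliasExamples: Record<string, BenchMode> = {
  fast: { name: "fast", type: "hybrid", noExpand: true, noRerank: true },
  "no-rerank": { name: "no-rerank", type: "hybrid", noRerank: true },
  thorough: { name: "thorough", type: "hybrid", depth: "thorough" },
};
```
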
@@ -0,0 +1,280 @@
+ /**
+  * gno bench command implementation.
+  * Runs retrieval benchmarks from user fixtures.
+  *
+  * @module src/cli/commands/bench
+  */
+
+ import type {
+   BenchCase,
+   BenchCaseResult,
+   BenchMode,
+   BenchModeResult,
+   BenchOptions,
+   BenchResult,
+ } from "../../bench/types";
+ import type { SearchResult } from "../../pipeline/types";
+
+ import { loadBenchFixture, normalizeBenchRef } from "../../bench/fixture";
+ import { averageMetrics, computeRetrievalMetrics } from "../../bench/metrics";
+ import { DEFAULT_THOROUGH_CANDIDATE_LIMIT } from "../../core/depth-policy";
+ import { query } from "./query";
+ import { search } from "./search";
+ import { vsearch } from "./vsearch";
+
+ function round(value: number, places = 2): number {
+   return Number(value.toFixed(places));
+ }
+
+ function summarizeLatency(values: number[]): BenchModeResult["latency"] {
+   if (values.length === 0) {
+     return { p50Ms: 0, p95Ms: 0, meanMs: 0 };
+   }
+   const sorted = [...values].sort((a, b) => a - b);
+   const percentile = (p: number): number => {
+     const index = Math.ceil((p / 100) * sorted.length) - 1;
+     return sorted[Math.max(0, Math.min(sorted.length - 1, index))] ?? 0;
+   };
+   return {
+     p50Ms: round(percentile(50)),
+     p95Ms: round(percentile(95)),
+     meanMs: round(
+       values.reduce((sum, value) => sum + value, 0) / values.length
+     ),
+   };
+ }
+
+ function resultRefs(result: SearchResult): Set<string> {
+   return new Set(
+     [
+       result.docid,
+       result.uri,
+       normalizeBenchRef(result.uri),
+       result.source.relPath,
+       result.title,
+     ].filter((value): value is string => Boolean(value))
+   );
+ }
+
+ function findHits(
+   results: SearchResult[],
+   expected: string[],
+   k: number
+ ): string[] {
+   const hits: string[] = [];
+   const expectedSet = new Set(expected.map(normalizeBenchRef));
+
+   for (const result of results.slice(0, k)) {
+     const refs = resultRefs(result);
+     const hit = [...expectedSet].find((expectedRef) => refs.has(expectedRef));
+     if (hit && !hits.includes(hit)) {
+       hits.push(hit);
+     }
+   }
+
+   return hits;
+ }
+
+ function topDocs(results: SearchResult[]): string[] {
+   return results.map((result) => result.source.relPath);
+ }
+
+ function rankedMetricDocs(
+   results: SearchResult[],
+   expected: string[]
+ ): string[] {
+   const expectedSet = new Set(expected.map(normalizeBenchRef));
+   return results.map((result) => {
+     const refs = resultRefs(result);
+     return (
+       [...expectedSet].find((expectedRef) => refs.has(expectedRef)) ??
+       result.source.relPath
+     );
+   });
+ }
+
+ async function runModeCase(input: {
+   mode: BenchMode;
+   benchCase: BenchCase;
+   topK: number;
+   candidateLimit?: number;
+   options: BenchOptions;
+ }): Promise<BenchCaseResult> {
+   const { mode, benchCase, topK, options } = input;
+   const limit = mode.limit ?? topK;
+   const candidateLimit =
+     mode.candidateLimit ??
+     input.candidateLimit ??
+     (mode.depth === "thorough" ? DEFAULT_THOROUGH_CANDIDATE_LIMIT : undefined);
+   const startedAt = performance.now();
+   const queryModes = benchCase.queryModes ?? mode.queryModes;
+   let result:
+     | Awaited<ReturnType<typeof search>>
+     | Awaited<ReturnType<typeof vsearch>>
+     | Awaited<ReturnType<typeof query>>;
+
+   if (mode.type === "bm25") {
+     result = await search(benchCase.query, {
+       configPath: options.configPath,
+       indexName: options.indexName,
+       collection: benchCase.collection,
+       limit,
+       json: true,
+     });
+   } else if (mode.type === "vector") {
+     result = await vsearch(benchCase.query, {
+       configPath: options.configPath,
+       indexName: options.indexName,
+       collection: benchCase.collection,
+       limit,
+       json: true,
+     });
+   } else {
+     result = await query(benchCase.query, {
+       configPath: options.configPath,
+       indexName: options.indexName,
+       collection: benchCase.collection,
+       limit,
+       candidateLimit,
+       noExpand: mode.noExpand,
+       noRerank: mode.noRerank,
+       queryModes,
+       json: true,
+     });
+   }
+
+   const latencyMs = round(performance.now() - startedAt);
+   if (!result.success) {
+     return {
+       id: benchCase.id,
+       query: benchCase.query,
+       topK,
+       expected: benchCase.expected,
+       hits: [],
+       topDocs: [],
+       metrics: computeRetrievalMetrics({
+         output: [],
+         expected: benchCase.expected,
+         judgments: benchCase.judgments,
+         k: topK,
+       }),
+       latencyMs,
+       error: result.error,
+     };
+   }
+
+   const docs = topDocs(result.data.results);
+   const metricDocs = rankedMetricDocs(result.data.results, benchCase.expected);
+   const hits = findHits(result.data.results, benchCase.expected, topK);
+   return {
+     id: benchCase.id,
+     query: benchCase.query,
+     topK,
+     expected: benchCase.expected,
+     hits,
+     topDocs: docs.slice(0, topK),
+     metrics: computeRetrievalMetrics({
+       output: metricDocs,
+       expected: benchCase.expected,
+       judgments: benchCase.judgments,
+       k: topK,
+     }),
+     latencyMs,
+   };
+ }
+
+ /**
+  * Execute gno bench command.
+  */
+ export async function bench(
+   fixturePath: string,
+   options: BenchOptions = {}
+ ): Promise<BenchResult> {
+   const loaded = await loadBenchFixture(fixturePath, options);
+   if (!loaded.ok) {
+     return { success: false, error: loaded.error, isValidation: true };
+   }
+
+   const { fixture } = loaded;
+   const modeResults: BenchModeResult[] = [];
+
+   for (const mode of fixture.modes) {
+     const cases: BenchCaseResult[] = [];
+     for (const benchCase of fixture.queries) {
+       const topK = benchCase.topK ?? fixture.topK;
+       cases.push(
+         await runModeCase({
+           mode,
+           benchCase,
+           topK,
+           candidateLimit: fixture.candidateLimit,
+           options,
+         })
+       );
+     }
+
+     const failures = cases.filter((entry) => entry.error).length;
+     modeResults.push({
+       name: mode.name,
+       type: mode.type,
+       status: failures === cases.length ? "failed" : "ok",
+       queryCount: cases.length,
+       failures,
+       metrics: averageMetrics(cases.map((entry) => entry.metrics)),
+       latency: summarizeLatency(cases.map((entry) => entry.latencyMs)),
+       cases,
+     });
+   }
+
+   return {
+     success: true,
+     data: {
+       fixture: {
+         path: fixturePath,
+         name: fixture.metadata?.name,
+         version: fixture.version,
+         queryCount: fixture.queries.length,
+         topK: fixture.topK,
+       },
+       generatedAt: new Date().toISOString(),
+       modes: modeResults,
+       meta: {
+         indexName: options.indexName ?? "default",
+         collection: fixture.collection,
+       },
+     },
+   };
+ }
+
+ export function formatBench(
+   result: BenchResult,
+   options: { json?: boolean }
+ ): string {
+   if (!result.success) {
+     return options.json
+       ? JSON.stringify({
+           error: { code: "BENCH_FAILED", message: result.error },
+         })
+       : `Error: ${result.error}`;
+   }
+
+   if (options.json) {
+     return JSON.stringify(result.data, null, 2);
+   }
+
+   const lines = [
+     `Bench: ${result.data.fixture.name ?? result.data.fixture.path}`,
+     `Queries: ${result.data.fixture.queryCount} Top K: ${result.data.fixture.topK}`,
+     "",
+     "| Mode | Status | Precision@K | Recall@K | F1@K | MRR | nDCG@K | p95 ms | Failures |",
+     "| ---- | ------ | ----------- | -------- | ---- | --- | ------ | ------ | -------- |",
+   ];
+
+   for (const mode of result.data.modes) {
+     lines.push(
+       `| ${mode.name} | ${mode.status} | ${mode.metrics.precisionAtK.toFixed(3)} | ${mode.metrics.recallAtK.toFixed(3)} | ${mode.metrics.f1AtK.toFixed(3)} | ${mode.metrics.mrr.toFixed(3)} | ${mode.metrics.ndcgAtK.toFixed(3)} | ${mode.latency.p95Ms.toFixed(2)} | ${mode.failures} |`
+     );
+   }
+
+   return lines.join("\n");
+ }
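
A minimal sketch of driving the command programmatically with the exports above; the fixture path comes from the README example and the option values are placeholders:

```ts
import { bench, formatBench } from "./bench";

// Run the fixture in bm25 and hybrid modes and print the terminal summary table.
const result = await bench("docs/examples/bench-fixture.json", {
  modes: ["bm25", "hybrid"], // same aliases the --mode flag accepts
  topK: 5,
});

if (!result.success) {
  // Fixture and validation problems carry isValidation: true; runtime failures do not.
  console.error(result.error);
} else {
  console.log(formatBench(result, { json: false }));
  // Per-query detail (hits, topDocs, latencyMs, errors) lives in result.data.modes[].cases.
}
```
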
@@ -22,6 +22,7 @@ export const CMD = {
  search: "search",
  vsearch: "vsearch",
  query: "query",
+ bench: "bench",
  ask: "ask",
  get: "get",
  multiGet: "multi-get",
@@ -45,6 +46,7 @@ const FORMAT_SUPPORT: Record<CommandId, OutputFormat[]> = {
  [CMD.search]: ["terminal", "json", "files", "csv", "md", "xml"],
  [CMD.vsearch]: ["terminal", "json", "files", "csv", "md", "xml"],
  [CMD.query]: ["terminal", "json", "files", "csv", "md", "xml"],
+ [CMD.bench]: ["terminal", "json"],
  [CMD.ask]: ["terminal", "json", "md"],
  [CMD.get]: ["terminal", "json", "md"],
  [CMD.multiGet]: ["terminal", "json", "files", "md"],
@@ -677,6 +677,58 @@ function wireSearchCommands(program: Command): void {
      await writeOutput(output, format);
    });

+   // bench - Retrieval benchmark fixture runner
+   program
+     .command("bench <fixture>")
+     .description("Run retrieval quality benchmarks from a fixture")
+     .option("-c, --collection <name>", "override fixture collection")
+     .option("-k, --top-k <num>", "override top-k metric cutoff")
+     .option(
+       "--mode <name>",
+       "benchmark mode (repeatable): bm25, vector, hybrid, fast, no-rerank, thorough",
+       (value: string, previous: string[] = []) => [...previous, value],
+       []
+     )
+     .option("-C, --candidate-limit <num>", "max candidates passed to reranking")
+     .option("--json", "JSON output")
+     .action(async (fixture: string, cmdOpts: Record<string, unknown>) => {
+       const format = getFormat(cmdOpts);
+       assertFormatSupported(CMD.bench, format);
+       const globals = getGlobals();
+       const topK = cmdOpts.topK
+         ? parsePositiveInt("top-k", cmdOpts.topK)
+         : undefined;
+       const candidateLimit = cmdOpts.candidateLimit
+         ? parsePositiveInt("candidate-limit", cmdOpts.candidateLimit)
+         : undefined;
+
+       const { bench, formatBench } = await import("./commands/bench");
+       const result = await bench(fixture, {
+         configPath: globals.config,
+         indexName: globals.index,
+         collection: cmdOpts.collection as string | undefined,
+         topK,
+         candidateLimit,
+         modes:
+           Array.isArray(cmdOpts.mode) && cmdOpts.mode.length > 0
+             ? (cmdOpts.mode as string[])
+             : undefined,
+         json: format === "json",
+       });
+
+       if (!result.success) {
+         throw new CliError(
+           result.isValidation ? "VALIDATION" : "RUNTIME",
+           result.error
+         );
+       }
+
+       await writeOutput(
+         formatBench(result, { json: format === "json" }),
+         format
+       );
+     });
+
    // ask - Human-friendly query with grounded answer
    program
      .command("ask <query>")