@gmickel/gno 1.4.0 → 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/package.json +2 -2
- package/src/bench/fixture.ts +247 -0
- package/src/bench/metrics.ts +137 -0
- package/src/bench/types.ts +96 -0
- package/src/cli/commands/bench.ts +280 -0
- package/src/cli/options.ts +2 -0
- package/src/cli/program.ts +52 -0
package/README.md
CHANGED
|
@@ -880,11 +880,13 @@ bun run lint && bun run typecheck
|
|
|
880
880
|
Use retrieval benchmark commands to track quality and latency over time:
|
|
881
881
|
|
|
882
882
|
```bash
|
|
883
|
+
gno bench docs/examples/bench-fixture.json
|
|
883
884
|
bun run eval:hybrid
|
|
884
885
|
bun run eval:hybrid:baseline
|
|
885
886
|
bun run eval:hybrid:delta
|
|
886
887
|
```
|
|
887
888
|
|
|
889
|
+
- Public fixture runner: `gno bench <fixture.json>` reports Precision@K, Recall@K, F1@K, MRR, nDCG@K, and latency across BM25/vector/hybrid modes.
|
|
888
890
|
- Benchmark guide: [evals/README.md](./evals/README.md)
|
|
889
891
|
- Latest baseline snapshot: [evals/fixtures/hybrid-baseline/latest.json](./evals/fixtures/hybrid-baseline/latest.json)
|
|
890
892
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gmickel/gno",
|
|
3
|
-
"version": "1.4.
|
|
3
|
+
"version": "1.4.2",
|
|
4
4
|
"description": "Local semantic search for your documents. Index Markdown, PDF, and Office files with hybrid BM25 + vector search.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"embeddings",
|
|
@@ -158,7 +158,7 @@
|
|
|
158
158
|
"franc": "6.2.0",
|
|
159
159
|
"lucide-react": "1.8.0",
|
|
160
160
|
"markitdown-ts": "0.0.9",
|
|
161
|
-
"minimatch": "10.
|
|
161
|
+
"minimatch": "10.2.3",
|
|
162
162
|
"nanoid": "5.1.6",
|
|
163
163
|
"node-llama-cpp": "3.18.1",
|
|
164
164
|
"officeparser": "6.0.4",
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
|
|
3
|
+
import type { BenchFixture, BenchMode, BenchOptions } from "./types";
|
|
4
|
+
|
|
5
|
+
const MODE_ALIASES = [
|
|
6
|
+
"bm25",
|
|
7
|
+
"vector",
|
|
8
|
+
"hybrid",
|
|
9
|
+
"fast",
|
|
10
|
+
"no-rerank",
|
|
11
|
+
"thorough",
|
|
12
|
+
] as const;
|
|
13
|
+
|
|
14
|
+
type BenchModeAlias = (typeof MODE_ALIASES)[number];
|
|
15
|
+
|
|
16
|
+
const queryModeInputSchema = z.object({
|
|
17
|
+
mode: z.enum(["term", "intent", "hyde"]),
|
|
18
|
+
text: z.string().trim().min(1),
|
|
19
|
+
});
|
|
20
|
+
|
|
21
|
+
const modeObjectSchema = z.object({
|
|
22
|
+
name: z.string().trim().min(1).optional(),
|
|
23
|
+
type: z.enum(["bm25", "vector", "hybrid"]).optional(),
|
|
24
|
+
mode: z.enum(MODE_ALIASES).optional(),
|
|
25
|
+
noExpand: z.boolean().optional(),
|
|
26
|
+
noRerank: z.boolean().optional(),
|
|
27
|
+
candidateLimit: z.number().int().positive().optional(),
|
|
28
|
+
limit: z.number().int().positive().optional(),
|
|
29
|
+
queryModes: z.array(queryModeInputSchema).optional(),
|
|
30
|
+
});
|
|
31
|
+
|
|
32
|
+
const fixtureSchema = z.object({
|
|
33
|
+
version: z.literal(1),
|
|
34
|
+
metadata: z
|
|
35
|
+
.object({
|
|
36
|
+
name: z.string().optional(),
|
|
37
|
+
description: z.string().optional(),
|
|
38
|
+
tags: z.array(z.string()).optional(),
|
|
39
|
+
})
|
|
40
|
+
.optional(),
|
|
41
|
+
collection: z.string().trim().min(1).optional(),
|
|
42
|
+
topK: z.number().int().positive().optional(),
|
|
43
|
+
candidateLimit: z.number().int().positive().optional(),
|
|
44
|
+
modes: z.array(z.union([z.enum(MODE_ALIASES), modeObjectSchema])).optional(),
|
|
45
|
+
queries: z
|
|
46
|
+
.array(
|
|
47
|
+
z.object({
|
|
48
|
+
id: z.string().trim().min(1),
|
|
49
|
+
query: z.string().trim().min(1),
|
|
50
|
+
expected: z.array(z.string().trim().min(1)).optional(),
|
|
51
|
+
expectedDocuments: z.array(z.string().trim().min(1)).optional(),
|
|
52
|
+
expectedUris: z.array(z.string().trim().min(1)).optional(),
|
|
53
|
+
judgments: z
|
|
54
|
+
.array(
|
|
55
|
+
z.object({
|
|
56
|
+
docid: z.string().trim().min(1).optional(),
|
|
57
|
+
doc: z.string().trim().min(1).optional(),
|
|
58
|
+
uri: z.string().trim().min(1).optional(),
|
|
59
|
+
relevance: z.number().min(0),
|
|
60
|
+
})
|
|
61
|
+
)
|
|
62
|
+
.optional(),
|
|
63
|
+
collection: z.string().trim().min(1).optional(),
|
|
64
|
+
topK: z.number().int().positive().optional(),
|
|
65
|
+
queryModes: z.array(queryModeInputSchema).optional(),
|
|
66
|
+
})
|
|
67
|
+
)
|
|
68
|
+
.min(1),
|
|
69
|
+
});
|
|
70
|
+
|
|
71
|
+
type FixtureModeInput = NonNullable<
|
|
72
|
+
z.infer<typeof fixtureSchema>["modes"]
|
|
73
|
+
>[number];
|
|
74
|
+
|
|
75
|
+
export function normalizeBenchRef(value: string): string {
|
|
76
|
+
const trimmed = value.trim();
|
|
77
|
+
const queryIndex = trimmed.indexOf("?");
|
|
78
|
+
return queryIndex === -1 ? trimmed : trimmed.slice(0, queryIndex);
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
function normalizeMode(alias: BenchModeAlias): BenchMode {
|
|
82
|
+
switch (alias) {
|
|
83
|
+
case "bm25":
|
|
84
|
+
return { name: "bm25", type: "bm25" };
|
|
85
|
+
case "vector":
|
|
86
|
+
return { name: "vector", type: "vector" };
|
|
87
|
+
case "fast":
|
|
88
|
+
return {
|
|
89
|
+
name: "fast",
|
|
90
|
+
type: "hybrid",
|
|
91
|
+
noExpand: true,
|
|
92
|
+
noRerank: true,
|
|
93
|
+
};
|
|
94
|
+
case "no-rerank":
|
|
95
|
+
return { name: "no-rerank", type: "hybrid", noRerank: true };
|
|
96
|
+
case "thorough":
|
|
97
|
+
return { name: "thorough", type: "hybrid", depth: "thorough" };
|
|
98
|
+
case "hybrid":
|
|
99
|
+
return { name: "hybrid", type: "hybrid" };
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
function normalizeModeInput(input: FixtureModeInput): BenchMode {
|
|
104
|
+
if (typeof input === "string") {
|
|
105
|
+
return normalizeMode(input as BenchModeAlias);
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
const base = input.mode ? normalizeMode(input.mode) : undefined;
|
|
109
|
+
const type = input.type ?? base?.type ?? "hybrid";
|
|
110
|
+
const name = input.name ?? input.mode ?? type;
|
|
111
|
+
return {
|
|
112
|
+
...base,
|
|
113
|
+
name,
|
|
114
|
+
type,
|
|
115
|
+
depth: base?.depth,
|
|
116
|
+
noExpand: input.noExpand ?? base?.noExpand,
|
|
117
|
+
noRerank: input.noRerank ?? base?.noRerank,
|
|
118
|
+
candidateLimit: input.candidateLimit,
|
|
119
|
+
limit: input.limit,
|
|
120
|
+
queryModes: input.queryModes,
|
|
121
|
+
};
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
/**
 * Validate a `--mode` CLI value and expand it into a full bench mode.
 *
 * Matching ignores surrounding whitespace and letter case, so `BM25` and
 * ` hybrid ` are accepted. The alias is looked up in `MODE_ALIASES` rather
 * than asserted with a cast, so an unknown string can never reach
 * `normalizeMode`.
 *
 * @param mode - Raw flag value as typed by the user.
 * @returns `{ ok: true, value }` with the normalized mode, or
 *   `{ ok: false, error }` naming the rejected value and the supported aliases.
 */
function parseModeFlag(
  mode: string
): { ok: true; value: BenchMode } | { ok: false; error: string } {
  const candidate = mode.trim().toLowerCase();
  const alias = MODE_ALIASES.find((known) => known === candidate);
  if (alias === undefined) {
    return {
      ok: false,
      // Echo the value exactly as given so the user can spot the typo.
      error: `Unsupported bench mode: ${mode}. Supported: ${MODE_ALIASES.join(", ")}`,
    };
  }
  return { ok: true, value: normalizeMode(alias) };
}
|
|
136
|
+
|
|
137
|
+
/**
 * Resolve the list of modes a benchmark run will execute.
 *
 * Precedence:
 * 1. Modes passed as options (e.g. repeated `--mode` flags), when non-empty.
 * 2. Modes declared in the fixture, when non-empty.
 * 3. A single default `bm25` mode.
 *
 * An empty fixture `modes` array is treated the same as an omitted one, so a
 * fixture can never produce a run with zero modes.
 *
 * @param fixtureModes - `modes` value from the parsed fixture, if any.
 * @param optionModes - Mode aliases supplied by the caller, if any.
 * @returns The normalized modes, in the order they were declared.
 * @throws Error when an option mode is not a supported alias.
 */
function normalizeModes(
  fixtureModes: z.infer<typeof fixtureSchema>["modes"],
  optionModes?: string[]
): BenchMode[] {
  if (optionModes?.length) {
    return optionModes.map((mode) => {
      const parsed = parseModeFlag(mode);
      if (!parsed.ok) {
        throw new Error(parsed.error);
      }
      return parsed.value;
    });
  }

  // `modes: []` would otherwise yield an empty run; fall back to the default.
  if (!fixtureModes?.length) {
    return [normalizeModeInput("bm25")];
  }

  return fixtureModes.map((entry) => normalizeModeInput(entry));
}
|
|
153
|
+
|
|
154
|
+
/**
 * Convert a schema-validated fixture into the shape used by the bench runner.
 *
 * Caller options take precedence over fixture values for `topK`,
 * `candidateLimit`, `collection` and the mode list. `topK` defaults to 10.
 *
 * For each query:
 * - `expected`, `expectedDocuments` and `expectedUris` are merged into one
 *   list of normalized refs, with repeated refs kept once.
 * - Judgments are keyed by the first of `docid`, `doc`, `uri` that is set;
 *   a judgment with none of them is dropped.
 * - When no explicit expected refs are given, the expected list is derived
 *   from the judgments with `relevance > 0`. A `relevance: 0` judgment marks a
 *   document as non-relevant and must not count as an expected hit.
 *
 * @param parsed - Fixture data that already passed `fixtureSchema`.
 * @param options - Caller overrides.
 * @returns The normalized fixture.
 * @throws Error when an option mode is not a supported alias.
 */
function normalizeFixture(
  parsed: z.infer<typeof fixtureSchema>,
  options: BenchOptions
): BenchFixture {
  const modes = normalizeModes(parsed.modes, options.modes);
  const topK = options.topK ?? parsed.topK ?? 10;
  const candidateLimit = options.candidateLimit ?? parsed.candidateLimit;

  return {
    version: parsed.version,
    metadata: parsed.metadata,
    collection: options.collection ?? parsed.collection,
    topK,
    candidateLimit,
    modes,
    queries: parsed.queries.map((entry) => {
      const explicitExpected = [
        ...(entry.expected ?? []),
        ...(entry.expectedDocuments ?? []),
        ...(entry.expectedUris ?? []),
      ].map(normalizeBenchRef);
      const judgments =
        entry.judgments?.flatMap((judgment) => {
          const docid = judgment.docid ?? judgment.doc ?? judgment.uri;
          return docid
            ? [
                {
                  docid: normalizeBenchRef(docid),
                  relevance: judgment.relevance,
                },
              ]
            : [];
        }) ?? [];
      // Only positively graded judgments describe documents we expect to see.
      const derivedExpected = judgments
        .filter((judgment) => judgment.relevance > 0)
        .map((judgment) => judgment.docid);
      // Set keeps first-seen order while removing repeated refs.
      const expected = [
        ...new Set(
          explicitExpected.length > 0 ? explicitExpected : derivedExpected
        ),
      ];

      return {
        id: entry.id,
        query: entry.query,
        expected,
        judgments,
        collection: options.collection ?? entry.collection ?? parsed.collection,
        topK: entry.topK,
        queryModes: entry.queryModes,
      };
    }),
  };
}
|
|
204
|
+
|
|
205
|
+
export async function loadBenchFixture(
|
|
206
|
+
fixturePath: string,
|
|
207
|
+
options: BenchOptions
|
|
208
|
+
): Promise<{ ok: true; fixture: BenchFixture } | { ok: false; error: string }> {
|
|
209
|
+
const file = Bun.file(fixturePath);
|
|
210
|
+
if (!(await file.exists())) {
|
|
211
|
+
return { ok: false, error: `Fixture not found: ${fixturePath}` };
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
let raw: unknown;
|
|
215
|
+
try {
|
|
216
|
+
raw = JSON.parse(await file.text());
|
|
217
|
+
} catch (error) {
|
|
218
|
+
return {
|
|
219
|
+
ok: false,
|
|
220
|
+
error: `Invalid JSON fixture: ${error instanceof Error ? error.message : String(error)}`,
|
|
221
|
+
};
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
const parsed = fixtureSchema.safeParse(raw);
|
|
225
|
+
if (!parsed.success) {
|
|
226
|
+
return { ok: false, error: z.prettifyError(parsed.error) };
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
try {
|
|
230
|
+
const fixture = normalizeFixture(parsed.data, options);
|
|
231
|
+
const missingExpected = fixture.queries.find(
|
|
232
|
+
(entry) => entry.expected.length === 0
|
|
233
|
+
);
|
|
234
|
+
if (missingExpected) {
|
|
235
|
+
return {
|
|
236
|
+
ok: false,
|
|
237
|
+
error: `Bench query "${missingExpected.id}" must define expected documents, expected URIs, or judgments`,
|
|
238
|
+
};
|
|
239
|
+
}
|
|
240
|
+
return { ok: true, fixture };
|
|
241
|
+
} catch (error) {
|
|
242
|
+
return {
|
|
243
|
+
ok: false,
|
|
244
|
+
error: error instanceof Error ? error.message : String(error),
|
|
245
|
+
};
|
|
246
|
+
}
|
|
247
|
+
}
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Retrieval benchmark metric helpers.
|
|
3
|
+
*
|
|
4
|
+
* @module src/bench/metrics
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
export interface RelevanceJudgment {
|
|
8
|
+
docid: string;
|
|
9
|
+
relevance: number;
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
export interface RetrievalMetrics {
|
|
13
|
+
precisionAtK: number;
|
|
14
|
+
recallAtK: number;
|
|
15
|
+
f1AtK: number;
|
|
16
|
+
mrr: number;
|
|
17
|
+
ndcgAtK: number;
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
function round(value: number, places = 4): number {
|
|
21
|
+
return Number(value.toFixed(places));
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
/**
 * Compute Precision@K: fraction of the top-K slots filled by a relevant doc.
 *
 * The denominator is always `k`, not the number of results returned, so a
 * ranking shorter than `k` is penalized. Each relevant docid is credited at
 * most once: repeating the same relevant docid in the ranking earns no
 * additional precision.
 *
 * @param output - Ranked docids, best first.
 * @param expected - Docids considered relevant.
 * @param k - Cutoff rank.
 * @returns Precision at `k`; 0 when `k <= 0`.
 */
export function computePrecision(
  output: string[],
  expected: string[],
  k: number
): number {
  if (k <= 0) {
    return 0;
  }
  const expectedSet = new Set(expected);
  const uniqueHits = new Set(
    output.slice(0, k).filter((docid) => expectedSet.has(docid))
  );
  return uniqueHits.size / k;
}
|
|
39
|
+
|
|
40
|
+
/**
 * Compute Recall@K: fraction of the relevant docs that appear in the top K.
 *
 * `expected` is treated as a set, so a ref listed more than once counts as a
 * single relevant document in both numerator and denominator.
 *
 * @param output - Ranked docids, best first.
 * @param expected - Docids considered relevant.
 * @param k - Cutoff rank.
 * @returns Recall at `k`; 1 when nothing is expected, 0 when `k <= 0`.
 */
export function computeRecall(
  output: string[],
  expected: string[],
  k: number
): number {
  if (expected.length === 0) return 1;
  // A negative k would make slice(0, k) drop items from the tail instead.
  if (k <= 0) return 0;

  const relevant = new Set(expected);
  const retrieved = new Set(output.slice(0, k));
  let hits = 0;
  for (const docid of relevant) {
    if (retrieved.has(docid)) {
      hits += 1;
    }
  }
  return hits / relevant.size;
}
|
|
53
|
+
|
|
54
|
+
/**
|
|
55
|
+
* Compute F1@K from precision and recall.
|
|
56
|
+
*/
|
|
57
|
+
export function computeF1(precision: number, recall: number): number {
|
|
58
|
+
if (precision === 0 && recall === 0) {
|
|
59
|
+
return 0;
|
|
60
|
+
}
|
|
61
|
+
return (2 * precision * recall) / (precision + recall);
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
/**
 * Compute nDCG@K: normalized discounted cumulative gain.
 *
 * Uses the exponential gain `2^relevance - 1` and a `log2(rank + 2)` discount
 * with zero-based ranks. Docids without a judgment contribute no gain.
 *
 * Two guards keep the score from exceeding 1:
 * - a docid repeated in `output` keeps its rank slot but earns gain only on
 *   its first occurrence;
 * - repeated judgments for one docid collapse to a single grade (the last
 *   one listed), so the ideal ranking does not count a document twice.
 *
 * @param output - Ranked docids, best first.
 * @param judgments - Graded relevance per docid.
 * @param k - Cutoff rank.
 * @returns nDCG at `k`; 1 when there are no judgments or every grade is 0,
 *   0 when `k <= 0`.
 */
export function computeNdcg(
  output: string[],
  judgments: RelevanceJudgment[],
  k: number
): number {
  if (judgments.length === 0) return 1;
  if (k <= 0) return 0;

  const gain = (relevance: number): number => 2 ** relevance - 1;
  const discount = (rank: number): number => Math.log2(rank + 2);

  const relMap = new Map<string, number>(
    judgments.map((j) => [j.docid, j.relevance])
  );

  const credited = new Set<string>();
  let dcg = 0;
  output.slice(0, k).forEach((docid, rank) => {
    if (credited.has(docid)) {
      return;
    }
    credited.add(docid);
    dcg += gain(relMap.get(docid) ?? 0) / discount(rank);
  });

  // Ideal ranking: one entry per judged docid, best grades first.
  const idcg = [...relMap.values()]
    .sort((a, b) => b - a)
    .slice(0, k)
    .reduce((sum, relevance, rank) => sum + gain(relevance) / discount(rank), 0);

  return idcg > 0 ? dcg / idcg : 1;
}
|
|
84
|
+
|
|
85
|
+
/**
|
|
86
|
+
* Compute Mean Reciprocal Rank (single-query form).
|
|
87
|
+
* Returns reciprocal rank of first relevant hit in output.
|
|
88
|
+
*/
|
|
89
|
+
export function computeMrr(output: string[], expected: string[]): number {
|
|
90
|
+
if (expected.length === 0) {
|
|
91
|
+
return 1;
|
|
92
|
+
}
|
|
93
|
+
const expectedSet = new Set(expected);
|
|
94
|
+
for (const [index, docid] of output.entries()) {
|
|
95
|
+
if (expectedSet.has(docid)) {
|
|
96
|
+
return 1 / (index + 1);
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
return 0;
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
export function computeRetrievalMetrics(input: {
|
|
103
|
+
output: string[];
|
|
104
|
+
expected: string[];
|
|
105
|
+
judgments: RelevanceJudgment[];
|
|
106
|
+
k: number;
|
|
107
|
+
}): RetrievalMetrics {
|
|
108
|
+
const precision = computePrecision(input.output, input.expected, input.k);
|
|
109
|
+
const recall = computeRecall(input.output, input.expected, input.k);
|
|
110
|
+
const judgmentSource =
|
|
111
|
+
input.judgments.length > 0
|
|
112
|
+
? input.judgments
|
|
113
|
+
: input.expected.map((docid) => ({ docid, relevance: 1 }));
|
|
114
|
+
|
|
115
|
+
return {
|
|
116
|
+
precisionAtK: round(precision),
|
|
117
|
+
recallAtK: round(recall),
|
|
118
|
+
f1AtK: round(computeF1(precision, recall)),
|
|
119
|
+
mrr: round(computeMrr(input.output, input.expected)),
|
|
120
|
+
ndcgAtK: round(computeNdcg(input.output, judgmentSource, input.k)),
|
|
121
|
+
};
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
export function averageMetrics(metrics: RetrievalMetrics[]): RetrievalMetrics {
|
|
125
|
+
const average = (values: number[]): number =>
|
|
126
|
+
values.length === 0
|
|
127
|
+
? 0
|
|
128
|
+
: values.reduce((sum, value) => sum + value, 0) / values.length;
|
|
129
|
+
|
|
130
|
+
return {
|
|
131
|
+
precisionAtK: round(average(metrics.map((m) => m.precisionAtK))),
|
|
132
|
+
recallAtK: round(average(metrics.map((m) => m.recallAtK))),
|
|
133
|
+
f1AtK: round(average(metrics.map((m) => m.f1AtK))),
|
|
134
|
+
mrr: round(average(metrics.map((m) => m.mrr))),
|
|
135
|
+
ndcgAtK: round(average(metrics.map((m) => m.ndcgAtK))),
|
|
136
|
+
};
|
|
137
|
+
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import type { QueryModeInput } from "../pipeline/types";
|
|
2
|
+
import type { RelevanceJudgment, RetrievalMetrics } from "./metrics";
|
|
3
|
+
|
|
4
|
+
export type BenchModeType = "bm25" | "vector" | "hybrid";
|
|
5
|
+
|
|
6
|
+
export interface BenchMode {
|
|
7
|
+
name: string;
|
|
8
|
+
type: BenchModeType;
|
|
9
|
+
depth?: "thorough";
|
|
10
|
+
noExpand?: boolean;
|
|
11
|
+
noRerank?: boolean;
|
|
12
|
+
candidateLimit?: number;
|
|
13
|
+
limit?: number;
|
|
14
|
+
queryModes?: QueryModeInput[];
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
export interface BenchCase {
|
|
18
|
+
id: string;
|
|
19
|
+
query: string;
|
|
20
|
+
expected: string[];
|
|
21
|
+
judgments: RelevanceJudgment[];
|
|
22
|
+
collection?: string;
|
|
23
|
+
topK?: number;
|
|
24
|
+
queryModes?: QueryModeInput[];
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
export interface BenchFixture {
|
|
28
|
+
version: 1;
|
|
29
|
+
metadata?: {
|
|
30
|
+
name?: string;
|
|
31
|
+
description?: string;
|
|
32
|
+
tags?: string[];
|
|
33
|
+
};
|
|
34
|
+
collection?: string;
|
|
35
|
+
topK: number;
|
|
36
|
+
candidateLimit?: number;
|
|
37
|
+
modes: BenchMode[];
|
|
38
|
+
queries: BenchCase[];
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
export interface BenchOptions {
|
|
42
|
+
configPath?: string;
|
|
43
|
+
indexName?: string;
|
|
44
|
+
collection?: string;
|
|
45
|
+
topK?: number;
|
|
46
|
+
candidateLimit?: number;
|
|
47
|
+
modes?: string[];
|
|
48
|
+
json?: boolean;
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
export interface BenchCaseResult {
|
|
52
|
+
id: string;
|
|
53
|
+
query: string;
|
|
54
|
+
topK: number;
|
|
55
|
+
expected: string[];
|
|
56
|
+
hits: string[];
|
|
57
|
+
topDocs: string[];
|
|
58
|
+
metrics: RetrievalMetrics;
|
|
59
|
+
latencyMs: number;
|
|
60
|
+
error?: string;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
export interface BenchModeResult {
|
|
64
|
+
name: string;
|
|
65
|
+
type: BenchModeType;
|
|
66
|
+
status: "ok" | "failed";
|
|
67
|
+
queryCount: number;
|
|
68
|
+
failures: number;
|
|
69
|
+
metrics: RetrievalMetrics;
|
|
70
|
+
latency: {
|
|
71
|
+
p50Ms: number;
|
|
72
|
+
p95Ms: number;
|
|
73
|
+
meanMs: number;
|
|
74
|
+
};
|
|
75
|
+
cases: BenchCaseResult[];
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
export interface BenchOutput {
|
|
79
|
+
fixture: {
|
|
80
|
+
path: string;
|
|
81
|
+
name?: string;
|
|
82
|
+
version: 1;
|
|
83
|
+
queryCount: number;
|
|
84
|
+
topK: number;
|
|
85
|
+
};
|
|
86
|
+
generatedAt: string;
|
|
87
|
+
modes: BenchModeResult[];
|
|
88
|
+
meta: {
|
|
89
|
+
indexName: string;
|
|
90
|
+
collection?: string;
|
|
91
|
+
};
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
export type BenchResult =
|
|
95
|
+
| { success: true; data: BenchOutput }
|
|
96
|
+
| { success: false; error: string; isValidation?: boolean };
|
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* gno bench command implementation.
|
|
3
|
+
* Runs retrieval benchmarks from user fixtures.
|
|
4
|
+
*
|
|
5
|
+
* @module src/cli/commands/bench
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type {
|
|
9
|
+
BenchCase,
|
|
10
|
+
BenchCaseResult,
|
|
11
|
+
BenchMode,
|
|
12
|
+
BenchModeResult,
|
|
13
|
+
BenchOptions,
|
|
14
|
+
BenchResult,
|
|
15
|
+
} from "../../bench/types";
|
|
16
|
+
import type { SearchResult } from "../../pipeline/types";
|
|
17
|
+
|
|
18
|
+
import { loadBenchFixture, normalizeBenchRef } from "../../bench/fixture";
|
|
19
|
+
import { averageMetrics, computeRetrievalMetrics } from "../../bench/metrics";
|
|
20
|
+
import { DEFAULT_THOROUGH_CANDIDATE_LIMIT } from "../../core/depth-policy";
|
|
21
|
+
import { query } from "./query";
|
|
22
|
+
import { search } from "./search";
|
|
23
|
+
import { vsearch } from "./vsearch";
|
|
24
|
+
|
|
25
|
+
function round(value: number, places = 2): number {
|
|
26
|
+
return Number(value.toFixed(places));
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
function summarizeLatency(values: number[]): BenchModeResult["latency"] {
|
|
30
|
+
if (values.length === 0) {
|
|
31
|
+
return { p50Ms: 0, p95Ms: 0, meanMs: 0 };
|
|
32
|
+
}
|
|
33
|
+
const sorted = [...values].sort((a, b) => a - b);
|
|
34
|
+
const percentile = (p: number): number => {
|
|
35
|
+
const index = Math.ceil((p / 100) * sorted.length) - 1;
|
|
36
|
+
return sorted[Math.max(0, Math.min(sorted.length - 1, index))] ?? 0;
|
|
37
|
+
};
|
|
38
|
+
return {
|
|
39
|
+
p50Ms: round(percentile(50)),
|
|
40
|
+
p95Ms: round(percentile(95)),
|
|
41
|
+
meanMs: round(
|
|
42
|
+
values.reduce((sum, value) => sum + value, 0) / values.length
|
|
43
|
+
),
|
|
44
|
+
};
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
function resultRefs(result: SearchResult): Set<string> {
|
|
48
|
+
return new Set(
|
|
49
|
+
[
|
|
50
|
+
result.docid,
|
|
51
|
+
result.uri,
|
|
52
|
+
normalizeBenchRef(result.uri),
|
|
53
|
+
result.source.relPath,
|
|
54
|
+
result.title,
|
|
55
|
+
].filter((value): value is string => Boolean(value))
|
|
56
|
+
);
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
function findHits(
|
|
60
|
+
results: SearchResult[],
|
|
61
|
+
expected: string[],
|
|
62
|
+
k: number
|
|
63
|
+
): string[] {
|
|
64
|
+
const hits: string[] = [];
|
|
65
|
+
const expectedSet = new Set(expected.map(normalizeBenchRef));
|
|
66
|
+
|
|
67
|
+
for (const result of results.slice(0, k)) {
|
|
68
|
+
const refs = resultRefs(result);
|
|
69
|
+
const hit = [...expectedSet].find((expectedRef) => refs.has(expectedRef));
|
|
70
|
+
if (hit && !hits.includes(hit)) {
|
|
71
|
+
hits.push(hit);
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
return hits;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
function topDocs(results: SearchResult[]): string[] {
|
|
79
|
+
return results.map((result) => result.source.relPath);
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
function rankedMetricDocs(
|
|
83
|
+
results: SearchResult[],
|
|
84
|
+
expected: string[]
|
|
85
|
+
): string[] {
|
|
86
|
+
const expectedSet = new Set(expected.map(normalizeBenchRef));
|
|
87
|
+
return results.map((result) => {
|
|
88
|
+
const refs = resultRefs(result);
|
|
89
|
+
return (
|
|
90
|
+
[...expectedSet].find((expectedRef) => refs.has(expectedRef)) ??
|
|
91
|
+
result.source.relPath
|
|
92
|
+
);
|
|
93
|
+
});
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
async function runModeCase(input: {
|
|
97
|
+
mode: BenchMode;
|
|
98
|
+
benchCase: BenchCase;
|
|
99
|
+
topK: number;
|
|
100
|
+
candidateLimit?: number;
|
|
101
|
+
options: BenchOptions;
|
|
102
|
+
}): Promise<BenchCaseResult> {
|
|
103
|
+
const { mode, benchCase, topK, options } = input;
|
|
104
|
+
const limit = mode.limit ?? topK;
|
|
105
|
+
const candidateLimit =
|
|
106
|
+
mode.candidateLimit ??
|
|
107
|
+
input.candidateLimit ??
|
|
108
|
+
(mode.depth === "thorough" ? DEFAULT_THOROUGH_CANDIDATE_LIMIT : undefined);
|
|
109
|
+
const startedAt = performance.now();
|
|
110
|
+
const queryModes = benchCase.queryModes ?? mode.queryModes;
|
|
111
|
+
let result:
|
|
112
|
+
| Awaited<ReturnType<typeof search>>
|
|
113
|
+
| Awaited<ReturnType<typeof vsearch>>
|
|
114
|
+
| Awaited<ReturnType<typeof query>>;
|
|
115
|
+
|
|
116
|
+
if (mode.type === "bm25") {
|
|
117
|
+
result = await search(benchCase.query, {
|
|
118
|
+
configPath: options.configPath,
|
|
119
|
+
indexName: options.indexName,
|
|
120
|
+
collection: benchCase.collection,
|
|
121
|
+
limit,
|
|
122
|
+
json: true,
|
|
123
|
+
});
|
|
124
|
+
} else if (mode.type === "vector") {
|
|
125
|
+
result = await vsearch(benchCase.query, {
|
|
126
|
+
configPath: options.configPath,
|
|
127
|
+
indexName: options.indexName,
|
|
128
|
+
collection: benchCase.collection,
|
|
129
|
+
limit,
|
|
130
|
+
json: true,
|
|
131
|
+
});
|
|
132
|
+
} else {
|
|
133
|
+
result = await query(benchCase.query, {
|
|
134
|
+
configPath: options.configPath,
|
|
135
|
+
indexName: options.indexName,
|
|
136
|
+
collection: benchCase.collection,
|
|
137
|
+
limit,
|
|
138
|
+
candidateLimit,
|
|
139
|
+
noExpand: mode.noExpand,
|
|
140
|
+
noRerank: mode.noRerank,
|
|
141
|
+
queryModes,
|
|
142
|
+
json: true,
|
|
143
|
+
});
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
const latencyMs = round(performance.now() - startedAt);
|
|
147
|
+
if (!result.success) {
|
|
148
|
+
return {
|
|
149
|
+
id: benchCase.id,
|
|
150
|
+
query: benchCase.query,
|
|
151
|
+
topK,
|
|
152
|
+
expected: benchCase.expected,
|
|
153
|
+
hits: [],
|
|
154
|
+
topDocs: [],
|
|
155
|
+
metrics: computeRetrievalMetrics({
|
|
156
|
+
output: [],
|
|
157
|
+
expected: benchCase.expected,
|
|
158
|
+
judgments: benchCase.judgments,
|
|
159
|
+
k: topK,
|
|
160
|
+
}),
|
|
161
|
+
latencyMs,
|
|
162
|
+
error: result.error,
|
|
163
|
+
};
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
const docs = topDocs(result.data.results);
|
|
167
|
+
const metricDocs = rankedMetricDocs(result.data.results, benchCase.expected);
|
|
168
|
+
const hits = findHits(result.data.results, benchCase.expected, topK);
|
|
169
|
+
return {
|
|
170
|
+
id: benchCase.id,
|
|
171
|
+
query: benchCase.query,
|
|
172
|
+
topK,
|
|
173
|
+
expected: benchCase.expected,
|
|
174
|
+
hits,
|
|
175
|
+
topDocs: docs.slice(0, topK),
|
|
176
|
+
metrics: computeRetrievalMetrics({
|
|
177
|
+
output: metricDocs,
|
|
178
|
+
expected: benchCase.expected,
|
|
179
|
+
judgments: benchCase.judgments,
|
|
180
|
+
k: topK,
|
|
181
|
+
}),
|
|
182
|
+
latencyMs,
|
|
183
|
+
};
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
/**
|
|
187
|
+
* Execute gno bench command.
|
|
188
|
+
*/
|
|
189
|
+
export async function bench(
|
|
190
|
+
fixturePath: string,
|
|
191
|
+
options: BenchOptions = {}
|
|
192
|
+
): Promise<BenchResult> {
|
|
193
|
+
const loaded = await loadBenchFixture(fixturePath, options);
|
|
194
|
+
if (!loaded.ok) {
|
|
195
|
+
return { success: false, error: loaded.error, isValidation: true };
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
const { fixture } = loaded;
|
|
199
|
+
const modeResults: BenchModeResult[] = [];
|
|
200
|
+
|
|
201
|
+
for (const mode of fixture.modes) {
|
|
202
|
+
const cases: BenchCaseResult[] = [];
|
|
203
|
+
for (const benchCase of fixture.queries) {
|
|
204
|
+
const topK = benchCase.topK ?? fixture.topK;
|
|
205
|
+
cases.push(
|
|
206
|
+
await runModeCase({
|
|
207
|
+
mode,
|
|
208
|
+
benchCase,
|
|
209
|
+
topK,
|
|
210
|
+
candidateLimit: fixture.candidateLimit,
|
|
211
|
+
options,
|
|
212
|
+
})
|
|
213
|
+
);
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
const failures = cases.filter((entry) => entry.error).length;
|
|
217
|
+
modeResults.push({
|
|
218
|
+
name: mode.name,
|
|
219
|
+
type: mode.type,
|
|
220
|
+
status: failures === cases.length ? "failed" : "ok",
|
|
221
|
+
queryCount: cases.length,
|
|
222
|
+
failures,
|
|
223
|
+
metrics: averageMetrics(cases.map((entry) => entry.metrics)),
|
|
224
|
+
latency: summarizeLatency(cases.map((entry) => entry.latencyMs)),
|
|
225
|
+
cases,
|
|
226
|
+
});
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
return {
|
|
230
|
+
success: true,
|
|
231
|
+
data: {
|
|
232
|
+
fixture: {
|
|
233
|
+
path: fixturePath,
|
|
234
|
+
name: fixture.metadata?.name,
|
|
235
|
+
version: fixture.version,
|
|
236
|
+
queryCount: fixture.queries.length,
|
|
237
|
+
topK: fixture.topK,
|
|
238
|
+
},
|
|
239
|
+
generatedAt: new Date().toISOString(),
|
|
240
|
+
modes: modeResults,
|
|
241
|
+
meta: {
|
|
242
|
+
indexName: options.indexName ?? "default",
|
|
243
|
+
collection: fixture.collection,
|
|
244
|
+
},
|
|
245
|
+
},
|
|
246
|
+
};
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
export function formatBench(
|
|
250
|
+
result: BenchResult,
|
|
251
|
+
options: { json?: boolean }
|
|
252
|
+
): string {
|
|
253
|
+
if (!result.success) {
|
|
254
|
+
return options.json
|
|
255
|
+
? JSON.stringify({
|
|
256
|
+
error: { code: "BENCH_FAILED", message: result.error },
|
|
257
|
+
})
|
|
258
|
+
: `Error: ${result.error}`;
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
if (options.json) {
|
|
262
|
+
return JSON.stringify(result.data, null, 2);
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
const lines = [
|
|
266
|
+
`Bench: ${result.data.fixture.name ?? result.data.fixture.path}`,
|
|
267
|
+
`Queries: ${result.data.fixture.queryCount} Top K: ${result.data.fixture.topK}`,
|
|
268
|
+
"",
|
|
269
|
+
"| Mode | Status | Precision@K | Recall@K | F1@K | MRR | nDCG@K | p95 ms | Failures |",
|
|
270
|
+
"| ---- | ------ | ----------- | -------- | ---- | --- | ------ | ------ | -------- |",
|
|
271
|
+
];
|
|
272
|
+
|
|
273
|
+
for (const mode of result.data.modes) {
|
|
274
|
+
lines.push(
|
|
275
|
+
`| ${mode.name} | ${mode.status} | ${mode.metrics.precisionAtK.toFixed(3)} | ${mode.metrics.recallAtK.toFixed(3)} | ${mode.metrics.f1AtK.toFixed(3)} | ${mode.metrics.mrr.toFixed(3)} | ${mode.metrics.ndcgAtK.toFixed(3)} | ${mode.latency.p95Ms.toFixed(2)} | ${mode.failures} |`
|
|
276
|
+
);
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
return lines.join("\n");
|
|
280
|
+
}
|
package/src/cli/options.ts
CHANGED
|
@@ -22,6 +22,7 @@ export const CMD = {
|
|
|
22
22
|
search: "search",
|
|
23
23
|
vsearch: "vsearch",
|
|
24
24
|
query: "query",
|
|
25
|
+
bench: "bench",
|
|
25
26
|
ask: "ask",
|
|
26
27
|
get: "get",
|
|
27
28
|
multiGet: "multi-get",
|
|
@@ -45,6 +46,7 @@ const FORMAT_SUPPORT: Record<CommandId, OutputFormat[]> = {
|
|
|
45
46
|
[CMD.search]: ["terminal", "json", "files", "csv", "md", "xml"],
|
|
46
47
|
[CMD.vsearch]: ["terminal", "json", "files", "csv", "md", "xml"],
|
|
47
48
|
[CMD.query]: ["terminal", "json", "files", "csv", "md", "xml"],
|
|
49
|
+
[CMD.bench]: ["terminal", "json"],
|
|
48
50
|
[CMD.ask]: ["terminal", "json", "md"],
|
|
49
51
|
[CMD.get]: ["terminal", "json", "md"],
|
|
50
52
|
[CMD.multiGet]: ["terminal", "json", "files", "md"],
|
package/src/cli/program.ts
CHANGED
|
@@ -677,6 +677,58 @@ function wireSearchCommands(program: Command): void {
|
|
|
677
677
|
await writeOutput(output, format);
|
|
678
678
|
});
|
|
679
679
|
|
|
680
|
+
// bench - Retrieval benchmark fixture runner
|
|
681
|
+
program
|
|
682
|
+
.command("bench <fixture>")
|
|
683
|
+
.description("Run retrieval quality benchmarks from a fixture")
|
|
684
|
+
.option("-c, --collection <name>", "override fixture collection")
|
|
685
|
+
.option("-k, --top-k <num>", "override top-k metric cutoff")
|
|
686
|
+
.option(
|
|
687
|
+
"--mode <name>",
|
|
688
|
+
"benchmark mode (repeatable): bm25, vector, hybrid, fast, no-rerank, thorough",
|
|
689
|
+
(value: string, previous: string[] = []) => [...previous, value],
|
|
690
|
+
[]
|
|
691
|
+
)
|
|
692
|
+
.option("-C, --candidate-limit <num>", "max candidates passed to reranking")
|
|
693
|
+
.option("--json", "JSON output")
|
|
694
|
+
.action(async (fixture: string, cmdOpts: Record<string, unknown>) => {
|
|
695
|
+
const format = getFormat(cmdOpts);
|
|
696
|
+
assertFormatSupported(CMD.bench, format);
|
|
697
|
+
const globals = getGlobals();
|
|
698
|
+
const topK = cmdOpts.topK
|
|
699
|
+
? parsePositiveInt("top-k", cmdOpts.topK)
|
|
700
|
+
: undefined;
|
|
701
|
+
const candidateLimit = cmdOpts.candidateLimit
|
|
702
|
+
? parsePositiveInt("candidate-limit", cmdOpts.candidateLimit)
|
|
703
|
+
: undefined;
|
|
704
|
+
|
|
705
|
+
const { bench, formatBench } = await import("./commands/bench");
|
|
706
|
+
const result = await bench(fixture, {
|
|
707
|
+
configPath: globals.config,
|
|
708
|
+
indexName: globals.index,
|
|
709
|
+
collection: cmdOpts.collection as string | undefined,
|
|
710
|
+
topK,
|
|
711
|
+
candidateLimit,
|
|
712
|
+
modes:
|
|
713
|
+
Array.isArray(cmdOpts.mode) && cmdOpts.mode.length > 0
|
|
714
|
+
? (cmdOpts.mode as string[])
|
|
715
|
+
: undefined,
|
|
716
|
+
json: format === "json",
|
|
717
|
+
});
|
|
718
|
+
|
|
719
|
+
if (!result.success) {
|
|
720
|
+
throw new CliError(
|
|
721
|
+
result.isValidation ? "VALIDATION" : "RUNTIME",
|
|
722
|
+
result.error
|
|
723
|
+
);
|
|
724
|
+
}
|
|
725
|
+
|
|
726
|
+
await writeOutput(
|
|
727
|
+
formatBench(result, { json: format === "json" }),
|
|
728
|
+
format
|
|
729
|
+
);
|
|
730
|
+
});
|
|
731
|
+
|
|
680
732
|
// ask - Human-friendly query with grounded answer
|
|
681
733
|
program
|
|
682
734
|
.command("ask <query>")
|