@gmickel/gno 1.3.1 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/assets/skill/SKILL.md +12 -0
- package/package.json +68 -65
- package/src/bench/fixture.ts +247 -0
- package/src/bench/metrics.ts +137 -0
- package/src/bench/types.ts +96 -0
- package/src/cli/commands/bench.ts +280 -0
- package/src/cli/commands/doctor.ts +4 -1
- package/src/cli/options.ts +2 -0
- package/src/cli/program.ts +52 -0
- package/src/mcp/tools/index.ts +53 -21
- package/src/serve/public/globals.built.css +2 -2
- package/src/serve/server.ts +2 -2
- package/src/types/wasm.d.ts +4 -0
package/README.md
CHANGED
|
@@ -880,11 +880,13 @@ bun run lint && bun run typecheck
|
|
|
880
880
|
Use retrieval benchmark commands to track quality and latency over time:
|
|
881
881
|
|
|
882
882
|
```bash
|
|
883
|
+
gno bench docs/examples/bench-fixture.json
|
|
883
884
|
bun run eval:hybrid
|
|
884
885
|
bun run eval:hybrid:baseline
|
|
885
886
|
bun run eval:hybrid:delta
|
|
886
887
|
```
|
|
887
888
|
|
|
889
|
+
- Public fixture runner: `gno bench <fixture.json>` reports Precision@K, Recall@K, F1@K, MRR, nDCG@K, and latency across BM25/vector/hybrid modes.
|
|
888
890
|
- Benchmark guide: [evals/README.md](./evals/README.md)
|
|
889
891
|
- Latest baseline snapshot: [evals/fixtures/hybrid-baseline/latest.json](./evals/fixtures/hybrid-baseline/latest.json)
|
|
890
892
|
|
package/assets/skill/SKILL.md
CHANGED
|
@@ -131,6 +131,18 @@ gno query "auth" --json | jq -r '.results[0].uri' | xargs gno get
|
|
|
131
131
|
gno search "error handling" --json | jq -r '.results[].uri' | xargs gno multi-get
|
|
132
132
|
```
|
|
133
133
|
|
|
134
|
+
## MCP Retrieval Strategy
|
|
135
|
+
|
|
136
|
+
When using GNO through MCP, prefer `gno_query` first for normal questions. It returns snippets plus `uri`, `docid`, and often `line`; follow with `gno_get` using `fromLine`/`lineCount` for a bounded read, or `gno_multi_get` to batch top result refs.
|
|
137
|
+
|
|
138
|
+
Use narrower tools when the request tells you to:
|
|
139
|
+
|
|
140
|
+
- `gno_search`: exact phrase, filename, identifier, stack trace, error text
|
|
141
|
+
- `gno_vsearch`: conceptual similarity when exact wording differs
|
|
142
|
+
- `gno_status`: stale results, missing embeddings, vector unavailable
|
|
143
|
+
|
|
144
|
+
For ambiguous terms, pass `intent` instead of bloating the query text. For typed retrieval, use `queryModes`: `term` for lexical anchors, `intent` for disambiguation, one `hyde` for a hypothetical answer/document.
|
|
145
|
+
|
|
134
146
|
## Document Links & Similarity
|
|
135
147
|
|
|
136
148
|
```bash
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gmickel/gno",
|
|
3
|
-
"version": "1.3.1",
|
|
3
|
+
"version": "1.4.1",
|
|
4
4
|
"description": "Local semantic search for your documents. Index Markdown, PDF, and Office files with hybrid BM25 + vector search.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"embeddings",
|
|
@@ -69,6 +69,7 @@
|
|
|
69
69
|
"eval:hybrid": "bun --bun evalite evals/hybrid.eval.ts",
|
|
70
70
|
"eval:hybrid:baseline": "bun scripts/hybrid-benchmark.ts --write",
|
|
71
71
|
"eval:hybrid:delta": "bun scripts/hybrid-benchmark.ts --delta",
|
|
72
|
+
"bench:ast-chunking": "bun scripts/ast-chunking-benchmark.ts",
|
|
72
73
|
"bench:code-embeddings": "bun scripts/code-embedding-benchmark.ts",
|
|
73
74
|
"bench:code-embeddings:write": "bun scripts/code-embedding-benchmark.ts --write",
|
|
74
75
|
"bench:general-embeddings": "bun scripts/general-embedding-benchmark.ts",
|
|
@@ -121,7 +122,7 @@
|
|
|
121
122
|
"website:og": "bun scripts/og-screenshots.ts",
|
|
122
123
|
"website:sync-assets": "bun scripts/sync-assets.ts",
|
|
123
124
|
"sync:agents": "scripts/sync-agents.sh",
|
|
124
|
-
"build:css": "
|
|
125
|
+
"build:css": "tailwindcss -i src/serve/public/globals.css -o src/serve/public/globals.built.css --minify",
|
|
125
126
|
"serve": "bun src/index.ts serve",
|
|
126
127
|
"serve:dev": "NODE_ENV=development bun --hot src/index.ts serve",
|
|
127
128
|
"version:patch": "npm version patch --no-git-tag-version",
|
|
@@ -133,73 +134,75 @@
|
|
|
133
134
|
"prepare": "lefthook install"
|
|
134
135
|
},
|
|
135
136
|
"dependencies": {
|
|
136
|
-
"@codemirror/lang-markdown": "
|
|
137
|
-
"@codemirror/theme-one-dark": "
|
|
138
|
-
"@modelcontextprotocol/sdk": "
|
|
139
|
-
"@radix-ui/react-collapsible": "
|
|
140
|
-
"@radix-ui/react-dialog": "
|
|
141
|
-
"@radix-ui/react-dropdown-menu": "
|
|
142
|
-
"@radix-ui/react-hover-card": "
|
|
143
|
-
"@radix-ui/react-progress": "
|
|
144
|
-
"@radix-ui/react-scroll-area": "
|
|
145
|
-
"@radix-ui/react-select": "
|
|
146
|
-
"@radix-ui/react-separator": "
|
|
147
|
-
"@radix-ui/react-slot": "
|
|
148
|
-
"@radix-ui/react-tooltip": "
|
|
149
|
-
"ai": "
|
|
150
|
-
"bun-plugin-tailwind": "
|
|
151
|
-
"class-variance-authority": "
|
|
152
|
-
"clsx": "
|
|
153
|
-
"cmdk": "
|
|
154
|
-
"codemirror": "
|
|
155
|
-
"commander": "
|
|
156
|
-
"embla-carousel-react": "
|
|
157
|
-
"franc": "
|
|
158
|
-
"lucide-react": "
|
|
159
|
-
"markitdown-ts": "
|
|
160
|
-
"minimatch": "
|
|
161
|
-
"nanoid": "
|
|
162
|
-
"node-llama-cpp": "
|
|
163
|
-
"officeparser": "
|
|
164
|
-
"picocolors": "
|
|
165
|
-
"react": "
|
|
166
|
-
"react-dom": "
|
|
167
|
-
"react-force-graph-2d": "
|
|
168
|
-
"react-markdown": "
|
|
169
|
-
"rehype-sanitize": "
|
|
170
|
-
"remark-gfm": "
|
|
171
|
-
"shiki": "
|
|
172
|
-
"sqlite-vec": "
|
|
173
|
-
"streamdown": "
|
|
174
|
-
"tailwind-merge": "
|
|
175
|
-
"tailwindcss": "
|
|
176
|
-
"use-stick-to-bottom": "
|
|
177
|
-
"zod": "
|
|
137
|
+
"@codemirror/lang-markdown": "6.5.0",
|
|
138
|
+
"@codemirror/theme-one-dark": "6.1.3",
|
|
139
|
+
"@modelcontextprotocol/sdk": "1.27.1",
|
|
140
|
+
"@radix-ui/react-collapsible": "1.1.12",
|
|
141
|
+
"@radix-ui/react-dialog": "1.1.15",
|
|
142
|
+
"@radix-ui/react-dropdown-menu": "2.1.16",
|
|
143
|
+
"@radix-ui/react-hover-card": "1.1.15",
|
|
144
|
+
"@radix-ui/react-progress": "1.1.8",
|
|
145
|
+
"@radix-ui/react-scroll-area": "1.2.10",
|
|
146
|
+
"@radix-ui/react-select": "2.2.6",
|
|
147
|
+
"@radix-ui/react-separator": "1.1.8",
|
|
148
|
+
"@radix-ui/react-slot": "1.2.4",
|
|
149
|
+
"@radix-ui/react-tooltip": "1.2.8",
|
|
150
|
+
"ai": "6.0.68",
|
|
151
|
+
"bun-plugin-tailwind": "0.1.2",
|
|
152
|
+
"class-variance-authority": "0.7.1",
|
|
153
|
+
"clsx": "2.1.1",
|
|
154
|
+
"cmdk": "1.1.1",
|
|
155
|
+
"codemirror": "6.0.2",
|
|
156
|
+
"commander": "14.0.3",
|
|
157
|
+
"embla-carousel-react": "8.6.0",
|
|
158
|
+
"franc": "6.2.0",
|
|
159
|
+
"lucide-react": "1.8.0",
|
|
160
|
+
"markitdown-ts": "0.0.9",
|
|
161
|
+
"minimatch": "10.1.1",
|
|
162
|
+
"nanoid": "5.1.6",
|
|
163
|
+
"node-llama-cpp": "3.18.1",
|
|
164
|
+
"officeparser": "6.0.4",
|
|
165
|
+
"picocolors": "1.1.1",
|
|
166
|
+
"react": "19.2.4",
|
|
167
|
+
"react-dom": "19.2.4",
|
|
168
|
+
"react-force-graph-2d": "1.29.0",
|
|
169
|
+
"react-markdown": "10.1.0",
|
|
170
|
+
"rehype-sanitize": "6.0.0",
|
|
171
|
+
"remark-gfm": "4.0.1",
|
|
172
|
+
"shiki": "4.0.2",
|
|
173
|
+
"sqlite-vec": "0.1.9",
|
|
174
|
+
"streamdown": "2.1.0",
|
|
175
|
+
"tailwind-merge": "3.4.0",
|
|
176
|
+
"tailwindcss": "4.1.18",
|
|
177
|
+
"use-stick-to-bottom": "1.1.2",
|
|
178
|
+
"zod": "4.3.6"
|
|
178
179
|
},
|
|
179
180
|
"devDependencies": {
|
|
180
|
-
"@ai-sdk/openai": "
|
|
181
|
+
"@ai-sdk/openai": "3.0.25",
|
|
181
182
|
"@biomejs/biome": "2.3.14",
|
|
182
|
-
"@tailwindcss/cli": "
|
|
183
|
-
"@testing-library/react": "
|
|
184
|
-
"@testing-library/user-event": "
|
|
185
|
-
"@types/bun": "
|
|
186
|
-
"@types/react": "
|
|
187
|
-
"@types/react-dom": "
|
|
188
|
-
"
|
|
189
|
-
"ajv
|
|
190
|
-
"
|
|
191
|
-
"
|
|
192
|
-
"
|
|
193
|
-
"
|
|
194
|
-
"
|
|
195
|
-
"
|
|
196
|
-
"
|
|
197
|
-
"oxlint
|
|
198
|
-
"
|
|
199
|
-
"
|
|
200
|
-
"
|
|
183
|
+
"@tailwindcss/cli": "4.1.18",
|
|
184
|
+
"@testing-library/react": "16.3.2",
|
|
185
|
+
"@testing-library/user-event": "14.6.1",
|
|
186
|
+
"@types/bun": "1.3.8",
|
|
187
|
+
"@types/react": "19.2.14",
|
|
188
|
+
"@types/react-dom": "19.2.3",
|
|
189
|
+
"@vscode/tree-sitter-wasm": "0.3.1",
|
|
190
|
+
"ajv": "8.17.1",
|
|
191
|
+
"ajv-formats": "3.0.1",
|
|
192
|
+
"docx": "9.5.1",
|
|
193
|
+
"evalite": "1.0.0-beta.16",
|
|
194
|
+
"exceljs": "4.4.0",
|
|
195
|
+
"happy-dom": "20.8.9",
|
|
196
|
+
"lefthook": "2.1.4",
|
|
197
|
+
"oxfmt": "0.28.0",
|
|
198
|
+
"oxlint": "1.43.0",
|
|
199
|
+
"oxlint-tsgolint": "0.11.5",
|
|
200
|
+
"pdf-lib": "1.17.1",
|
|
201
|
+
"playwright": "1.58.2",
|
|
202
|
+
"pptxgenjs": "4.0.1",
|
|
201
203
|
"ultracite": "7.1.5",
|
|
202
|
-
"vitest": "
|
|
204
|
+
"vitest": "4.0.18",
|
|
205
|
+
"web-tree-sitter": "0.26.8"
|
|
203
206
|
},
|
|
204
207
|
"peerDependencies": {
|
|
205
208
|
"typescript": "^5"
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
|
|
3
|
+
import type { BenchFixture, BenchMode, BenchOptions } from "./types";
|
|
4
|
+
|
|
5
|
+
/**
 * Mode names accepted both as plain strings in a fixture's `modes` list and
 * by `parseModeFlag`. The order here is the order shown in the
 * "Unsupported bench mode" error message.
 */
const MODE_ALIASES = ["bm25", "vector", "hybrid", "fast", "no-rerank", "thorough"] as const;

/** Union of the literal strings listed in `MODE_ALIASES`. */
type BenchModeAlias = (typeof MODE_ALIASES)[number];
|
|
15
|
+
|
|
16
|
+
/**
 * Schema for one typed query entry: a `term`, `intent`, or `hyde` mode
 * together with its text.
 */
const queryModeInputSchema = z.object({
  mode: z.enum(["term", "intent", "hyde"]),
  // Trimmed before the length check, so whitespace-only text is rejected.
  text: z.string().trim().min(1),
});
|
|
20
|
+
|
|
21
|
+
/**
 * Schema for the object form of a fixture mode entry. Every field is
 * optional; `normalizeModeInput` fills in the gaps (alias preset first, then
 * a `hybrid` default for the type).
 */
const modeObjectSchema = z.object({
  // Display name; when absent the alias or the resolved type is used instead.
  name: z.string().trim().min(1).optional(),
  // Explicit retrieval type; takes precedence over the type implied by `mode`.
  type: z.enum(["bm25", "vector", "hybrid"]).optional(),
  // Alias whose preset supplies defaults for the remaining flags.
  mode: z.enum(MODE_ALIASES).optional(),
  noExpand: z.boolean().optional(),
  noRerank: z.boolean().optional(),
  // Both limits must be positive integers when given.
  candidateLimit: z.number().int().positive().optional(),
  limit: z.number().int().positive().optional(),
  queryModes: z.array(queryModeInputSchema).optional(),
});
|
|
31
|
+
|
|
32
|
+
/**
 * One graded relevance judgment. The document may be referenced through any
 * of `docid`, `doc`, or `uri`; `relevance` must be a non-negative number.
 */
const judgmentInputSchema = z.object({
  docid: z.string().trim().min(1).optional(),
  doc: z.string().trim().min(1).optional(),
  uri: z.string().trim().min(1).optional(),
  relevance: z.number().min(0),
});

/**
 * One benchmark query: a non-blank id and query text plus optional expected
 * references, graded judgments, and per-query overrides.
 */
const benchQueryInputSchema = z.object({
  id: z.string().trim().min(1),
  query: z.string().trim().min(1),
  expected: z.array(z.string().trim().min(1)).optional(),
  expectedDocuments: z.array(z.string().trim().min(1)).optional(),
  expectedUris: z.array(z.string().trim().min(1)).optional(),
  judgments: z.array(judgmentInputSchema).optional(),
  collection: z.string().trim().min(1).optional(),
  topK: z.number().int().positive().optional(),
  queryModes: z.array(queryModeInputSchema).optional(),
});

/**
 * Schema for a version-1 bench fixture file. Only `version` and a non-empty
 * `queries` array are required.
 */
const fixtureSchema = z.object({
  version: z.literal(1),
  metadata: z
    .object({
      name: z.string().optional(),
      description: z.string().optional(),
      tags: z.array(z.string()).optional(),
    })
    .optional(),
  collection: z.string().trim().min(1).optional(),
  topK: z.number().int().positive().optional(),
  candidateLimit: z.number().int().positive().optional(),
  // Each entry is either a bare alias string or a mode object.
  modes: z.array(z.union([z.enum(MODE_ALIASES), modeObjectSchema])).optional(),
  queries: z.array(benchQueryInputSchema).min(1),
});
|
|
70
|
+
|
|
71
|
+
/** Parsed fixture, as inferred from `fixtureSchema`. */
type ParsedFixtureModes = z.infer<typeof fixtureSchema>["modes"];

/** A single entry of a parsed fixture's `modes` array (alias string or mode object). */
type FixtureModeInput = NonNullable<ParsedFixtureModes>[number];
|
|
74
|
+
|
|
75
|
+
/**
 * Canonicalise a document reference for comparison: surrounding whitespace
 * is trimmed and everything from the first `?` onward is dropped.
 *
 * @param value - Raw reference string.
 * @returns The trimmed reference without its `?...` suffix.
 */
export function normalizeBenchRef(value: string): string {
  // A limit of 1 keeps only the part before the first "?".
  const [ref = ""] = value.trim().split("?", 1);
  return ref;
}
|
|
80
|
+
|
|
81
|
+
/**
 * Expand a mode alias into its full `BenchMode` preset.
 *
 * `bm25` and `vector` map to the retrieval type of the same name; every other
 * alias is a `hybrid` run with alias-specific flags. A fresh object is
 * returned on each call.
 *
 * @param alias - One of the names in `MODE_ALIASES`.
 * @returns The preset for that alias, named after the alias.
 */
function normalizeMode(alias: BenchModeAlias): BenchMode {
  // Every preset is named after its alias; only type and flags differ.
  const name: string = alias;
  switch (alias) {
    case "bm25":
    case "vector":
      return { name, type: alias };
    case "hybrid":
      return { name, type: "hybrid" };
    case "no-rerank":
      return { name, type: "hybrid", noRerank: true };
    case "fast":
      return { name, type: "hybrid", noExpand: true, noRerank: true };
    case "thorough":
      return { name, type: "hybrid", depth: "thorough" };
  }
}
|
|
102
|
+
|
|
103
|
+
/**
 * Turn one fixture `modes` entry into a `BenchMode`.
 *
 * A string entry is treated as an alias. For an object entry the alias preset
 * (if `mode` is set) provides defaults, and explicit fields win:
 * - type: `input.type`, else the preset's type, else `"hybrid"`
 * - name: `input.name`, else the alias, else the resolved type
 * - `noExpand` / `noRerank`: explicit value, else the preset's
 * - `candidateLimit`, `limit`, `queryModes`: taken from the entry only
 *
 * @param input - Alias string or mode object from the parsed fixture.
 * @returns The normalized mode.
 */
function normalizeModeInput(input: FixtureModeInput): BenchMode {
  if (typeof input === "string") {
    return normalizeMode(input as BenchModeAlias);
  }

  const { mode, name, type, noExpand, noRerank, candidateLimit, limit, queryModes } = input;
  const preset = mode ? normalizeMode(mode) : undefined;
  const resolvedType = type ?? preset?.type ?? "hybrid";

  return {
    ...preset,
    name: name ?? mode ?? resolvedType,
    type: resolvedType,
    depth: preset?.depth,
    noExpand: noExpand ?? preset?.noExpand,
    noRerank: noRerank ?? preset?.noRerank,
    candidateLimit,
    limit,
    queryModes,
  };
}
|
|
123
|
+
|
|
124
|
+
/**
 * Validate a mode name supplied as an option and expand it to its preset.
 *
 * The value is trimmed before matching; matching is exact (case-sensitive)
 * against `MODE_ALIASES`.
 *
 * @param mode - Raw mode string.
 * @returns `{ ok: true, value }` with the preset, or `{ ok: false, error }`
 *   with a message that echoes the raw input and lists the supported names.
 */
function parseModeFlag(
  mode: string
): { ok: true; value: BenchMode } | { ok: false; error: string } {
  const candidate = mode.trim();
  // Looking the alias up (rather than casting first) keeps the narrowing honest.
  const alias = MODE_ALIASES.find((known) => known === candidate);

  if (alias === undefined) {
    return {
      ok: false,
      error: `Unsupported bench mode: ${mode}. Supported: ${MODE_ALIASES.join(", ")}`,
    };
  }

  return { ok: true, value: normalizeMode(alias) };
}
|
|
136
|
+
|
|
137
|
+
/**
 * Decide which modes a bench run uses.
 *
 * A non-empty `optionModes` list replaces the fixture's modes entirely;
 * otherwise the fixture's `modes` are used, falling back to a single `bm25`
 * mode when the fixture declares none.
 *
 * @param fixtureModes - `modes` from the parsed fixture (may be undefined).
 * @param optionModes - Mode names supplied through options.
 * @returns The normalized modes, in input order.
 * @throws Error when an option mode name is not a supported alias.
 */
function normalizeModes(
  fixtureModes: z.infer<typeof fixtureSchema>["modes"],
  optionModes?: string[]
): BenchMode[] {
  const hasOverrides = Boolean(optionModes?.length);

  if (!hasOverrides || !optionModes) {
    return (fixtureModes ?? ["bm25"]).map(normalizeModeInput);
  }

  return optionModes.map((flag) => {
    const outcome = parseModeFlag(flag);
    if (outcome.ok) {
      return outcome.value;
    }
    throw new Error(outcome.error);
  });
}
|
|
153
|
+
|
|
154
|
+
/**
 * Merge a schema-validated fixture with run options into the `BenchFixture`
 * shape.
 *
 * Precedence:
 * - `topK`: options, then fixture, then 10.
 * - `candidateLimit`: options, then fixture.
 * - `collection` (fixture level): options, then fixture.
 * - `collection` (per query): options, then the query's own, then the fixture's.
 *
 * Per query, all references pass through `normalizeBenchRef`. The `expected`
 * list is the union of `expected`, `expectedDocuments`, and `expectedUris`;
 * when none are given it is derived from the judgments. Two corrections over
 * a plain concatenation:
 * - references are de-duplicated (first occurrence wins), so a document named
 *   in more than one list is not counted twice by the metrics;
 * - only judgments with `relevance > 0` are used to derive `expected`, so a
 *   document explicitly judged non-relevant is never treated as a hit.
 *
 * Judgments that carry none of `docid` / `doc` / `uri` are dropped.
 *
 * @param parsed - Output of `fixtureSchema` parsing.
 * @param options - Run options; defined fields override fixture values.
 * @returns The normalized fixture.
 * @throws Error when `options.modes` contains an unsupported mode name.
 */
function normalizeFixture(
  parsed: z.infer<typeof fixtureSchema>,
  options: BenchOptions
): BenchFixture {
  const modes = normalizeModes(parsed.modes, options.modes);
  const topK = options.topK ?? parsed.topK ?? 10;
  const candidateLimit = options.candidateLimit ?? parsed.candidateLimit;

  return {
    version: parsed.version,
    metadata: parsed.metadata,
    collection: options.collection ?? parsed.collection,
    topK,
    candidateLimit,
    modes,
    queries: parsed.queries.map((entry) => {
      const explicitExpected = [
        ...(entry.expected ?? []),
        ...(entry.expectedDocuments ?? []),
        ...(entry.expectedUris ?? []),
      ].map((ref) => normalizeBenchRef(ref));

      const judgments =
        entry.judgments?.flatMap((judgment) => {
          const docid = judgment.docid ?? judgment.doc ?? judgment.uri;
          return docid
            ? [
                {
                  docid: normalizeBenchRef(docid),
                  relevance: judgment.relevance,
                },
              ]
            : [];
        }) ?? [];

      // A relevance of 0 marks a document as non-relevant; it must not become
      // an expected hit when `expected` is derived from judgments.
      const judgedRelevant = judgments
        .filter((judgment) => judgment.relevance > 0)
        .map((judgment) => judgment.docid);

      // Set preserves first-seen order while removing repeated references.
      const expected = [
        ...new Set(explicitExpected.length > 0 ? explicitExpected : judgedRelevant),
      ];

      return {
        id: entry.id,
        query: entry.query,
        expected,
        judgments,
        collection: options.collection ?? entry.collection ?? parsed.collection,
        topK: entry.topK,
        queryModes: entry.queryModes,
      };
    }),
  };
}
|
|
204
|
+
|
|
205
|
+
/**
 * Read, validate, and normalize a bench fixture file.
 *
 * Failures are reported through the result rather than thrown:
 * - the file does not exist,
 * - the file is not valid JSON,
 * - the JSON does not satisfy `fixtureSchema`,
 * - normalization throws (for example an unsupported option mode),
 * - a query ends up with an empty `expected` list.
 *
 * @param fixturePath - Path of the fixture JSON file.
 * @param options - Run options applied on top of the fixture.
 * @returns `{ ok: true, fixture }` or `{ ok: false, error }`.
 */
export async function loadBenchFixture(
  fixturePath: string,
  options: BenchOptions
): Promise<{ ok: true; fixture: BenchFixture } | { ok: false; error: string }> {
  const messageOf = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);
  const failure = (error: string): { ok: false; error: string } => ({
    ok: false,
    error,
  });

  const file = Bun.file(fixturePath);
  const exists = await file.exists();
  if (!exists) {
    return failure(`Fixture not found: ${fixturePath}`);
  }

  let raw: unknown;
  try {
    const text = await file.text();
    raw = JSON.parse(text);
  } catch (error) {
    return failure(`Invalid JSON fixture: ${messageOf(error)}`);
  }

  const parsed = fixtureSchema.safeParse(raw);
  if (!parsed.success) {
    return failure(z.prettifyError(parsed.error));
  }

  try {
    const fixture = normalizeFixture(parsed.data, options);
    // Metrics need at least one expected document per query.
    const unlabeled = fixture.queries.find((entry) => entry.expected.length === 0);
    if (unlabeled) {
      return failure(
        `Bench query "${unlabeled.id}" must define expected documents, expected URIs, or judgments`
      );
    }
    return { ok: true, fixture };
  } catch (error) {
    return failure(messageOf(error));
  }
}
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Retrieval benchmark metric helpers.
|
|
3
|
+
*
|
|
4
|
+
* @module src/bench/metrics
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
/** Graded relevance of one document for one query. */
export interface RelevanceJudgment {
  /** Reference of the judged document, compared by string equality with retrieved ids. */
  docid: string;
  /** Relevance grade; `computeNdcg` uses it as the exponent in the gain `2^relevance - 1`. */
  relevance: number;
}
|
|
11
|
+
|
|
12
|
+
/** The set of retrieval quality scores reported per query and per mode. */
export interface RetrievalMetrics {
  /** Precision@K. */
  precisionAtK: number;
  /** Recall@K. */
  recallAtK: number;
  /** F1@K, derived from precision and recall. */
  f1AtK: number;
  /** Reciprocal rank of the first relevant result (mean of those when averaged). */
  mrr: number;
  /** Normalized discounted cumulative gain at K. */
  ndcgAtK: number;
}
|
|
19
|
+
|
|
20
|
+
/**
 * Round a number to a fixed count of decimal places.
 *
 * @param value - Number to round.
 * @param places - Decimal places to keep (default 4).
 * @returns `value` formatted with `toFixed(places)` and parsed back to a number.
 */
function round(value: number, places = 4): number {
  const fixed = value.toFixed(places);
  return Number.parseFloat(fixed);
}
|
|
23
|
+
|
|
24
|
+
/**
 * Compute Precision@K: the fraction of the top-K slots filled by relevant docs.
 *
 * The denominator is always `k`, so a result list shorter than `k` is
 * penalised. A relevant document that appears more than once in the top K is
 * counted a single time; the repeated slot contributes nothing.
 *
 * @param output - Retrieved document ids in rank order.
 * @param expected - Ids of the relevant documents.
 * @param k - Cut-off rank.
 * @returns A value in [0, 1]; 0 when `k` is not a positive number.
 */
export function computePrecision(
  output: string[],
  expected: string[],
  k: number
): number {
  // `!(k > 0)` also rejects NaN, which `k <= 0` would let through.
  if (!(k > 0)) {
    return 0;
  }
  const expectedSet = new Set(expected);
  const distinctHits = new Set<string>();
  for (const docid of output.slice(0, k)) {
    if (expectedSet.has(docid)) {
      distinctHits.add(docid);
    }
  }
  return distinctHits.size / k;
}
|
|
39
|
+
|
|
40
|
+
/**
 * Compute Recall@K: the fraction of relevant docs found in the top K results.
 *
 * `expected` is treated as a set, so a reference listed twice counts once in
 * both numerator and denominator. A non-positive `k` means an empty top-K
 * (a negative value is not passed to `slice`, where it would mean
 * "all but the last N").
 *
 * @param output - Retrieved document ids in rank order.
 * @param expected - Ids of the relevant documents.
 * @param k - Cut-off rank.
 * @returns A value in [0, 1]; 1 when there are no expected documents.
 */
export function computeRecall(
  output: string[],
  expected: string[],
  k: number
): number {
  const relevant = new Set(expected);
  if (relevant.size === 0) return 1;

  const topK = new Set(output.slice(0, Math.max(0, k)));
  let hits = 0;
  for (const docid of relevant) {
    if (topK.has(docid)) {
      hits += 1;
    }
  }
  return hits / relevant.size;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Compute F1@K, the harmonic mean of precision and recall.
 *
 * @param precision - Precision@K.
 * @param recall - Recall@K.
 * @returns `2PR / (P + R)`, or 0 when both inputs are exactly 0.
 */
export function computeF1(precision: number, recall: number): number {
  // Guard the 0/0 case; any other input goes through the formula unchanged.
  const bothZero = precision === 0 && recall === 0;
  return bothZero ? 0 : (2 * precision * recall) / (precision + recall);
}
|
|
63
|
+
|
|
64
|
+
/**
 * Compute nDCG@K: normalized discounted cumulative gain.
 *
 * Gain is `2^relevance - 1` and the discount at zero-based rank `i` is
 * `log2(i + 2)`.
 *
 * Judgments are collapsed to one grade per document (the last judgment for a
 * docid wins) and that same collapsed set builds the ideal ranking, so the
 * ideal is always attainable. A document repeated in `output` earns its gain
 * only at its first position; later repeats occupy a rank but add nothing,
 * which keeps the score at or below 1 for non-negative grades.
 *
 * @param output - Retrieved document ids in rank order.
 * @param judgments - Graded relevance judgments.
 * @param k - Cut-off rank; non-positive values mean an empty cut-off.
 * @returns DCG / IDCG, or 1 when there are no judgments or the ideal DCG is 0.
 */
export function computeNdcg(
  output: string[],
  judgments: RelevanceJudgment[],
  k: number
): number {
  if (judgments.length === 0) return 1;

  // Negative k would make slice() drop items from the end instead.
  const limit = Math.max(0, k);
  const gain = (relevance: number): number => 2 ** relevance - 1;
  const discount = (rank: number): number => Math.log2(rank + 2);

  const relMap = new Map<string, number>(
    judgments.map((j) => [j.docid, j.relevance] as const)
  );

  const credited = new Set<string>();
  let dcg = 0;
  output.slice(0, limit).forEach((docid, rank) => {
    if (credited.has(docid)) {
      return;
    }
    credited.add(docid);
    dcg += gain(relMap.get(docid) ?? 0) / discount(rank);
  });

  const idcg = [...relMap.values()]
    .sort((a, b) => b - a)
    .slice(0, limit)
    .reduce((sum, relevance, rank) => sum + gain(relevance) / discount(rank), 0);

  return idcg > 0 ? dcg / idcg : 1;
}
|
|
84
|
+
|
|
85
|
+
/**
 * Reciprocal rank for a single query (the per-query term of MRR).
 *
 * The whole `output` list is scanned; there is no cut-off.
 *
 * @param output - Retrieved document ids in rank order.
 * @param expected - Ids of the relevant documents.
 * @returns `1 / rank` of the first relevant result (rank is 1-based), 0 when
 *   none is found, and 1 when `expected` is empty.
 */
export function computeMrr(output: string[], expected: string[]): number {
  if (expected.length === 0) {
    return 1;
  }
  const relevant = new Set(expected);
  const firstHit = output.findIndex((docid) => relevant.has(docid));
  return firstHit === -1 ? 0 : 1 / (firstHit + 1);
}
|
|
101
|
+
|
|
102
|
+
/**
 * Compute every retrieval metric for one query and round each to 4 decimals.
 *
 * F1 is derived from the unrounded precision and recall. When no graded
 * judgments are supplied, nDCG falls back to binary judgments (relevance 1)
 * built from `expected`. MRR is computed over the full `output`, not just the
 * top K.
 *
 * @param input - Retrieved ids, expected ids, graded judgments, and cut-off `k`.
 * @returns The rounded metric set.
 */
export function computeRetrievalMetrics(input: {
  output: string[];
  expected: string[];
  judgments: RelevanceJudgment[];
  k: number;
}): RetrievalMetrics {
  const { output, expected, judgments, k } = input;

  const gradedJudgments: RelevanceJudgment[] =
    judgments.length > 0
      ? judgments
      : expected.map((docid) => ({ docid, relevance: 1 }));

  const precision = computePrecision(output, expected, k);
  const recall = computeRecall(output, expected, k);
  const f1 = computeF1(precision, recall);
  const reciprocalRank = computeMrr(output, expected);
  const ndcg = computeNdcg(output, gradedJudgments, k);

  return {
    precisionAtK: round(precision),
    recallAtK: round(recall),
    f1AtK: round(f1),
    mrr: round(reciprocalRank),
    ndcgAtK: round(ndcg),
  };
}
|
|
123
|
+
|
|
124
|
+
/**
 * Average each metric across queries (arithmetic mean) and round the result
 * to 4 decimals.
 *
 * @param metrics - Per-query metric sets.
 * @returns The per-field means; every field is 0 when `metrics` is empty.
 */
export function averageMetrics(metrics: RetrievalMetrics[]): RetrievalMetrics {
  const meanOf = (field: keyof RetrievalMetrics): number => {
    if (metrics.length === 0) {
      return round(0);
    }
    let total = 0;
    for (const entry of metrics) {
      total += entry[field];
    }
    return round(total / metrics.length);
  };

  return {
    precisionAtK: meanOf("precisionAtK"),
    recallAtK: meanOf("recallAtK"),
    f1AtK: meanOf("f1AtK"),
    mrr: meanOf("mrr"),
    ndcgAtK: meanOf("ndcgAtK"),
  };
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import type { QueryModeInput } from "../pipeline/types";
|
|
2
|
+
import type { RelevanceJudgment, RetrievalMetrics } from "./metrics";
|
|
3
|
+
|
|
4
|
+
/** The retrieval types a bench mode can use. */
export type BenchModeType =
  | "bm25"
  | "vector"
  | "hybrid";
|
|
5
|
+
|
|
6
|
+
/** One normalized retrieval configuration to benchmark. */
export interface BenchMode {
  /** Label for this mode. */
  name: string;
  /** Retrieval type. */
  type: BenchModeType;
  /** Only ever `"thorough"` when set. */
  depth?: "thorough";
  /** NOTE(review): presumably disables query expansion — confirm against the bench runner. */
  noExpand?: boolean;
  /** NOTE(review): presumably disables reranking — confirm against the bench runner. */
  noRerank?: boolean;
  /** Mode-specific candidate limit. */
  candidateLimit?: number;
  /** Mode-specific result limit. */
  limit?: number;
  /** Typed query inputs attached to this mode. */
  queryModes?: QueryModeInput[];
}
|
|
16
|
+
|
|
17
|
+
/** One normalized benchmark query together with its ground truth. */
export interface BenchCase {
  /** Identifier of the query. */
  id: string;
  /** Query text. */
  query: string;
  /** References of the documents considered relevant. */
  expected: string[];
  /** Graded relevance judgments; may be empty. */
  judgments: RelevanceJudgment[];
  /** Collection for this query, when one is set. */
  collection?: string;
  /** Per-query cut-off, when one is set. */
  topK?: number;
  /** Typed query inputs attached to this query. */
  queryModes?: QueryModeInput[];
}
|
|
26
|
+
|
|
27
|
+
/** A bench fixture after validation and normalization. */
export interface BenchFixture {
  /** Fixture format version; only `1` exists. */
  version: 1;
  /** Optional descriptive information about the fixture. */
  metadata?: {
    name?: string;
    description?: string;
    tags?: string[];
  };
  /** Fixture-level collection, when one is set. */
  collection?: string;
  /** Fixture-level cut-off; always present after normalization. */
  topK: number;
  /** Fixture-level candidate limit, when one is set. */
  candidateLimit?: number;
  /** Modes to run. */
  modes: BenchMode[];
  /** Queries to evaluate. */
  queries: BenchCase[];
}
|
|
40
|
+
|
|
41
|
+
/** Options supplied by the caller of a bench run; every field is optional. */
export interface BenchOptions {
  /** NOTE(review): presumably the path of the config file to load — confirm with the command. */
  configPath?: string;
  /** NOTE(review): presumably selects the index to query — confirm with the command. */
  indexName?: string;
  /** Collection override. */
  collection?: string;
  /** Cut-off override. */
  topK?: number;
  /** Candidate limit override. */
  candidateLimit?: number;
  /** Mode names; still unvalidated strings at this point. */
  modes?: string[];
  /** NOTE(review): presumably requests JSON output — confirm with the command. */
  json?: boolean;
}
|
|
50
|
+
|
|
51
|
+
/** Outcome of running one query under one mode. */
export interface BenchCaseResult {
  /** Identifier of the query. */
  id: string;
  /** Query text. */
  query: string;
  /** Cut-off used for this query. */
  topK: number;
  /** Expected document references. */
  expected: string[];
  /** NOTE(review): presumably the expected documents that were retrieved — confirm with the runner. */
  hits: string[];
  /** NOTE(review): presumably the retrieved documents in rank order — confirm with the runner. */
  topDocs: string[];
  /** Metric scores for this query. */
  metrics: RetrievalMetrics;
  /** Latency of this query in milliseconds. */
  latencyMs: number;
  /** Error message, when one is recorded for this query. */
  error?: string;
}
|
|
62
|
+
|
|
63
|
+
/** Aggregated outcome of running every query under one mode. */
export interface BenchModeResult {
  /** Label of the mode. */
  name: string;
  /** Retrieval type of the mode. */
  type: BenchModeType;
  /** Overall status of the mode run. */
  status: "ok" | "failed";
  /** Number of queries in the run. */
  queryCount: number;
  /** Number of failures. */
  failures: number;
  /** Metric scores for the mode. */
  metrics: RetrievalMetrics;
  /** Latency summary in milliseconds: median, 95th percentile, and mean. */
  latency: {
    p50Ms: number;
    p95Ms: number;
    meanMs: number;
  };
  /** Per-query results. */
  cases: BenchCaseResult[];
}
|
|
77
|
+
|
|
78
|
+
/** Complete report of a bench run. */
export interface BenchOutput {
  /** Summary of the fixture that was run. */
  fixture: {
    path: string;
    name?: string;
    version: 1;
    queryCount: number;
    topK: number;
  };
  /** NOTE(review): presumably an ISO-8601 timestamp — the format is not visible here. */
  generatedAt: string;
  /** One result per mode. */
  modes: BenchModeResult[];
  /** Index (and collection, when set) the run used. */
  meta: {
    indexName: string;
    collection?: string;
  };
}
|
|
93
|
+
|
|
94
|
+
/**
 * Result of a bench run, discriminated on `success`.
 *
 * The failure branch carries an error message and an optional `isValidation`
 * flag (NOTE(review): presumably marks input/fixture validation errors —
 * confirm with the command that sets it).
 */
export type BenchResult =
  | {
      success: true;
      data: BenchOutput;
    }
  | {
      success: false;
      error: string;
      isValidation?: boolean;
    };
|