@gmickel/gno 1.3.1 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,280 @@
1
+ /**
2
+ * gno bench command implementation.
3
+ * Runs retrieval benchmarks from user fixtures.
4
+ *
5
+ * @module src/cli/commands/bench
6
+ */
7
+
8
+ import type {
9
+ BenchCase,
10
+ BenchCaseResult,
11
+ BenchMode,
12
+ BenchModeResult,
13
+ BenchOptions,
14
+ BenchResult,
15
+ } from "../../bench/types";
16
+ import type { SearchResult } from "../../pipeline/types";
17
+
18
+ import { loadBenchFixture, normalizeBenchRef } from "../../bench/fixture";
19
+ import { averageMetrics, computeRetrievalMetrics } from "../../bench/metrics";
20
+ import { DEFAULT_THOROUGH_CANDIDATE_LIMIT } from "../../core/depth-policy";
21
+ import { query } from "./query";
22
+ import { search } from "./search";
23
+ import { vsearch } from "./vsearch";
24
+
25
/**
 * Round a number to a fixed count of decimal places.
 *
 * @param value - Number to round.
 * @param places - Decimal places to keep; defaults to 2.
 * @returns The value after a `toFixed` round-trip, parsed back into a number.
 */
function round(value: number, places = 2): number {
  const fixed = value.toFixed(places);
  return Number.parseFloat(fixed);
}
28
+
29
/**
 * Summarize per-case latencies as p50, p95 and mean.
 *
 * Percentiles use the nearest-rank method on an ascending copy of the input,
 * so the caller's array is never reordered. An empty input produces an
 * all-zero summary.
 *
 * @param values - Latency samples in milliseconds.
 * @returns Rounded p50 / p95 / mean figures.
 */
function summarizeLatency(values: number[]): BenchModeResult["latency"] {
  const count = values.length;
  if (count === 0) {
    return { p50Ms: 0, p95Ms: 0, meanMs: 0 };
  }

  const ascending = values.slice().sort((left, right) => left - right);
  const lastIndex = count - 1;

  // Nearest rank: ceil(p% of n) - 1, clamped into the valid index range.
  const nearestRank = (percent: number): number => {
    const rank = Math.ceil((percent / 100) * count) - 1;
    const capped = Math.min(lastIndex, rank);
    return ascending[Math.max(0, capped)] ?? 0;
  };

  let total = 0;
  for (const sample of values) {
    total += sample;
  }

  const p50Ms = round(nearestRank(50));
  const p95Ms = round(nearestRank(95));
  const meanMs = round(total / count);
  return { p50Ms, p95Ms, meanMs };
}
46
+
47
/**
 * Collect every identifier under which a search result may be referenced:
 * docid, uri, the uri passed through `normalizeBenchRef`, the source relPath
 * and the title. Falsy entries (missing or empty) are left out.
 *
 * @param result - A single search result.
 * @returns The set of non-empty reference strings for that result.
 */
function resultRefs(result: SearchResult): Set<string> {
  const candidates = [
    result.docid,
    result.uri,
    normalizeBenchRef(result.uri),
    result.source.relPath,
    result.title,
  ];

  const refs = new Set<string>();
  for (const candidate of candidates) {
    if (candidate) {
      refs.add(candidate);
    }
  }
  return refs;
}
58
+
59
/**
 * List the expected references that appear within the first `k` results.
 *
 * Expected references are normalized with `normalizeBenchRef` and
 * de-duplicated. Each result credits at most one expected reference: the
 * first one, in expected order, that is among the result's own refs. The
 * returned list holds each credited reference once, in the order it was
 * first hit.
 *
 * @param results - Ranked search results.
 * @param expected - Expected references from the bench case.
 * @param k - Number of leading results to inspect.
 * @returns Normalized expected references found in the top `k`.
 */
function findHits(
  results: SearchResult[],
  expected: string[],
  k: number
): string[] {
  // Built once rather than once per result; the Set keeps first-seen order.
  const expectedRefs = [...new Set(expected.map(normalizeBenchRef))];
  // A Set iterates in insertion order, so it de-duplicates without the
  // linear `includes` scan while preserving first-hit order.
  const hits = new Set<string>();

  for (const result of results.slice(0, k)) {
    const refs = resultRefs(result);
    const hit = expectedRefs.find((expectedRef) => refs.has(expectedRef));
    if (hit) {
      hits.add(hit);
    }
  }

  return [...hits];
}
77
+
78
/**
 * Project ranked results onto their source relative paths, keeping rank order.
 *
 * @param results - Ranked search results.
 * @returns One `source.relPath` per result.
 */
function topDocs(results: SearchResult[]): string[] {
  const paths: string[] = [];
  for (const { source } of results) {
    paths.push(source.relPath);
  }
  return paths;
}
81
+
82
/**
 * Map each ranked result to the label that is handed to the metric
 * computation.
 *
 * A result that matches an expected reference (via any of its refs) is
 * labelled with the first such normalized expected reference; every other
 * result keeps its source relative path.
 *
 * @param results - Ranked search results.
 * @param expected - Expected references from the bench case.
 * @returns One label per result, in rank order.
 */
function rankedMetricDocs(
  results: SearchResult[],
  expected: string[]
): string[] {
  // Materialized once rather than once per result; order follows `expected`.
  const expectedRefs = [...new Set(expected.map(normalizeBenchRef))];

  return results.map((result) => {
    const refs = resultRefs(result);
    const matched = expectedRefs.find((expectedRef) => refs.has(expectedRef));
    return matched ?? result.source.relPath;
  });
}
95
+
96
/**
 * Run one bench case under one mode and score the outcome.
 *
 * Dispatch: mode type "bm25" calls `search`, "vector" calls `vsearch`, and
 * any other type calls `query` (which additionally receives the candidate
 * limit, expansion/rerank switches and query modes). The measured latency
 * spans only the retrieval call. A retrieval that reports failure is scored
 * against an empty output and carries the reported error.
 *
 * @param input.mode - Benchmark mode to run.
 * @param input.benchCase - Fixture case (query, expected refs, judgments).
 * @param input.topK - Metric cutoff for this case.
 * @param input.candidateLimit - Fallback candidate limit when the mode sets none.
 * @param input.options - Bench options; config path and index name are forwarded.
 * @returns The scored result for this case.
 */
async function runModeCase(input: {
  mode: BenchMode;
  benchCase: BenchCase;
  topK: number;
  candidateLimit?: number;
  options: BenchOptions;
}): Promise<BenchCaseResult> {
  const { mode, benchCase, topK, options } = input;
  const { expected, judgments } = benchCase;

  // The mode's own limit wins; otherwise retrieve exactly topK results.
  const limit = mode.limit ?? topK;
  // Precedence: mode setting, then the caller-supplied value, then the
  // thorough-depth default (only for modes whose depth is "thorough").
  const candidateLimit =
    mode.candidateLimit ??
    input.candidateLimit ??
    (mode.depth === "thorough" ? DEFAULT_THOROUGH_CANDIDATE_LIMIT : undefined);

  const startedAt = performance.now();
  const queryModes = benchCase.queryModes ?? mode.queryModes;
  const shared = {
    configPath: options.configPath,
    indexName: options.indexName,
    collection: benchCase.collection,
    limit,
  };

  let outcome:
    | Awaited<ReturnType<typeof search>>
    | Awaited<ReturnType<typeof vsearch>>
    | Awaited<ReturnType<typeof query>>;

  switch (mode.type) {
    case "bm25":
      outcome = await search(benchCase.query, { ...shared, json: true });
      break;
    case "vector":
      outcome = await vsearch(benchCase.query, { ...shared, json: true });
      break;
    default:
      outcome = await query(benchCase.query, {
        ...shared,
        candidateLimit,
        noExpand: mode.noExpand,
        noRerank: mode.noRerank,
        queryModes,
        json: true,
      });
      break;
  }

  const latencyMs = round(performance.now() - startedAt);

  const score = (output: string[]) =>
    computeRetrievalMetrics({ output, expected, judgments, k: topK });
  const identity = {
    id: benchCase.id,
    query: benchCase.query,
    topK,
    expected,
  };

  if (!outcome.success) {
    return {
      ...identity,
      hits: [],
      topDocs: [],
      metrics: score([]),
      latencyMs,
      error: outcome.error,
    };
  }

  const ranked = outcome.data.results;
  const docs = topDocs(ranked);
  const metricDocs = rankedMetricDocs(ranked, expected);
  const hits = findHits(ranked, expected, topK);

  return {
    ...identity,
    hits,
    topDocs: docs.slice(0, topK),
    metrics: score(metricDocs),
    latencyMs,
  };
}
185
+
186
/**
 * Execute gno bench command.
 *
 * Loads the fixture, then runs every fixture query under every fixture mode,
 * one at a time, and aggregates per-mode metrics and latency. A fixture that
 * fails to load is returned as a validation failure.
 *
 * @param fixturePath - Path handed to the fixture loader and echoed in the report.
 * @param options - Bench options; forwarded to the loader and to each case run.
 * @returns The benchmark report, or a failure result when loading fails.
 */
export async function bench(
  fixturePath: string,
  options: BenchOptions = {}
): Promise<BenchResult> {
  const loaded = await loadBenchFixture(fixturePath, options);
  if (!loaded.ok) {
    return { success: false, error: loaded.error, isValidation: true };
  }

  const { fixture } = loaded;
  const { queries, topK: fixtureTopK, candidateLimit } = fixture;
  const modes: BenchModeResult[] = [];

  for (const mode of fixture.modes) {
    // Cases are awaited one after another, never concurrently.
    const cases: BenchCaseResult[] = [];
    for (const benchCase of queries) {
      const caseResult = await runModeCase({
        mode,
        benchCase,
        topK: benchCase.topK ?? fixtureTopK,
        candidateLimit,
        options,
      });
      cases.push(caseResult);
    }

    const queryCount = cases.length;
    const failures = cases.reduce(
      (count, entry) => (entry.error ? count + 1 : count),
      0
    );

    modes.push({
      name: mode.name,
      type: mode.type,
      // A mode is "failed" only when every case errored.
      // NOTE(review): with zero queries this also yields "failed" (0 === 0);
      // confirm the loader rejects fixtures without queries.
      status: failures === queryCount ? "failed" : "ok",
      queryCount,
      failures,
      metrics: averageMetrics(cases.map((entry) => entry.metrics)),
      latency: summarizeLatency(cases.map((entry) => entry.latencyMs)),
      cases,
    });
  }

  return {
    success: true,
    data: {
      fixture: {
        path: fixturePath,
        name: fixture.metadata?.name,
        version: fixture.version,
        queryCount: queries.length,
        topK: fixtureTopK,
      },
      generatedAt: new Date().toISOString(),
      modes,
      meta: {
        indexName: options.indexName ?? "default",
        collection: fixture.collection,
      },
    },
  };
}
248
+
249
/**
 * Render a bench result for output.
 *
 * Failures become a compact JSON error envelope (JSON mode) or an
 * `Error: …` line. Successes become the pretty-printed report data (JSON
 * mode) or a short header followed by a pipe-delimited table with one row
 * per mode.
 *
 * @param result - Result returned by `bench`.
 * @param options - `json` selects JSON output instead of the text table.
 * @returns The formatted text.
 */
export function formatBench(
  result: BenchResult,
  options: { json?: boolean }
): string {
  const asJson = options.json;

  if (!result.success) {
    if (asJson) {
      return JSON.stringify({
        error: { code: "BENCH_FAILED", message: result.error },
      });
    }
    return `Error: ${result.error}`;
  }

  const { data } = result;
  if (asJson) {
    return JSON.stringify(data, null, 2);
  }

  const { fixture, modes } = data;
  const header = [
    `Bench: ${fixture.name ?? fixture.path}`,
    `Queries: ${fixture.queryCount} Top K: ${fixture.topK}`,
    "",
    "| Mode | Status | Precision@K | Recall@K | F1@K | MRR | nDCG@K | p95 ms | Failures |",
    "| ---- | ------ | ----------- | -------- | ---- | --- | ------ | ------ | -------- |",
  ];

  const rows = modes.map(({ name, status, metrics, latency, failures }) => {
    // Quality metrics print with three decimals, latency with two.
    const cells = [
      `${name}`,
      `${status}`,
      metrics.precisionAtK.toFixed(3),
      metrics.recallAtK.toFixed(3),
      metrics.f1AtK.toFixed(3),
      metrics.mrr.toFixed(3),
      metrics.ndcgAtK.toFixed(3),
      latency.p95Ms.toFixed(2),
      `${failures}`,
    ];
    return `| ${cells.join(" | ")} |`;
  });

  return [...header, ...rows].join("\n");
}
@@ -268,8 +268,11 @@ async function checkSqliteExtensions(): Promise<DoctorCheck[]> {
268
268
 
269
269
  let vecMessage: string;
270
270
  if (sqliteVecAvailable) {
271
+ const formattedVersion = sqliteVecVersion.startsWith("v")
272
+ ? sqliteVecVersion
273
+ : `v${sqliteVecVersion}`;
271
274
  vecMessage = sqliteVecVersion
272
- ? `sqlite-vec loaded (v${sqliteVecVersion})`
275
+ ? `sqlite-vec loaded (${formattedVersion})`
273
276
  : "sqlite-vec loaded";
274
277
  } else if (mode === "unavailable") {
275
278
  vecMessage =
@@ -22,6 +22,7 @@ export const CMD = {
22
22
  search: "search",
23
23
  vsearch: "vsearch",
24
24
  query: "query",
25
+ bench: "bench",
25
26
  ask: "ask",
26
27
  get: "get",
27
28
  multiGet: "multi-get",
@@ -45,6 +46,7 @@ const FORMAT_SUPPORT: Record<CommandId, OutputFormat[]> = {
45
46
  [CMD.search]: ["terminal", "json", "files", "csv", "md", "xml"],
46
47
  [CMD.vsearch]: ["terminal", "json", "files", "csv", "md", "xml"],
47
48
  [CMD.query]: ["terminal", "json", "files", "csv", "md", "xml"],
49
+ [CMD.bench]: ["terminal", "json"],
48
50
  [CMD.ask]: ["terminal", "json", "md"],
49
51
  [CMD.get]: ["terminal", "json", "md"],
50
52
  [CMD.multiGet]: ["terminal", "json", "files", "md"],
@@ -677,6 +677,58 @@ function wireSearchCommands(program: Command): void {
677
677
  await writeOutput(output, format);
678
678
  });
679
679
 
680
+ // bench - Retrieval benchmark fixture runner
681
+ program
682
+ .command("bench <fixture>")
683
+ .description("Run retrieval quality benchmarks from a fixture")
684
+ .option("-c, --collection <name>", "override fixture collection")
685
+ .option("-k, --top-k <num>", "override top-k metric cutoff")
686
+ .option(
687
+ "--mode <name>",
688
+ "benchmark mode (repeatable): bm25, vector, hybrid, fast, no-rerank, thorough",
689
+ (value: string, previous: string[] = []) => [...previous, value],
690
+ []
691
+ )
692
+ .option("-C, --candidate-limit <num>", "max candidates passed to reranking")
693
+ .option("--json", "JSON output")
694
+ .action(async (fixture: string, cmdOpts: Record<string, unknown>) => {
695
+ const format = getFormat(cmdOpts);
696
+ assertFormatSupported(CMD.bench, format);
697
+ const globals = getGlobals();
698
+ const topK = cmdOpts.topK
699
+ ? parsePositiveInt("top-k", cmdOpts.topK)
700
+ : undefined;
701
+ const candidateLimit = cmdOpts.candidateLimit
702
+ ? parsePositiveInt("candidate-limit", cmdOpts.candidateLimit)
703
+ : undefined;
704
+
705
+ const { bench, formatBench } = await import("./commands/bench");
706
+ const result = await bench(fixture, {
707
+ configPath: globals.config,
708
+ indexName: globals.index,
709
+ collection: cmdOpts.collection as string | undefined,
710
+ topK,
711
+ candidateLimit,
712
+ modes:
713
+ Array.isArray(cmdOpts.mode) && cmdOpts.mode.length > 0
714
+ ? (cmdOpts.mode as string[])
715
+ : undefined,
716
+ json: format === "json",
717
+ });
718
+
719
+ if (!result.success) {
720
+ throw new CliError(
721
+ result.isValidation ? "VALIDATION" : "RUNTIME",
722
+ result.error
723
+ );
724
+ }
725
+
726
+ await writeOutput(
727
+ formatBench(result, { json: format === "json" }),
728
+ format
729
+ );
730
+ });
731
+
680
732
  // ask - Human-friendly query with grounded answer
681
733
  program
682
734
  .command("ask <query>")
@@ -53,6 +53,20 @@ export function normalizeTagFilters(tags?: string[]): string[] | undefined {
53
53
  return [...new Set(tags.map(normalizeTag))];
54
54
  }
55
55
 
56
/**
 * Tool description strings, keyed by tool: search, vsearch, query, get,
 * multiGet and status. Declared `as const`, so every value keeps its literal
 * string type.
 */
export const MCP_TOOL_DESCRIPTIONS = {
  // Keyword (BM25) lookup.
  search:
    "BM25 keyword search. Fast exact-term lookup for names, identifiers, error text, and known phrases. Results include uri/docid and line when available; use gno_get with fromLine/lineCount or gno_multi_get for full context. Use gno_query when wording is uncertain.",

  // Vector / semantic lookup.
  vsearch:
    "Vector semantic search. Finds conceptually similar docs with different wording. Best after embeddings are current; use intent to disambiguate short terms. Use gno_query for default hybrid retrieval.",

  // Hybrid retrieval.
  query:
    "Hybrid search (BM25 + vector + optional expansion/reranking). Recommended default. Use intent for ambiguous terms, queryModes to combine term/intent/hyde strategies, fast=true for quick lookup, thorough=true when recall matters, and candidateLimit to trade latency for coverage.",

  // Single-document retrieval.
  get: "Retrieve one document by gno:// URI, docid (#abc123), or collection/path. After search results include line, pass fromLine and lineCount to fetch only the relevant range before expanding to the full document.",

  // Batch retrieval.
  multiGet:
    "Retrieve multiple documents by refs array or glob pattern. Use after gno_search/gno_query to batch top result URIs/docids; set maxBytes and lineNumbers to control context size.",

  // Index health.
  status:
    "Get index health: collection count, document count, chunk count, embedding backlog, and per-collection stats. Check first when vector/hybrid results look stale or unavailable.",
} as const;
69
+
56
70
  // ─────────────────────────────────────────────────────────────────────────────
57
71
  // Shared Input Schemas
58
72
  // ─────────────────────────────────────────────────────────────────────────────
@@ -61,7 +75,9 @@ const searchInputSchema = z.object({
61
75
  query: z
62
76
  .string()
63
77
  .min(1, "Query cannot be empty")
64
- .describe("Search query text"),
78
+ .describe(
79
+ "Exact keyword, identifier, filename, error text, or phrase to match with BM25"
80
+ ),
65
81
  collection: z
66
82
  .string()
67
83
  .optional()
@@ -89,7 +105,7 @@ const searchInputSchema = z.object({
89
105
  .string()
90
106
  .optional()
91
107
  .describe(
92
- "Disambiguating context for ambiguous queries (e.g. 'programming language' when query is 'python')"
108
+ "Disambiguating context for ambiguous queries; not searched directly (e.g. 'programming language' when query is 'python')"
93
109
  ),
94
110
  exclude: z
95
111
  .array(z.string())
@@ -274,7 +290,9 @@ const vsearchInputSchema = z.object({
274
290
  query: z
275
291
  .string()
276
292
  .min(1, "Query cannot be empty")
277
- .describe("Search query text (matched by semantic meaning, not keywords)"),
293
+ .describe(
294
+ "Natural-language concept to match semantically; use gno_search for exact error text or identifiers"
295
+ ),
278
296
  collection: z
279
297
  .string()
280
298
  .optional()
@@ -299,7 +317,9 @@ const vsearchInputSchema = z.object({
299
317
  intent: z
300
318
  .string()
301
319
  .optional()
302
- .describe("Disambiguating context for the query"),
320
+ .describe(
321
+ "Disambiguating context for ambiguous terms; steers snippet choice without becoming the searched text"
322
+ ),
303
323
  exclude: z
304
324
  .array(z.string())
305
325
  .optional()
@@ -325,20 +345,24 @@ const queryModeInputSchema = z.object({
325
345
  mode: z
326
346
  .enum(["term", "intent", "hyde"])
327
347
  .describe(
328
- "Retrieval strategy: 'term' (keyword match), 'intent' (disambiguation), 'hyde' (hypothetical document for semantic matching)"
348
+ "Retrieval strategy: 'term' for exact lexical anchors, 'intent' for disambiguation, 'hyde' for one hypothetical answer/document to improve semantic matching"
329
349
  ),
330
350
  text: z
331
351
  .string()
332
352
  .trim()
333
353
  .min(1, "Query mode text cannot be empty")
334
- .describe("Text for this query mode"),
354
+ .describe(
355
+ "Text for this query mode; keep term modes concise and hyde modes answer-shaped"
356
+ ),
335
357
  });
336
358
 
337
359
  export const queryInputSchema = z.object({
338
360
  query: z
339
361
  .string()
340
362
  .min(1, "Query cannot be empty")
341
- .describe("Search query text"),
363
+ .describe(
364
+ "Primary user query; combine with intent or queryModes for ambiguous requests"
365
+ ),
342
366
  collection: z
343
367
  .string()
344
368
  .optional()
@@ -366,7 +390,7 @@ export const queryInputSchema = z.object({
366
390
  .string()
367
391
  .optional()
368
392
  .describe(
369
- "Disambiguating context (e.g. 'programming language' when query is 'python')"
393
+ "Disambiguating context (e.g. 'programming language' when query is 'python'); steers expansion, rerank, and snippet choice"
370
394
  ),
371
395
  candidateLimit: z
372
396
  .number()
@@ -375,7 +399,7 @@ export const queryInputSchema = z.object({
375
399
  .max(100)
376
400
  .optional()
377
401
  .describe(
378
- "Max candidates passed to reranking stage (higher = better recall, slower)"
402
+ "Max candidates passed to reranking stage; raise when top results miss relevant docs, lower for latency"
379
403
  ),
380
404
  exclude: z
381
405
  .array(z.string())
@@ -397,7 +421,7 @@ export const queryInputSchema = z.object({
397
421
  queryModes: z
398
422
  .array(queryModeInputSchema)
399
423
  .describe(
400
- "Structured query modes to combine multiple retrieval strategies. Max one 'hyde' entry."
424
+ "Structured query modes for typed retrieval: combine term anchors, intent disambiguation, and at most one hyde hypothetical document"
401
425
  )
402
426
  .superRefine((entries, ctx) => {
403
427
  const hydeCount = entries.filter((entry) => entry.mode === "hyde").length;
@@ -417,7 +441,7 @@ export const queryInputSchema = z.object({
417
441
  .boolean()
418
442
  .default(false)
419
443
  .describe(
420
- "Enable query expansion for best recall (~5-8s). Use when default returns no results"
444
+ "Enable query expansion for best recall (~5-8s). Use for broad research or when default results miss likely docs"
421
445
  ),
422
446
  expand: z
423
447
  .boolean()
@@ -443,13 +467,17 @@ const getInputSchema = z.object({
443
467
  .int()
444
468
  .min(1)
445
469
  .optional()
446
- .describe("Start reading from this line number"),
470
+ .describe(
471
+ "Start reading from this line number; use the line returned by search/query results"
472
+ ),
447
473
  lineCount: z
448
474
  .number()
449
475
  .int()
450
476
  .min(1)
451
477
  .optional()
452
- .describe("Number of lines to return (from fromLine)"),
478
+ .describe(
479
+ "Number of lines to return from fromLine; prefer a small range before fetching full docs"
480
+ ),
453
481
  lineNumbers: z
454
482
  .boolean()
455
483
  .default(true)
@@ -461,7 +489,9 @@ const multiGetInputSchema = z.object({
461
489
  .array(z.string())
462
490
  .min(1)
463
491
  .optional()
464
- .describe("Array of document references (URIs or docids)"),
492
+ .describe(
493
+ "Array of document references from search/query results (gno:// URIs or docids)"
494
+ ),
465
495
  pattern: z
466
496
  .string()
467
497
  .optional()
@@ -471,7 +501,9 @@ const multiGetInputSchema = z.object({
471
501
  .int()
472
502
  .min(1)
473
503
  .default(10_240)
474
- .describe("Max bytes per document (truncates longer docs)"),
504
+ .describe(
505
+ "Max bytes per document; lower this when batching many top search results"
506
+ ),
475
507
  lineNumbers: z
476
508
  .boolean()
477
509
  .default(true)
@@ -717,42 +749,42 @@ export function registerTools(server: McpServer, ctx: ToolContext): void {
717
749
  // Tool IDs use underscores (MCP pattern: ^[a-zA-Z0-9_-]{1,64}$)
718
750
  server.tool(
719
751
  "gno_search",
720
- "BM25 keyword search. Instant, best for exact terms. Use gno_query for better quality.",
752
+ MCP_TOOL_DESCRIPTIONS.search,
721
753
  searchInputSchema.shape,
722
754
  (args) => handleSearch(args, ctx)
723
755
  );
724
756
 
725
757
  server.tool(
726
758
  "gno_vsearch",
727
- "Vector semantic search. Finds conceptually similar docs even with different wording. Use gno_query for best results.",
759
+ MCP_TOOL_DESCRIPTIONS.vsearch,
728
760
  vsearchInputSchema.shape,
729
761
  (args) => handleVsearch(args, ctx)
730
762
  );
731
763
 
732
764
  server.tool(
733
765
  "gno_query",
734
- "Hybrid search (BM25 + vector + reranking). Best quality, recommended default. Use fast=true for speed, thorough=true for best recall.",
766
+ MCP_TOOL_DESCRIPTIONS.query,
735
767
  queryInputSchema.shape,
736
768
  (args) => handleQuery(args, ctx)
737
769
  );
738
770
 
739
771
  server.tool(
740
772
  "gno_get",
741
- "Retrieve a single document's full content by URI (gno://collection/path), docid (#abc123), or collection/path.",
773
+ MCP_TOOL_DESCRIPTIONS.get,
742
774
  getInputSchema.shape,
743
775
  (args) => handleGet(args, ctx)
744
776
  );
745
777
 
746
778
  server.tool(
747
779
  "gno_multi_get",
748
- "Retrieve multiple documents by refs array or glob pattern. Use maxBytes to control truncation.",
780
+ MCP_TOOL_DESCRIPTIONS.multiGet,
749
781
  multiGetInputSchema.shape,
750
782
  (args) => handleMultiGet(args, ctx)
751
783
  );
752
784
 
753
785
  server.tool(
754
786
  "gno_status",
755
- "Get index health: collection count, document count, chunk count, embedding backlog, and per-collection stats.",
787
+ MCP_TOOL_DESCRIPTIONS.status,
756
788
  statusInputSchema.shape,
757
789
  (args) => handleStatus(args, ctx)
758
790
  );