@gmickel/gno 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/README.md +256 -0
  2. package/assets/skill/SKILL.md +112 -0
  3. package/assets/skill/cli-reference.md +327 -0
  4. package/assets/skill/examples.md +234 -0
  5. package/assets/skill/mcp-reference.md +159 -0
  6. package/package.json +90 -0
  7. package/src/app/constants.ts +313 -0
  8. package/src/cli/colors.ts +65 -0
  9. package/src/cli/commands/ask.ts +545 -0
  10. package/src/cli/commands/cleanup.ts +105 -0
  11. package/src/cli/commands/collection/add.ts +120 -0
  12. package/src/cli/commands/collection/index.ts +10 -0
  13. package/src/cli/commands/collection/list.ts +108 -0
  14. package/src/cli/commands/collection/remove.ts +64 -0
  15. package/src/cli/commands/collection/rename.ts +95 -0
  16. package/src/cli/commands/context/add.ts +67 -0
  17. package/src/cli/commands/context/check.ts +153 -0
  18. package/src/cli/commands/context/index.ts +10 -0
  19. package/src/cli/commands/context/list.ts +109 -0
  20. package/src/cli/commands/context/rm.ts +52 -0
  21. package/src/cli/commands/doctor.ts +393 -0
  22. package/src/cli/commands/embed.ts +462 -0
  23. package/src/cli/commands/get.ts +356 -0
  24. package/src/cli/commands/index-cmd.ts +119 -0
  25. package/src/cli/commands/index.ts +102 -0
  26. package/src/cli/commands/init.ts +328 -0
  27. package/src/cli/commands/ls.ts +217 -0
  28. package/src/cli/commands/mcp/config.ts +300 -0
  29. package/src/cli/commands/mcp/index.ts +24 -0
  30. package/src/cli/commands/mcp/install.ts +203 -0
  31. package/src/cli/commands/mcp/paths.ts +470 -0
  32. package/src/cli/commands/mcp/status.ts +222 -0
  33. package/src/cli/commands/mcp/uninstall.ts +158 -0
  34. package/src/cli/commands/mcp.ts +20 -0
  35. package/src/cli/commands/models/clear.ts +103 -0
  36. package/src/cli/commands/models/index.ts +32 -0
  37. package/src/cli/commands/models/list.ts +214 -0
  38. package/src/cli/commands/models/path.ts +51 -0
  39. package/src/cli/commands/models/pull.ts +199 -0
  40. package/src/cli/commands/models/use.ts +85 -0
  41. package/src/cli/commands/multi-get.ts +400 -0
  42. package/src/cli/commands/query.ts +220 -0
  43. package/src/cli/commands/ref-parser.ts +108 -0
  44. package/src/cli/commands/reset.ts +191 -0
  45. package/src/cli/commands/search.ts +136 -0
  46. package/src/cli/commands/shared.ts +156 -0
  47. package/src/cli/commands/skill/index.ts +19 -0
  48. package/src/cli/commands/skill/install.ts +197 -0
  49. package/src/cli/commands/skill/paths-cmd.ts +81 -0
  50. package/src/cli/commands/skill/paths.ts +191 -0
  51. package/src/cli/commands/skill/show.ts +73 -0
  52. package/src/cli/commands/skill/uninstall.ts +141 -0
  53. package/src/cli/commands/status.ts +205 -0
  54. package/src/cli/commands/update.ts +68 -0
  55. package/src/cli/commands/vsearch.ts +188 -0
  56. package/src/cli/context.ts +64 -0
  57. package/src/cli/errors.ts +64 -0
  58. package/src/cli/format/search-results.ts +211 -0
  59. package/src/cli/options.ts +183 -0
  60. package/src/cli/program.ts +1330 -0
  61. package/src/cli/run.ts +213 -0
  62. package/src/cli/ui.ts +92 -0
  63. package/src/config/defaults.ts +20 -0
  64. package/src/config/index.ts +55 -0
  65. package/src/config/loader.ts +161 -0
  66. package/src/config/paths.ts +87 -0
  67. package/src/config/saver.ts +153 -0
  68. package/src/config/types.ts +280 -0
  69. package/src/converters/adapters/markitdownTs/adapter.ts +140 -0
  70. package/src/converters/adapters/officeparser/adapter.ts +126 -0
  71. package/src/converters/canonicalize.ts +89 -0
  72. package/src/converters/errors.ts +218 -0
  73. package/src/converters/index.ts +51 -0
  74. package/src/converters/mime.ts +163 -0
  75. package/src/converters/native/markdown.ts +115 -0
  76. package/src/converters/native/plaintext.ts +56 -0
  77. package/src/converters/path.ts +48 -0
  78. package/src/converters/pipeline.ts +159 -0
  79. package/src/converters/registry.ts +74 -0
  80. package/src/converters/types.ts +123 -0
  81. package/src/converters/versions.ts +24 -0
  82. package/src/index.ts +27 -0
  83. package/src/ingestion/chunker.ts +238 -0
  84. package/src/ingestion/index.ts +32 -0
  85. package/src/ingestion/language.ts +276 -0
  86. package/src/ingestion/sync.ts +671 -0
  87. package/src/ingestion/types.ts +219 -0
  88. package/src/ingestion/walker.ts +235 -0
  89. package/src/llm/cache.ts +467 -0
  90. package/src/llm/errors.ts +191 -0
  91. package/src/llm/index.ts +58 -0
  92. package/src/llm/nodeLlamaCpp/adapter.ts +133 -0
  93. package/src/llm/nodeLlamaCpp/embedding.ts +165 -0
  94. package/src/llm/nodeLlamaCpp/generation.ts +88 -0
  95. package/src/llm/nodeLlamaCpp/lifecycle.ts +317 -0
  96. package/src/llm/nodeLlamaCpp/rerank.ts +94 -0
  97. package/src/llm/registry.ts +86 -0
  98. package/src/llm/types.ts +129 -0
  99. package/src/mcp/resources/index.ts +151 -0
  100. package/src/mcp/server.ts +229 -0
  101. package/src/mcp/tools/get.ts +220 -0
  102. package/src/mcp/tools/index.ts +160 -0
  103. package/src/mcp/tools/multi-get.ts +263 -0
  104. package/src/mcp/tools/query.ts +226 -0
  105. package/src/mcp/tools/search.ts +119 -0
  106. package/src/mcp/tools/status.ts +81 -0
  107. package/src/mcp/tools/vsearch.ts +198 -0
  108. package/src/pipeline/chunk-lookup.ts +44 -0
  109. package/src/pipeline/expansion.ts +256 -0
  110. package/src/pipeline/explain.ts +115 -0
  111. package/src/pipeline/fusion.ts +185 -0
  112. package/src/pipeline/hybrid.ts +535 -0
  113. package/src/pipeline/index.ts +64 -0
  114. package/src/pipeline/query-language.ts +118 -0
  115. package/src/pipeline/rerank.ts +223 -0
  116. package/src/pipeline/search.ts +261 -0
  117. package/src/pipeline/types.ts +328 -0
  118. package/src/pipeline/vsearch.ts +348 -0
  119. package/src/store/index.ts +41 -0
  120. package/src/store/migrations/001-initial.ts +196 -0
  121. package/src/store/migrations/index.ts +20 -0
  122. package/src/store/migrations/runner.ts +187 -0
  123. package/src/store/sqlite/adapter.ts +1242 -0
  124. package/src/store/sqlite/index.ts +7 -0
  125. package/src/store/sqlite/setup.ts +129 -0
  126. package/src/store/sqlite/types.ts +28 -0
  127. package/src/store/types.ts +506 -0
  128. package/src/store/vector/index.ts +13 -0
  129. package/src/store/vector/sqlite-vec.ts +373 -0
  130. package/src/store/vector/stats.ts +152 -0
  131. package/src/store/vector/types.ts +115 -0
@@ -0,0 +1,118 @@
1
+ /**
2
+ * Query language detection for prompt selection.
3
+ *
4
+ * IMPORTANT: This affects prompt selection and metadata ONLY.
5
+ * It does NOT affect retrieval filtering - that's controlled by CLI --lang flag.
6
+ */
7
+ import { franc } from 'franc';
8
+
/**
 * Shortest trimmed query (measured with `String.prototype.length`) for which
 * detection is attempted. The same value is passed to franc as `minLength`.
 */
const MIN_RELIABLE_LENGTH = 15;

/**
 * Languages the detector is allowed to report.
 *
 * Keys are ISO 639-3 codes (what franc returns); values are the matching
 * two-letter BCP-47 / ISO 639-1 codes exposed to callers.
 *
 * The list is intentionally limited to:
 * - major world languages by speaker count,
 * - languages with significant tech/documentation communities,
 * - languages that are linguistically distinct enough to keep false
 *   positives low.
 */
const LANG_MAP = {
  // Germanic (Western Europe)
  eng: 'en', // English
  deu: 'de', // German
  nld: 'nl', // Dutch

  // Romance
  fra: 'fr', // French
  ita: 'it', // Italian
  spa: 'es', // Spanish
  por: 'pt', // Portuguese
  cat: 'ca', // Catalan
  ron: 'ro', // Romanian

  // Nordic
  swe: 'sv', // Swedish
  dan: 'da', // Danish
  nob: 'nb', // Norwegian Bokmål
  nno: 'nn', // Norwegian Nynorsk
  fin: 'fi', // Finnish

  // Central / Eastern Europe
  pol: 'pl', // Polish
  ces: 'cs', // Czech
  slk: 'sk', // Slovak
  rus: 'ru', // Russian
  ukr: 'uk', // Ukrainian
  bul: 'bg', // Bulgarian
  hrv: 'hr', // Croatian
  ell: 'el', // Greek
  hun: 'hu', // Hungarian

  // Middle East
  tur: 'tr', // Turkish
  ara: 'ar', // Arabic
  heb: 'he', // Hebrew
  fas: 'fa', // Persian/Farsi

  // South Asia
  hin: 'hi', // Hindi

  // Southeast Asia
  vie: 'vi', // Vietnamese
  tha: 'th', // Thai
  ind: 'id', // Indonesian

  // East Asia
  cmn: 'zh', // Mandarin Chinese
  jpn: 'ja', // Japanese
  kor: 'ko', // Korean
} as const;

/** ISO 639-3 codes handed to franc's `only` option (the keys of LANG_MAP). */
const SUPPORTED_LANGUAGES = Object.keys(LANG_MAP);
74
+
/** Outcome of query-language detection. */
export interface LanguageDetection {
  /** Two-letter BCP-47 code such as 'en', 'de' or 'fr'; 'und' when undetermined. */
  bcp47: string;
  /** Three-letter ISO 639-3 code such as 'eng', 'deu' or 'fra'; 'und' when undetermined. */
  iso639_3: string;
  /** `false` when the text was too short or no supported language was identified. */
  confident: boolean;
}
83
+
/**
 * Identify the language a query is written in, for prompt selection.
 *
 * The text is trimmed first. Queries shorter than MIN_RELIABLE_LENGTH are
 * reported as undetermined without running detection. Otherwise franc is
 * asked to choose among SUPPORTED_LANGUAGES only, and its ISO 639-3 answer is
 * translated to BCP-47 through LANG_MAP.
 *
 * @param text - Query text to analyze
 * @returns The detected language, or `{ bcp47: 'und', iso639_3: 'und',
 *   confident: false }` when the text is too short, franc answers 'und', or
 *   the detected code has no LANG_MAP entry
 *
 * @example
 * detectQueryLanguage("wie konfiguriere ich kubernetes")
 * // { bcp47: 'de', iso639_3: 'deu', confident: true }
 *
 * detectQueryLanguage("hello")
 * // { bcp47: 'und', iso639_3: 'und', confident: false } // too short
 */
export function detectQueryLanguage(text: string): LanguageDetection {
  // Fresh object per call so callers never share a mutable result.
  const undetermined = (): LanguageDetection => ({
    bcp47: 'und',
    iso639_3: 'und',
    confident: false,
  });

  const candidate = text.trim();
  if (candidate.length < MIN_RELIABLE_LENGTH) {
    return undetermined();
  }

  const iso = franc(candidate, {
    minLength: MIN_RELIABLE_LENGTH,
    only: SUPPORTED_LANGUAGES,
  });
  if (iso === 'und') {
    return undetermined();
  }

  // Defensive: franc was restricted to LANG_MAP keys, but guard the lookup anyway.
  const mapped = LANG_MAP[iso as keyof typeof LANG_MAP];
  return mapped
    ? { bcp47: mapped, iso639_3: iso, confident: true }
    : undetermined();
}
@@ -0,0 +1,223 @@
1
+ /**
2
+ * Reranking and position-aware blending.
3
+ * Uses RerankPort to reorder candidates.
4
+ *
5
+ * @module src/pipeline/rerank
6
+ */
7
+
8
+ import type { RerankPort } from '../llm/types';
9
+ import type { StorePort } from '../store/types';
10
+ import { createChunkLookup } from './chunk-lookup';
11
+ import type { BlendingTier, FusionCandidate, RerankedCandidate } from './types';
12
+ import { DEFAULT_BLENDING_SCHEDULE } from './types';
13
+
14
+ // ─────────────────────────────────────────────────────────────────────────────
15
+ // Types
16
+ // ─────────────────────────────────────────────────────────────────────────────
17
+
/** Tuning knobs accepted by `rerankCandidates`. */
export interface RerankOptions {
  /**
   * How many leading candidates are sent to the reranker.
   * `rerankCandidates` uses 20 when omitted.
   */
  maxCandidates?: number;
  /**
   * Position-dependent weights used to blend fusion and rerank scores.
   * `rerankCandidates` uses DEFAULT_BLENDING_SCHEDULE when omitted.
   */
  blendingSchedule?: BlendingTier[];
}

/** What `rerankCandidates` resolves to. */
export interface RerankResult {
  /** Candidates annotated with `rerankScore` and `blendedScore`. */
  candidates: RerankedCandidate[];
  /** `true` only when the reranker actually produced the scores. */
  reranked: boolean;
}

/** Collaborators required by `rerankCandidates`. */
export interface RerankDeps {
  /** Reranker to use; `null` means fusion-only scoring. */
  rerankPort: RerankPort | null;
  /** Store used to load the chunk texts that are reranked. */
  store: StorePort;
}
34
+
35
+ // ─────────────────────────────────────────────────────────────────────────────
36
+ // Blending
37
+ // ─────────────────────────────────────────────────────────────────────────────
38
+
/**
 * Look up the fusion/rerank weights that apply at a given rank.
 *
 * The first tier (in array order) whose `maxRank` is at least `position`
 * wins. Positions past every tier reuse the final tier, and an empty
 * schedule yields an even 0.5 / 0.5 split.
 *
 * @param position - Rank of the candidate
 * @param schedule - Blending tiers, checked in array order
 * @returns The weights to apply to the fusion and rerank scores
 */
function getBlendingWeights(
  position: number,
  schedule: BlendingTier[]
): { fusionWeight: number; rerankWeight: number } {
  for (const tier of schedule) {
    if (position <= tier.maxRank) {
      return {
        fusionWeight: tier.fusionWeight,
        rerankWeight: tier.rerankWeight,
      };
    }
  }

  // No tier covers this position: reuse the last tier if there is one.
  const fallback = schedule[schedule.length - 1];
  if (!fallback) {
    return { fusionWeight: 0.5, rerankWeight: 0.5 };
  }
  return {
    fusionWeight: fallback.fusionWeight,
    rerankWeight: fallback.rerankWeight,
  };
}
56
+
/**
 * Combine a fusion score and a rerank score into one value.
 *
 * The result is the weighted sum of both scores, using the weights that
 * `getBlendingWeights` returns for `position`.
 *
 * @param fusionScore - Fusion score of the candidate
 * @param rerankScore - Rerank score of the candidate
 * @param position - Rank of the candidate, used to pick the weights
 * @param schedule - Blending tiers to pick the weights from
 * @returns `fusionWeight * fusionScore + rerankWeight * rerankScore`
 */
function blend(
  fusionScore: number,
  rerankScore: number,
  position: number,
  schedule: BlendingTier[]
): number {
  const weights = getBlendingWeights(position, schedule);
  const fusionPart = weights.fusionWeight * fusionScore;
  const rerankPart = weights.rerankWeight * rerankScore;
  return fusionPart + rerankPart;
}
69
+
70
+ // ─────────────────────────────────────────────────────────────────────────────
71
+ // Rerank Implementation
72
+ // ─────────────────────────────────────────────────────────────────────────────
73
+
/**
 * Rerank fused candidates with the configured reranker and blend the result
 * with their fusion scores.
 *
 * How the scores are produced:
 * 1. Fusion scores are min-max normalized to [0, 1] across ALL candidates
 *    (all-equal scores normalize to 1), so `blendedScore` stays in [0, 1]
 *    whether or not a reranker is available.
 * 2. Only the first `maxCandidates` (default 20) are reranked; their chunk
 *    texts are loaded with a single batched store call.
 * 3. Rerank scores are clamped to [0, 1] and blended with the normalized
 *    fusion score using the weights for the candidate's 1-based position.
 *    A candidate with no usable rerank score (missing or NaN) keeps its
 *    normalized fusion score.
 * 4. Candidates beyond `maxCandidates` get half their normalized fusion score.
 * 5. The combined list is sorted by `blendedScore` descending, ties broken
 *    by the `mirrorHash:seq` key.
 *
 * Falls back to fusion-only scoring (`reranked: false`, `rerankScore: null`,
 * input order kept) when no reranker is configured, the chunk batch fetch
 * fails, or the reranker reports an error.
 *
 * @param deps - Reranker (may be `null`) and store
 * @param query - Query text passed to the reranker
 * @param candidates - Fusion candidates, in fusion order
 * @param options - Optional `maxCandidates` and `blendingSchedule` overrides
 * @returns Scored candidates plus a flag telling whether reranking happened
 */
export async function rerankCandidates(
  deps: RerankDeps,
  query: string,
  candidates: FusionCandidate[],
  options: RerankOptions = {}
): Promise<RerankResult> {
  // Nothing to score.
  if (candidates.length === 0) {
    return { candidates: [], reranked: false };
  }

  const { rerankPort, store } = deps;
  const maxCandidates = options.maxCandidates ?? 20;
  const schedule = options.blendingSchedule ?? DEFAULT_BLENDING_SCHEDULE;

  // Running min/max instead of Math.min(...scores): spreading a very large
  // candidate list as call arguments can throw a RangeError.
  let minFusionAll = Number.POSITIVE_INFINITY;
  let maxFusionAll = Number.NEGATIVE_INFINITY;
  for (const c of candidates) {
    minFusionAll = Math.min(minFusionAll, c.fusionScore);
    maxFusionAll = Math.max(maxFusionAll, c.fusionScore);
  }
  const fusionRangeAll = maxFusionAll - minFusionAll;

  /** Min-max normalize a fusion score against the whole candidate set. */
  const normalizeFusionScore = (score: number): number => {
    if (fusionRangeAll < 1e-9) {
      return 1; // tie for best
    }
    const v = (score - minFusionAll) / fusionRangeAll;
    return Math.max(0, Math.min(1, v));
  };

  /** Graceful degradation: every candidate scored by normalized fusion only. */
  const fusionOnly = (): RerankResult => ({
    candidates: candidates.map((c) => ({
      ...c,
      rerankScore: null,
      blendedScore: normalizeFusionScore(c.fusionScore),
    })),
    reranked: false,
  });

  if (!rerankPort) {
    return fusionOnly();
  }

  // Only the head of the list is reranked; the tail is scored separately below.
  const toRerank = candidates.slice(0, maxCandidates);
  const remaining = candidates.slice(maxCandidates);

  // Pre-fetch all chunks in one batch query (eliminates N+1).
  const uniqueHashes = [...new Set(toRerank.map((c) => c.mirrorHash))];
  const chunksMapResult = await store.getChunksBatch(uniqueHashes);
  if (!chunksMapResult.ok) {
    return fusionOnly();
  }
  const getChunk = createChunkLookup(chunksMapResult.value);

  // texts[i] belongs to toRerank[i]; a missing chunk becomes an empty string.
  const texts: string[] = toRerank.map(
    (c) => getChunk(c.mirrorHash, c.seq)?.text ?? ''
  );

  const rerankResult = await rerankPort.rerank(query, texts);
  if (!rerankResult.ok) {
    return fusionOnly();
  }

  // index -> score map so each candidate is resolved with one lookup.
  const scoreByIndex = new Map(
    rerankResult.value.map((s) => [s.index, s.score])
  );

  const rerankedCandidates: RerankedCandidate[] = toRerank.map((c, i) => {
    const rawScore = scoreByIndex.get(i);

    // Clamp to [0, 1]. A missing or NaN score counts as "no rerank score" so
    // it cannot leak NaN into blendedScore and destabilize the sort below.
    const normalizedRerankScore =
      rawScore == null || Number.isNaN(rawScore)
        ? null
        : Math.max(0, Math.min(1, rawScore));

    const position = i + 1;
    const normalizedFusion = normalizeFusionScore(c.fusionScore);
    const blendedScore =
      normalizedRerankScore !== null
        ? blend(normalizedFusion, normalizedRerankScore, position, schedule)
        : normalizedFusion;

    return {
      ...c,
      rerankScore: normalizedRerankScore,
      blendedScore,
    };
  });

  // Candidates that were not reranked: normalized fusion score with a 0.5x
  // penalty, clamped to [0, 1].
  const allCandidates: RerankedCandidate[] = [
    ...rerankedCandidates,
    ...remaining.map((c) => {
      const base = normalizeFusionScore(c.fusionScore);
      return {
        ...c,
        rerankScore: null,
        blendedScore: Math.max(0, Math.min(1, base * 0.5)),
      };
    }),
  ];

  // Best blended score first; deterministic tie-breaking on mirrorHash:seq.
  allCandidates.sort((a, b) => {
    const scoreDiff = b.blendedScore - a.blendedScore;
    if (Math.abs(scoreDiff) > 1e-9) {
      return scoreDiff;
    }
    const aKey = `${a.mirrorHash}:${a.seq}`;
    const bKey = `${b.mirrorHash}:${b.seq}`;
    return aKey.localeCompare(bKey);
  });

  return {
    candidates: allCandidates,
    reranked: true,
  };
}
@@ -0,0 +1,261 @@
1
+ /**
2
+ * BM25 search pipeline.
3
+ * Wraps StorePort.searchFts() to produce SearchResults.
4
+ *
5
+ * @module src/pipeline/search
6
+ */
7
+
8
+ import { join as pathJoin } from 'node:path'; // No Bun path utils equivalent
9
+ import type { ChunkRow, FtsResult, StorePort } from '../store/types';
10
+ import { err, ok } from '../store/types';
11
+ import { createChunkLookup } from './chunk-lookup';
12
+ import { detectQueryLanguage } from './query-language';
13
+ import type {
14
+ SearchOptions,
15
+ SearchResult,
16
+ SearchResultSource,
17
+ SearchResults,
18
+ } from './types';
19
+
20
+ // ─────────────────────────────────────────────────────────────────────────────
21
+ // Score Normalization
22
+ // ─────────────────────────────────────────────────────────────────────────────
23
+
/**
 * Rescale raw BM25 scores to the 0-1 range, in place, with min-max scaling.
 *
 * FTS5 bm25() returns negative scores where a smaller (more negative) value
 * is a better match. After this call 1 marks the best match in the result
 * set and 0 the worst; when every score is equal, all results get 1.
 *
 * @param results - Results whose `score` fields are overwritten
 */
function normalizeBm25Scores(results: SearchResult[]): void {
  if (results.length === 0) {
    return;
  }

  const raw = results.map((hit) => hit.score);
  const worst = Math.max(...raw); // least negative
  const best = Math.min(...raw); // most negative
  const span = worst - best;

  for (const hit of results) {
    // Equal scores would divide by zero, so they all count as best.
    // The clamp guards against floating point drift outside [0, 1].
    hit.score =
      span === 0 ? 1 : Math.max(0, Math.min(1, (worst - hit.score) / span));
  }
}
53
+
54
+ // ─────────────────────────────────────────────────────────────────────────────
55
+ // Result Building
56
+ // ─────────────────────────────────────────────────────────────────────────────
57
+
/** Everything `buildSearchResult` needs to assemble one SearchResult. */
interface BuildResultContext {
  /** The FTS hit being converted. */
  fts: FtsResult;
  /** Chunk matching the hit, or `null` when none was found. */
  chunk: ChunkRow | null;
  /** Root path of the hit's collection; enables `source.absPath`. */
  collectionPath?: string;
  /** Search options; `full` and `lineNumbers` affect the snippet. */
  options?: SearchOptions;
  /** Whole document content, used as the snippet when `options.full` is set. */
  fullContent?: string;
}
65
+
/**
 * Assemble a SearchResult from an FTS hit and its related data.
 *
 * Snippet selection:
 * - `options.full` with non-empty `fullContent`: the full content, no range;
 * - `options.lineNumbers` with a chunk: the raw chunk text (not the FTS
 *   snippet with markers) and the chunk's line range;
 * - otherwise: the FTS snippet, else the chunk text, else '', with the
 *   chunk's line range when a chunk exists.
 *
 * The returned `score` is the raw FTS score; it is normalized later as a
 * batch.
 *
 * @param ctx - FTS hit, optional chunk, collection path, options and content
 * @returns The structured search result
 */
function buildSearchResult(ctx: BuildResultContext): SearchResult {
  const { fts, chunk, collectionPath, options, fullContent } = ctx;

  // Line range of the chunk, shared by both chunk-based snippet modes.
  const chunkRange = chunk
    ? { startLine: chunk.startLine, endLine: chunk.endLine }
    : undefined;

  let snippet: string;
  let snippetRange: { startLine: number; endLine: number } | undefined;
  if (options?.full && fullContent) {
    snippet = fullContent;
    snippetRange = undefined;
  } else if (options?.lineNumbers && chunk) {
    snippet = chunk.text;
    snippetRange = chunkRange;
  } else {
    snippet = fts.snippet ?? chunk?.text ?? '';
    snippetRange = chunkRange;
  }

  const source: SearchResultSource = {
    relPath: fts.relPath ?? '',
    // Real source metadata when present, markdown defaults otherwise.
    mime: fts.sourceMime ?? 'text/markdown',
    ext: fts.sourceExt ?? '.md',
    modifiedAt: fts.sourceMtime,
    sizeBytes: fts.sourceSize,
    sourceHash: fts.sourceHash,
    // absPath only exists when both halves of the path are known.
    ...(collectionPath && fts.relPath
      ? { absPath: pathJoin(collectionPath, fts.relPath) }
      : {}),
  };

  return {
    docid: fts.docid ?? '',
    score: fts.score,
    uri: fts.uri ?? '',
    title: fts.title,
    snippet,
    snippetLanguage: chunk?.language ?? undefined,
    snippetRange,
    source,
    conversion: fts.mirrorHash ? { mirrorHash: fts.mirrorHash } : undefined,
  };
}
116
+
117
+ // ─────────────────────────────────────────────────────────────────────────────
118
+ // Search Function
119
+ // ─────────────────────────────────────────────────────────────────────────────
120
+
/**
 * Execute a BM25 full-text search and return structured results.
 *
 * Steps:
 * 1. Detect the query language for metadata only (`options.lang` wins when
 *    given); detection never filters retrieval.
 * 2. Run `store.searchFts`, asking for FTS snippets unless `full` or
 *    `lineNumbers` is set (raw text is used in those modes).
 * 3. Drop duplicate rows by uri+seq, then build one result per hit, or, with
 *    `full`, one result per document (best raw score) carrying the full
 *    content.
 * 4. Min-max normalize scores to 0-1 and apply `minScore` afterwards.
 *
 * @param store - Store used for the FTS query, collections, chunks, content
 * @param query - Search query text
 * @param options - Search options (`limit` defaults to 20, `minScore` to 0)
 * @returns `ok` with results and metadata, or `err` with code
 *   'INVALID_INPUT' when the store reports INVALID_INPUT, else 'QUERY_FAILED'
 */
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: BM25 search with pagination, filtering, and explain output
export async function searchBm25(
  store: StorePort,
  query: string,
  options: SearchOptions = {}
): Promise<
  ReturnType<typeof ok<SearchResults>> | ReturnType<typeof err<SearchResults>>
> {
  const limit = options.limit ?? 20;
  const minScore = options.minScore ?? 0;

  // Detect query language for metadata (DOES NOT affect retrieval filtering)
  const detection = detectQueryLanguage(query);
  const queryLanguage = options.lang ?? detection.bcp47;

  // Run FTS search.
  // FTS snippets are disabled for --full / --line-numbers (raw text is used).
  const ftsResult = await store.searchFts(query, {
    limit,
    collection: options.collection,
    language: options.lang,
    snippet: !(options.full || options.lineNumbers),
  });

  if (!ftsResult.ok) {
    // Adapter returns INVALID_INPUT for FTS syntax errors, pass through
    const { code, message, cause } = ftsResult.error;
    if (code === 'INVALID_INPUT') {
      return err('INVALID_INPUT', `Invalid search query: ${message}`);
    }
    return err('QUERY_FAILED', message, cause);
  }

  // Collection name -> root path, for absPath resolution. A failed lookup
  // only means results carry no absPath.
  const collectionsResult = await store.getCollections();
  const collectionPaths = new Map<string, string>();
  if (collectionsResult.ok) {
    for (const c of collectionsResult.value) {
      collectionPaths.set(c.name, c.path);
    }
  }

  const results: SearchResult[] = [];

  // Pre-fetch all chunks in one batch query (eliminates N+1)
  const uniqueHashes = [
    ...new Set(
      ftsResult.value.map((f) => f.mirrorHash).filter((h): h is string => !!h)
    ),
  ];
  const chunksMapResult = await store.getChunksBatch(uniqueHashes);
  const getChunk = chunksMapResult.ok
    ? createChunkLookup(chunksMapResult.value)
    : () => undefined;

  // Dedup: multiple docs can share mirror_hash (content-addressed storage),
  // so the join can fan out into duplicate rows. Track seen uri+seq keys.
  const seenUriSeq = new Set<string>();
  // For --full, best-scoring hit per document.
  const bestByDocid = new Map<
    string,
    { fts: FtsResult; chunk: ChunkRow | null; score: number }
  >();

  for (const fts of ftsResult.value) {
    // Robust key: uri if present, else mirrorHash+seq+relPath, to avoid
    // over-dedup if uri is unexpectedly missing.
    const uriSeqKey = fts.uri
      ? `${fts.uri}:${fts.seq}`
      : `${fts.mirrorHash ?? ''}:${fts.seq}:${fts.relPath ?? ''}`;
    if (seenUriSeq.has(uriSeqKey)) {
      continue;
    }
    seenUriSeq.add(uriSeqKey);

    // Get chunk via O(1) lookup
    const chunk = fts.mirrorHash
      ? (getChunk(fts.mirrorHash, fts.seq) ?? null)
      : null;

    // For --full, de-dupe by document (keep best scoring chunk per doc).
    // Raw BM25: smaller (more negative) is better.
    if (options.full) {
      // Hits without a docid fall back to a per-document key (uri, else
      // mirrorHash+relPath) so unrelated documents are not merged under one
      // empty key. Prefixes keep the two key kinds from colliding.
      const docFallback = fts.uri
        ? fts.uri
        : `${fts.mirrorHash ?? ''}:${fts.relPath ?? ''}`;
      const docKey = fts.docid
        ? `docid:${fts.docid}`
        : `nodocid:${docFallback}`;
      const existing = bestByDocid.get(docKey);
      if (!existing || fts.score < existing.score) {
        bestByDocid.set(docKey, { fts, chunk, score: fts.score });
      }
      continue;
    }

    const collectionPath = fts.collection
      ? collectionPaths.get(fts.collection)
      : undefined;

    results.push(buildSearchResult({ fts, chunk, collectionPath, options }));
  }

  // For --full, fetch full content and build one result per document.
  if (options.full) {
    for (const { fts, chunk } of bestByDocid.values()) {
      let fullContent: string | undefined;
      if (fts.mirrorHash) {
        const contentResult = await store.getContent(fts.mirrorHash);
        if (contentResult.ok && contentResult.value) {
          fullContent = contentResult.value;
        }
      }
      const collectionPath = fts.collection
        ? collectionPaths.get(fts.collection)
        : undefined;
      results.push(
        buildSearchResult({ fts, chunk, collectionPath, options, fullContent })
      );
    }
  }

  // Normalize scores to 0-1 range (batch min-max)
  normalizeBm25Scores(results);

  // Apply minScore filter after normalization
  const filteredResults =
    minScore > 0 ? results.filter((r) => r.score >= minScore) : results;

  return ok({
    results: filteredResults,
    meta: {
      query,
      mode: 'bm25',
      totalResults: filteredResults.length,
      collection: options.collection,
      lang: options.lang,
      queryLanguage,
    },
  });
}