@gmickel/gno 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +256 -0
- package/assets/skill/SKILL.md +112 -0
- package/assets/skill/cli-reference.md +327 -0
- package/assets/skill/examples.md +234 -0
- package/assets/skill/mcp-reference.md +159 -0
- package/package.json +90 -0
- package/src/app/constants.ts +313 -0
- package/src/cli/colors.ts +65 -0
- package/src/cli/commands/ask.ts +545 -0
- package/src/cli/commands/cleanup.ts +105 -0
- package/src/cli/commands/collection/add.ts +120 -0
- package/src/cli/commands/collection/index.ts +10 -0
- package/src/cli/commands/collection/list.ts +108 -0
- package/src/cli/commands/collection/remove.ts +64 -0
- package/src/cli/commands/collection/rename.ts +95 -0
- package/src/cli/commands/context/add.ts +67 -0
- package/src/cli/commands/context/check.ts +153 -0
- package/src/cli/commands/context/index.ts +10 -0
- package/src/cli/commands/context/list.ts +109 -0
- package/src/cli/commands/context/rm.ts +52 -0
- package/src/cli/commands/doctor.ts +393 -0
- package/src/cli/commands/embed.ts +462 -0
- package/src/cli/commands/get.ts +356 -0
- package/src/cli/commands/index-cmd.ts +119 -0
- package/src/cli/commands/index.ts +102 -0
- package/src/cli/commands/init.ts +328 -0
- package/src/cli/commands/ls.ts +217 -0
- package/src/cli/commands/mcp/config.ts +300 -0
- package/src/cli/commands/mcp/index.ts +24 -0
- package/src/cli/commands/mcp/install.ts +203 -0
- package/src/cli/commands/mcp/paths.ts +470 -0
- package/src/cli/commands/mcp/status.ts +222 -0
- package/src/cli/commands/mcp/uninstall.ts +158 -0
- package/src/cli/commands/mcp.ts +20 -0
- package/src/cli/commands/models/clear.ts +103 -0
- package/src/cli/commands/models/index.ts +32 -0
- package/src/cli/commands/models/list.ts +214 -0
- package/src/cli/commands/models/path.ts +51 -0
- package/src/cli/commands/models/pull.ts +199 -0
- package/src/cli/commands/models/use.ts +85 -0
- package/src/cli/commands/multi-get.ts +400 -0
- package/src/cli/commands/query.ts +220 -0
- package/src/cli/commands/ref-parser.ts +108 -0
- package/src/cli/commands/reset.ts +191 -0
- package/src/cli/commands/search.ts +136 -0
- package/src/cli/commands/shared.ts +156 -0
- package/src/cli/commands/skill/index.ts +19 -0
- package/src/cli/commands/skill/install.ts +197 -0
- package/src/cli/commands/skill/paths-cmd.ts +81 -0
- package/src/cli/commands/skill/paths.ts +191 -0
- package/src/cli/commands/skill/show.ts +73 -0
- package/src/cli/commands/skill/uninstall.ts +141 -0
- package/src/cli/commands/status.ts +205 -0
- package/src/cli/commands/update.ts +68 -0
- package/src/cli/commands/vsearch.ts +188 -0
- package/src/cli/context.ts +64 -0
- package/src/cli/errors.ts +64 -0
- package/src/cli/format/search-results.ts +211 -0
- package/src/cli/options.ts +183 -0
- package/src/cli/program.ts +1330 -0
- package/src/cli/run.ts +213 -0
- package/src/cli/ui.ts +92 -0
- package/src/config/defaults.ts +20 -0
- package/src/config/index.ts +55 -0
- package/src/config/loader.ts +161 -0
- package/src/config/paths.ts +87 -0
- package/src/config/saver.ts +153 -0
- package/src/config/types.ts +280 -0
- package/src/converters/adapters/markitdownTs/adapter.ts +140 -0
- package/src/converters/adapters/officeparser/adapter.ts +126 -0
- package/src/converters/canonicalize.ts +89 -0
- package/src/converters/errors.ts +218 -0
- package/src/converters/index.ts +51 -0
- package/src/converters/mime.ts +163 -0
- package/src/converters/native/markdown.ts +115 -0
- package/src/converters/native/plaintext.ts +56 -0
- package/src/converters/path.ts +48 -0
- package/src/converters/pipeline.ts +159 -0
- package/src/converters/registry.ts +74 -0
- package/src/converters/types.ts +123 -0
- package/src/converters/versions.ts +24 -0
- package/src/index.ts +27 -0
- package/src/ingestion/chunker.ts +238 -0
- package/src/ingestion/index.ts +32 -0
- package/src/ingestion/language.ts +276 -0
- package/src/ingestion/sync.ts +671 -0
- package/src/ingestion/types.ts +219 -0
- package/src/ingestion/walker.ts +235 -0
- package/src/llm/cache.ts +467 -0
- package/src/llm/errors.ts +191 -0
- package/src/llm/index.ts +58 -0
- package/src/llm/nodeLlamaCpp/adapter.ts +133 -0
- package/src/llm/nodeLlamaCpp/embedding.ts +165 -0
- package/src/llm/nodeLlamaCpp/generation.ts +88 -0
- package/src/llm/nodeLlamaCpp/lifecycle.ts +317 -0
- package/src/llm/nodeLlamaCpp/rerank.ts +94 -0
- package/src/llm/registry.ts +86 -0
- package/src/llm/types.ts +129 -0
- package/src/mcp/resources/index.ts +151 -0
- package/src/mcp/server.ts +229 -0
- package/src/mcp/tools/get.ts +220 -0
- package/src/mcp/tools/index.ts +160 -0
- package/src/mcp/tools/multi-get.ts +263 -0
- package/src/mcp/tools/query.ts +226 -0
- package/src/mcp/tools/search.ts +119 -0
- package/src/mcp/tools/status.ts +81 -0
- package/src/mcp/tools/vsearch.ts +198 -0
- package/src/pipeline/chunk-lookup.ts +44 -0
- package/src/pipeline/expansion.ts +256 -0
- package/src/pipeline/explain.ts +115 -0
- package/src/pipeline/fusion.ts +185 -0
- package/src/pipeline/hybrid.ts +535 -0
- package/src/pipeline/index.ts +64 -0
- package/src/pipeline/query-language.ts +118 -0
- package/src/pipeline/rerank.ts +223 -0
- package/src/pipeline/search.ts +261 -0
- package/src/pipeline/types.ts +328 -0
- package/src/pipeline/vsearch.ts +348 -0
- package/src/store/index.ts +41 -0
- package/src/store/migrations/001-initial.ts +196 -0
- package/src/store/migrations/index.ts +20 -0
- package/src/store/migrations/runner.ts +187 -0
- package/src/store/sqlite/adapter.ts +1242 -0
- package/src/store/sqlite/index.ts +7 -0
- package/src/store/sqlite/setup.ts +129 -0
- package/src/store/sqlite/types.ts +28 -0
- package/src/store/types.ts +506 -0
- package/src/store/vector/index.ts +13 -0
- package/src/store/vector/sqlite-vec.ts +373 -0
- package/src/store/vector/stats.ts +152 -0
- package/src/store/vector/types.ts +115 -0
package/src/pipeline/query-language.ts
@@ -0,0 +1,118 @@
+/**
+ * Query language detection for prompt selection.
+ *
+ * IMPORTANT: This affects prompt selection and metadata ONLY.
+ * It does NOT affect retrieval filtering - that's controlled by CLI --lang flag.
+ */
+import { franc } from 'franc';
+
+const MIN_RELIABLE_LENGTH = 15;
+
+/**
+ * Supported languages for detection.
+ * Maps ISO 639-3 codes to BCP-47 (ISO 639-1) codes.
+ *
+ * Selection criteria:
+ * - Major world languages by speaker count
+ * - Significant tech/documentation communities
+ * - Linguistically distinct (to minimize false positives)
+ */
+const LANG_MAP = {
+  // Western European (Germanic)
+  eng: 'en', // English
+  deu: 'de', // German
+  nld: 'nl', // Dutch
+
+  // Western European (Romance)
+  fra: 'fr', // French
+  ita: 'it', // Italian
+  spa: 'es', // Spanish
+  por: 'pt', // Portuguese
+  cat: 'ca', // Catalan
+  ron: 'ro', // Romanian
+
+  // Scandinavian
+  swe: 'sv', // Swedish
+  dan: 'da', // Danish
+  nob: 'nb', // Norwegian Bokmål
+  nno: 'nn', // Norwegian Nynorsk
+  fin: 'fi', // Finnish
+
+  // Eastern European
+  pol: 'pl', // Polish
+  ces: 'cs', // Czech
+  slk: 'sk', // Slovak
+  rus: 'ru', // Russian
+  ukr: 'uk', // Ukrainian
+  bul: 'bg', // Bulgarian
+  hrv: 'hr', // Croatian
+  ell: 'el', // Greek
+  hun: 'hu', // Hungarian
+
+  // Middle Eastern
+  tur: 'tr', // Turkish
+  ara: 'ar', // Arabic
+  heb: 'he', // Hebrew
+  fas: 'fa', // Persian/Farsi
+
+  // South Asian
+  hin: 'hi', // Hindi
+
+  // Southeast Asian
+  vie: 'vi', // Vietnamese
+  tha: 'th', // Thai
+  ind: 'id', // Indonesian
+
+  // East Asian
+  cmn: 'zh', // Mandarin Chinese
+  jpn: 'ja', // Japanese
+  kor: 'ko', // Korean
+} as const;
+
+/** ISO 639-3 codes for franc's only filter */
+const SUPPORTED_LANGUAGES = Object.keys(LANG_MAP);
+
+export interface LanguageDetection {
+  /** BCP-47 code: 'en', 'de', 'fr', etc. 'und' if undetermined */
+  bcp47: string;
+  /** ISO 639-3 code: 'eng', 'deu', 'fra', etc. 'und' if undetermined */
+  iso639_3: string;
+  /** false if text too short or language undetermined */
+  confident: boolean;
+}
+
+/**
+ * Detect the language of query text for prompt selection.
+ *
+ * @param text - Query text to analyze
+ * @returns Language detection result with BCP-47 code and confidence
+ *
+ * @example
+ * detectQueryLanguage("wie konfiguriere ich kubernetes")
+ * // { bcp47: 'de', iso639_3: 'deu', confident: true }
+ *
+ * detectQueryLanguage("hello")
+ * // { bcp47: 'und', iso639_3: 'und', confident: false } // too short
+ */
+export function detectQueryLanguage(text: string): LanguageDetection {
+  const trimmed = text.trim();
+
+  if (trimmed.length < MIN_RELIABLE_LENGTH) {
+    return { bcp47: 'und', iso639_3: 'und', confident: false };
+  }
+
+  const detected = franc(trimmed, {
+    minLength: MIN_RELIABLE_LENGTH,
+    only: SUPPORTED_LANGUAGES,
+  });
+
+  if (detected === 'und') {
+    return { bcp47: 'und', iso639_3: 'und', confident: false };
+  }
+
+  const bcp47 = LANG_MAP[detected as keyof typeof LANG_MAP];
+  if (!bcp47) {
+    return { bcp47: 'und', iso639_3: 'und', confident: false };
+  }
+  return { bcp47, iso639_3: detected, confident: true };
+}
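For orientation, a minimal usage sketch of detectQueryLanguage (not part of the package diff). The relative import path and the fallback to English prompts for unconfident detections are illustrative assumptions; the German example query and its result come from the function's own @example comment.

import { detectQueryLanguage } from './query-language';

const detection = detectQueryLanguage('wie konfiguriere ich kubernetes');
// detection: { bcp47: 'de', iso639_3: 'deu', confident: true }

// Assumption: a caller might fall back to English prompts when detection is
// not confident (text too short or language undetermined).
const promptLang = detection.confident ? detection.bcp47 : 'en';
console.log(promptLang); // 'de'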
package/src/pipeline/rerank.ts
@@ -0,0 +1,223 @@
+/**
+ * Reranking and position-aware blending.
+ * Uses RerankPort to reorder candidates.
+ *
+ * @module src/pipeline/rerank
+ */
+
+import type { RerankPort } from '../llm/types';
+import type { StorePort } from '../store/types';
+import { createChunkLookup } from './chunk-lookup';
+import type { BlendingTier, FusionCandidate, RerankedCandidate } from './types';
+import { DEFAULT_BLENDING_SCHEDULE } from './types';
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Types
+// ─────────────────────────────────────────────────────────────────────────────
+
+export interface RerankOptions {
+  /** Max candidates to rerank */
+  maxCandidates?: number;
+  /** Blending schedule */
+  blendingSchedule?: BlendingTier[];
+}
+
+export interface RerankResult {
+  candidates: RerankedCandidate[];
+  reranked: boolean;
+}
+
+export interface RerankDeps {
+  rerankPort: RerankPort | null;
+  store: StorePort;
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Blending
+// ─────────────────────────────────────────────────────────────────────────────
+
+/**
+ * Get blending weights for a position.
+ */
+function getBlendingWeights(
+  position: number,
+  schedule: BlendingTier[]
+): { fusionWeight: number; rerankWeight: number } {
+  const tier = schedule.find((t) => position <= t.maxRank);
+  if (tier) {
+    return { fusionWeight: tier.fusionWeight, rerankWeight: tier.rerankWeight };
+  }
+  // Fallback to last tier
+  const last = schedule.at(-1);
+  return last
+    ? { fusionWeight: last.fusionWeight, rerankWeight: last.rerankWeight }
+    : { fusionWeight: 0.5, rerankWeight: 0.5 };
+}
+
+/**
+ * Blend fusion and rerank scores.
+ */
+function blend(
+  fusionScore: number,
+  rerankScore: number,
+  position: number,
+  schedule: BlendingTier[]
+): number {
+  const { fusionWeight, rerankWeight } = getBlendingWeights(position, schedule);
+  return fusionWeight * fusionScore + rerankWeight * rerankScore;
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Rerank Implementation
+// ─────────────────────────────────────────────────────────────────────────────
+
+/**
+ * Rerank candidates using cross-encoder.
+ * Falls back to fusion-only if reranking fails.
+ */
+export async function rerankCandidates(
+  deps: RerankDeps,
+  query: string,
+  candidates: FusionCandidate[],
+  options: RerankOptions = {}
+): Promise<RerankResult> {
+  // Early return for empty candidates
+  if (candidates.length === 0) {
+    return { candidates: [], reranked: false };
+  }
+
+  const { rerankPort, store } = deps;
+  const maxCandidates = options.maxCandidates ?? 20;
+  const schedule = options.blendingSchedule ?? DEFAULT_BLENDING_SCHEDULE;
+
+  // Normalize fusion scores to 0-1 range across ALL candidates for stability.
+  // This ensures blendedScore is always in [0,1] regardless of reranker availability.
+  const fusionScoresAll = candidates.map((c) => c.fusionScore);
+  const minFusionAll = Math.min(...fusionScoresAll);
+  const maxFusionAll = Math.max(...fusionScoresAll);
+  const fusionRangeAll = maxFusionAll - minFusionAll;
+
+  function normalizeFusionScore(score: number): number {
+    if (fusionRangeAll < 1e-9) {
+      return 1; // tie for best
+    }
+    const v = (score - minFusionAll) / fusionRangeAll;
+    return Math.max(0, Math.min(1, v));
+  }
+
+  // If no reranker, return candidates with normalized fusion scores
+  if (!rerankPort) {
+    return {
+      candidates: candidates.map((c) => ({
+        ...c,
+        rerankScore: null,
+        blendedScore: normalizeFusionScore(c.fusionScore),
+      })),
+      reranked: false,
+    };
+  }
+
+  // Limit candidates for reranking
+  const toRerank = candidates.slice(0, maxCandidates);
+  const remaining = candidates.slice(maxCandidates);
+
+  // Pre-fetch all chunks in one batch query (eliminates N+1)
+  const uniqueHashes = [...new Set(toRerank.map((c) => c.mirrorHash))];
+  const chunksMapResult = await store.getChunksBatch(uniqueHashes);
+
+  // If chunk fetch fails, degrade gracefully (fusion-only)
+  // Don't rerank on empty/missing texts - produces non-deterministic results
+  if (!chunksMapResult.ok) {
+    return {
+      candidates: candidates.map((c) => ({
+        ...c,
+        rerankScore: null,
+        blendedScore: normalizeFusionScore(c.fusionScore),
+      })),
+      reranked: false,
+    };
+  }
+  const chunksMap = chunksMapResult.value;
+  const getChunk = createChunkLookup(chunksMap);
+
+  // Build texts array for reranking (O(1) lookup per candidate)
+  const texts: string[] = toRerank.map((c) => {
+    const chunk = getChunk(c.mirrorHash, c.seq);
+    return chunk?.text ?? '';
+  });
+
+  // Run reranking
+  const rerankResult = await rerankPort.rerank(query, texts);
+
+  if (!rerankResult.ok) {
+    // Graceful degradation - return normalized fusion scores
+    return {
+      candidates: candidates.map((c) => ({
+        ...c,
+        rerankScore: null,
+        blendedScore: normalizeFusionScore(c.fusionScore),
+      })),
+      reranked: false,
+    };
+  }
+
+  // Map rerank scores to candidates
+  // Note: We use normalizeFusionScore defined above (across ALL candidates)
+  // Build index->score map for O(1) lookup instead of O(n) find per candidate
+  const scoreByIndex = new Map(
+    rerankResult.value.map((s) => [s.index, s.score])
+  );
+  const rerankedCandidates: RerankedCandidate[] = toRerank.map((c, i) => {
+    const rerankScore = scoreByIndex.get(i) ?? null;
+
+    // Normalize rerank score to 0-1 range (models may return different scales)
+    const normalizedRerankScore =
+      rerankScore !== null ? Math.max(0, Math.min(1, rerankScore)) : null;
+
+    // Calculate blended score using normalized fusion score
+    const position = i + 1;
+    const normalizedFusion = normalizeFusionScore(c.fusionScore);
+    const blendedScore =
+      normalizedRerankScore !== null
+        ? blend(normalizedFusion, normalizedRerankScore, position, schedule)
+        : normalizedFusion;
+
+    return {
+      ...c,
+      rerankScore: normalizedRerankScore,
+      blendedScore,
+    };
+  });
+
+  // Add remaining candidates (not reranked)
+  // These get normalized fusion scores with penalty but clamped to [0,1]
+  const allCandidates: RerankedCandidate[] = [
+    ...rerankedCandidates,
+    ...remaining.map((c) => {
+      const base = normalizeFusionScore(c.fusionScore);
+      return {
+        ...c,
+        rerankScore: null,
+        // Apply 0.5x penalty and clamp to [0,1]
+        blendedScore: Math.max(0, Math.min(1, base * 0.5)),
+      };
+    }),
+  ];
+
+  // Sort by blended score
+  allCandidates.sort((a, b) => {
+    const scoreDiff = b.blendedScore - a.blendedScore;
+    if (Math.abs(scoreDiff) > 1e-9) {
+      return scoreDiff;
+    }
+    // Deterministic tie-breaking
+    const aKey = `${a.mirrorHash}:${a.seq}`;
+    const bKey = `${b.mirrorHash}:${b.seq}`;
+    return aKey.localeCompare(bKey);
+  });
+
+  return {
+    candidates: allCandidates,
+    reranked: true,
+  };
+}
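To make the position-aware blending concrete, here is a standalone sketch (not part of the diff). The BlendingTier shape matches its use in getBlendingWeights above, but the tier values are hypothetical; the real defaults live in DEFAULT_BLENDING_SCHEDULE in src/pipeline/types.ts, which is not shown in this hunk.

// Hypothetical schedule for illustration only - not the package defaults.
interface BlendingTier {
  maxRank: number;
  fusionWeight: number;
  rerankWeight: number;
}

const schedule: BlendingTier[] = [
  { maxRank: 3, fusionWeight: 0.7, rerankWeight: 0.3 }, // top ranks lean on fusion
  { maxRank: 10, fusionWeight: 0.5, rerankWeight: 0.5 },
  { maxRank: 100, fusionWeight: 0.3, rerankWeight: 0.7 }, // deeper ranks lean on the reranker
];

// Same weighted-sum blend as blend() above, applied at two positions.
function blendAt(position: number, fusion: number, rerank: number): number {
  const tier = schedule.find((t) => position <= t.maxRank) ?? schedule[schedule.length - 1];
  return tier.fusionWeight * fusion + tier.rerankWeight * rerank;
}

console.log(blendAt(1, 0.9, 0.4));  // ≈ 0.75: fusion dominates at rank 1
console.log(blendAt(20, 0.9, 0.4)); // ≈ 0.55: the reranker dominates deeper in the list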
package/src/pipeline/search.ts
@@ -0,0 +1,261 @@
+/**
+ * BM25 search pipeline.
+ * Wraps StorePort.searchFts() to produce SearchResults.
+ *
+ * @module src/pipeline/search
+ */
+
+import { join as pathJoin } from 'node:path'; // No Bun path utils equivalent
+import type { ChunkRow, FtsResult, StorePort } from '../store/types';
+import { err, ok } from '../store/types';
+import { createChunkLookup } from './chunk-lookup';
+import { detectQueryLanguage } from './query-language';
+import type {
+  SearchOptions,
+  SearchResult,
+  SearchResultSource,
+  SearchResults,
+} from './types';
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Score Normalization
+// ─────────────────────────────────────────────────────────────────────────────
+
+/**
+ * Normalize BM25 scores to 0-1 range using min-max scaling.
+ * FTS5 bm25() returns negative scores where smaller (more negative) = better match.
+ * After normalization: 1 = best match, 0 = worst match in result set.
+ */
+function normalizeBm25Scores(results: SearchResult[]): void {
+  if (results.length === 0) {
+    return;
+  }
+
+  // Raw scores: smaller (more negative) is better
+  const scores = results.map((r) => r.score);
+  const best = Math.min(...scores); // Most negative = best
+  const worst = Math.max(...scores); // Least negative = worst
+  const range = worst - best;
+
+  // If all scores equal, assign 1.0 to all
+  if (range === 0) {
+    for (const r of results) {
+      r.score = 1;
+    }
+    return;
+  }
+
+  // Map: best -> 1, worst -> 0 (clamp for floating point safety)
+  for (const r of results) {
+    r.score = Math.max(0, Math.min(1, (worst - r.score) / range));
+  }
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Result Building
+// ─────────────────────────────────────────────────────────────────────────────
+
+interface BuildResultContext {
+  fts: FtsResult;
+  chunk: ChunkRow | null;
+  collectionPath?: string;
+  options?: SearchOptions;
+  fullContent?: string;
+}
+
+/** Build SearchResult from FtsResult and related data */
+function buildSearchResult(ctx: BuildResultContext): SearchResult {
+  const { fts, chunk, collectionPath, options, fullContent } = ctx;
+  const source: SearchResultSource = {
+    relPath: fts.relPath ?? '',
+    // Use actual source metadata with fallback to markdown defaults
+    mime: fts.sourceMime ?? 'text/markdown',
+    ext: fts.sourceExt ?? '.md',
+    modifiedAt: fts.sourceMtime,
+    sizeBytes: fts.sourceSize,
+    sourceHash: fts.sourceHash,
+  };
+
+  // Add absPath if we have collection path (cross-platform safe)
+  if (collectionPath && fts.relPath) {
+    source.absPath = pathJoin(collectionPath, fts.relPath);
+  }
+
+  // Determine snippet content and range
+  let snippet: string;
+  let snippetRange: { startLine: number; endLine: number } | undefined;
+
+  if (options?.full && fullContent) {
+    // --full: use full content, no range (full doc)
+    snippet = fullContent;
+    snippetRange = undefined;
+  } else if (options?.lineNumbers && chunk) {
+    // --line-numbers: use raw chunk text (not FTS snippet with markers)
+    snippet = chunk.text;
+    snippetRange = { startLine: chunk.startLine, endLine: chunk.endLine };
+  } else {
+    // Default: use FTS snippet or chunk text
+    snippet = fts.snippet ?? chunk?.text ?? '';
+    snippetRange = chunk
+      ? { startLine: chunk.startLine, endLine: chunk.endLine }
+      : undefined;
+  }
+
+  return {
+    docid: fts.docid ?? '',
+    score: fts.score, // Raw score, normalized later as batch
+    uri: fts.uri ?? '',
+    title: fts.title,
+    snippet,
+    snippetLanguage: chunk?.language ?? undefined,
+    snippetRange,
+    source,
+    conversion: fts.mirrorHash ? { mirrorHash: fts.mirrorHash } : undefined,
+  };
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Search Function
+// ─────────────────────────────────────────────────────────────────────────────
+
+/**
+ * Execute BM25 search and return structured results.
+ */
+// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: BM25 search with pagination, filtering, and explain output
+export async function searchBm25(
+  store: StorePort,
+  query: string,
+  options: SearchOptions = {}
+): Promise<
+  ReturnType<typeof ok<SearchResults>> | ReturnType<typeof err<SearchResults>>
+> {
+  const limit = options.limit ?? 20;
+  const minScore = options.minScore ?? 0;
+
+  // Detect query language for metadata (DOES NOT affect retrieval filtering)
+  const detection = detectQueryLanguage(query);
+  const queryLanguage = options.lang ?? detection.bcp47;
+
+  // Run FTS search
+  // Disable FTS snippet when --full or --line-numbers (we use raw text instead)
+  const ftsResult = await store.searchFts(query, {
+    limit,
+    collection: options.collection,
+    language: options.lang,
+    snippet: !(options.full || options.lineNumbers),
+  });
+
+  if (!ftsResult.ok) {
+    // Adapter returns INVALID_INPUT for FTS syntax errors, pass through
+    const { code, message, cause } = ftsResult.error;
+    if (code === 'INVALID_INPUT') {
+      return err('INVALID_INPUT', `Invalid search query: ${message}`);
+    }
+    return err('QUERY_FAILED', message, cause);
+  }
+
+  // Get collection paths for absPath resolution
+  const collectionsResult = await store.getCollections();
+  const collectionPaths = new Map<string, string>();
+  if (collectionsResult.ok) {
+    for (const c of collectionsResult.value) {
+      collectionPaths.set(c.name, c.path);
+    }
+  }
+
+  // Build results
+  const results: SearchResult[] = [];
+
+  // Pre-fetch all chunks in one batch query (eliminates N+1)
+  const uniqueHashes = [
+    ...new Set(
+      ftsResult.value.map((f) => f.mirrorHash).filter((h): h is string => !!h)
+    ),
+  ];
+  const chunksMapResult = await store.getChunksBatch(uniqueHashes);
+  const getChunk = chunksMapResult.ok
+    ? createChunkLookup(chunksMapResult.value)
+    : () => undefined;
+
+  // Dedup: multiple docs can share mirror_hash (content-addressed storage)
+  // Track seen uri+seq to eliminate duplicate rows from join fan-out
+  // Robust key: use uri if present, else fall back to mirrorHash+relPath
+  const seenUriSeq = new Set<string>();
+  // For --full, track best score per docid to de-dupe
+  const bestByDocid = new Map<
+    string,
+    { fts: FtsResult; chunk: ChunkRow | null; score: number }
+  >();
+
+  for (const fts of ftsResult.value) {
+    // Dedup by uri+seq - eliminates rows from mirror_hash join fan-out
+    // Use robust key to avoid over-dedup if uri is unexpectedly missing
+    const uriSeqKey = fts.uri
+      ? `${fts.uri}:${fts.seq}`
+      : `${fts.mirrorHash ?? ''}:${fts.seq}:${fts.relPath ?? ''}`;
+    if (seenUriSeq.has(uriSeqKey)) {
+      continue;
+    }
+    seenUriSeq.add(uriSeqKey);
+
+    // Get chunk via O(1) lookup
+    const chunk = fts.mirrorHash
+      ? (getChunk(fts.mirrorHash, fts.seq) ?? null)
+      : null;
+
+    // For --full, de-dupe by docid (keep best scoring chunk per doc)
+    // Raw BM25: smaller (more negative) is better
+    if (options.full) {
+      const docid = fts.docid ?? '';
+      const existing = bestByDocid.get(docid);
+      if (!existing || fts.score < existing.score) {
+        bestByDocid.set(docid, { fts, chunk, score: fts.score });
+      }
+      continue;
+    }
+
+    const collectionPath = fts.collection
+      ? collectionPaths.get(fts.collection)
+      : undefined;
+
+    results.push(buildSearchResult({ fts, chunk, collectionPath, options }));
+  }
+
+  // For --full, fetch full content and build results
+  if (options.full) {
+    for (const { fts, chunk } of bestByDocid.values()) {
+      let fullContent: string | undefined;
+      if (fts.mirrorHash) {
+        const contentResult = await store.getContent(fts.mirrorHash);
+        if (contentResult.ok && contentResult.value) {
+          fullContent = contentResult.value;
+        }
+      }
+      const collectionPath = fts.collection
+        ? collectionPaths.get(fts.collection)
+        : undefined;
+      results.push(
+        buildSearchResult({ fts, chunk, collectionPath, options, fullContent })
+      );
+    }
+  }
+
+  // Normalize scores to 0-1 range (batch min-max)
+  normalizeBm25Scores(results);
+
+  // Apply minScore filter after normalization
+  const filteredResults =
+    minScore > 0 ? results.filter((r) => r.score >= minScore) : results;
+
+  return ok({
+    results: filteredResults,
+    meta: {
+      query,
+      mode: 'bm25',
+      totalResults: filteredResults.length,
+      collection: options.collection,
+      lang: options.lang,
+      queryLanguage,
+    },
+  });
+}
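As a rough orientation for consumers of this module, a minimal calling sketch (not part of the diff). openStore is a hypothetical placeholder for however the package constructs its StorePort; the option names and the ok/err result shape come from the code above.

import { searchBm25 } from './search';

const store = await openStore(); // hypothetical: obtain a StorePort implementation

const result = await searchBm25(store, 'kubernetes ingress', {
  limit: 10,
  collection: 'docs',
  minScore: 0.2,
});

if (result.ok) {
  for (const r of result.value.results) {
    // Scores are min-max normalized to [0, 1]; 1 is the best match in this result set.
    console.log(r.score.toFixed(2), r.source.relPath);
  }
} else {
  console.error(result.error.code, result.error.message);
}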