@tobilu/qmd 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +34 -0
- package/LICENSE +21 -0
- package/README.md +615 -0
- package/package.json +80 -0
- package/qmd +55 -0
- package/src/collections.ts +390 -0
- package/src/formatter.ts +429 -0
- package/src/llm.ts +1208 -0
- package/src/mcp.ts +654 -0
- package/src/qmd.ts +2535 -0
- package/src/store.ts +3072 -0
package/src/store.ts
ADDED
@@ -0,0 +1,3072 @@
/**
 * QMD Store - Core data access and retrieval functions
 *
 * This module provides all database operations, search functions, and document
 * retrieval for QMD. It returns raw data structures that can be formatted by
 * CLI or MCP consumers.
 *
 * Usage:
 *   const store = createStore("/path/to/db.sqlite");
 *   // or use default path:
 *   const store = createStore();
 */

import { Database } from "bun:sqlite";
import { Glob } from "bun";
import { realpathSync, statSync } from "node:fs";
import * as sqliteVec from "sqlite-vec";
import {
  LlamaCpp,
  getDefaultLlamaCpp,
  formatQueryForEmbedding,
  formatDocForEmbedding,
  type RerankDocument,
  type ILLMSession,
} from "./llm";
import {
  findContextForPath as collectionsFindContextForPath,
  addContext as collectionsAddContext,
  removeContext as collectionsRemoveContext,
  listAllContexts as collectionsListAllContexts,
  getCollection,
  listCollections as collectionsListCollections,
  addCollection as collectionsAddCollection,
  removeCollection as collectionsRemoveCollection,
  renameCollection as collectionsRenameCollection,
  setGlobalContext,
  loadConfig as collectionsLoadConfig,
  type NamedCollection,
} from "./collections";

// =============================================================================
// Configuration
// =============================================================================

const HOME = Bun.env.HOME || "/tmp";
export const DEFAULT_EMBED_MODEL = "embeddinggemma";
export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
export const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
export const DEFAULT_GLOB = "**/*.md";
export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB

// Chunking: 900 tokens per chunk with 15% overlap.
// Increased from 800 so smart chunking has room to find natural break points.
export const CHUNK_SIZE_TOKENS = 900;
export const CHUNK_OVERLAP_TOKENS = Math.floor(CHUNK_SIZE_TOKENS * 0.15); // 135 tokens (15% overlap)
// Fallback char-based approximation for sync chunking (~4 chars per token)
export const CHUNK_SIZE_CHARS = CHUNK_SIZE_TOKENS * 4; // 3600 chars
export const CHUNK_OVERLAP_CHARS = CHUNK_OVERLAP_TOKENS * 4; // 540 chars
// Search window for finding optimal break points (~200 tokens)
export const CHUNK_WINDOW_TOKENS = 200;
export const CHUNK_WINDOW_CHARS = CHUNK_WINDOW_TOKENS * 4; // 800 chars

// =============================================================================
// Smart Chunking - Break Point Detection
// =============================================================================

/**
 * A potential break point in the document with a base score indicating quality.
 */
export interface BreakPoint {
  pos: number;   // character position
  score: number; // base score (higher = better break point)
  type: string;  // for debugging: 'h1', 'h2', 'blank', etc.
}

/**
 * A region where a code fence exists (between ``` markers).
 * We should never split inside a code fence.
 */
export interface CodeFenceRegion {
  start: number; // position of opening ```
  end: number;   // position of closing ``` (or document end if unclosed)
}

/**
 * Patterns for detecting break points in markdown documents.
 * Higher scores indicate better places to split.
 * Scores are spread wide so headings decisively beat lower-quality breaks.
 * Order matters for scoring - more specific patterns first.
 */
export const BREAK_PATTERNS: [RegExp, number, string][] = [
  [/\n#{1}(?!#)/g, 100, 'h1'],              // # but not ##
  [/\n#{2}(?!#)/g, 90, 'h2'],               // ## but not ###
  [/\n#{3}(?!#)/g, 80, 'h3'],               // ### but not ####
  [/\n#{4}(?!#)/g, 70, 'h4'],               // #### but not #####
  [/\n#{5}(?!#)/g, 60, 'h5'],               // ##### but not ######
  [/\n#{6}(?!#)/g, 50, 'h6'],               // ######
  [/\n```/g, 80, 'codeblock'],              // code block boundary (same as h3)
  [/\n(?:---|\*\*\*|___)\s*\n/g, 60, 'hr'], // horizontal rule
  [/\n\n+/g, 20, 'blank'],                  // paragraph boundary
  [/\n[-*]\s/g, 5, 'list'],                 // unordered list item
  [/\n\d+\.\s/g, 5, 'numlist'],             // ordered list item
  [/\n/g, 1, 'newline'],                    // minimal break
];

/**
 * Scan the text for all potential break points.
 * Returns an array sorted by position; when multiple patterns match the
 * same position, the higher-scoring pattern wins.
 */
export function scanBreakPoints(text: string): BreakPoint[] {
  const points: BreakPoint[] = [];
  const seen = new Map<number, BreakPoint>(); // pos -> best break point at that pos

  for (const [pattern, score, type] of BREAK_PATTERNS) {
    for (const match of text.matchAll(pattern)) {
      const pos = match.index!;
      const existing = seen.get(pos);
      // Keep the higher score if this position was already seen
      if (!existing || score > existing.score) {
        seen.set(pos, { pos, score, type });
      }
    }
  }

  // Convert to an array and sort by position
  for (const bp of seen.values()) {
    points.push(bp);
  }
  return points.sort((a, b) => a.pos - b.pos);
}

/**
 * Find all code fence regions in the text.
 * Code fences are delimited by ``` and we should never split inside them.
 */
export function findCodeFences(text: string): CodeFenceRegion[] {
  const regions: CodeFenceRegion[] = [];
  const fencePattern = /\n```/g;
  let inFence = false;
  let fenceStart = 0;

  for (const match of text.matchAll(fencePattern)) {
    if (!inFence) {
      fenceStart = match.index!;
      inFence = true;
    } else {
      regions.push({ start: fenceStart, end: match.index! + match[0].length });
      inFence = false;
    }
  }

  // Handle an unclosed fence - it extends to the end of the document
  if (inFence) {
    regions.push({ start: fenceStart, end: text.length });
  }

  return regions;
}

/**
 * Check if a position is inside a code fence region.
 */
export function isInsideCodeFence(pos: number, fences: CodeFenceRegion[]): boolean {
  return fences.some(f => pos > f.start && pos < f.end);
}

/**
 * Find the best cut position using scored break points with distance decay.
 *
 * Uses squared distance for a gentler early decay - headings far back still
 * win over low-quality breaks near the target.
 *
 * @param breakPoints - Pre-scanned break points from scanBreakPoints()
 * @param targetCharPos - The ideal cut position (e.g., the maxChars boundary)
 * @param windowChars - How far back to search for break points (default ~200 tokens)
 * @param decayFactor - How much to penalize distance (0.7 = 30% score at window edge)
 * @param codeFences - Code fence regions to avoid splitting inside
 * @returns The best position to cut at
 */
export function findBestCutoff(
  breakPoints: BreakPoint[],
  targetCharPos: number,
  windowChars: number = CHUNK_WINDOW_CHARS,
  decayFactor: number = 0.7,
  codeFences: CodeFenceRegion[] = []
): number {
  const windowStart = targetCharPos - windowChars;
  let bestScore = -1;
  let bestPos = targetCharPos;

  for (const bp of breakPoints) {
    if (bp.pos < windowStart) continue;
    if (bp.pos > targetCharPos) break; // sorted, so we can stop

    // Skip break points inside code fences
    if (isInsideCodeFence(bp.pos, codeFences)) continue;

    const distance = targetCharPos - bp.pos;
    // Squared distance decay: gentle early, steep late.
    //   At target:      multiplier = 1.0
    //   At 25% back:    multiplier = 0.956
    //   At 50% back:    multiplier = 0.825
    //   At 75% back:    multiplier = 0.606
    //   At window edge: multiplier = 0.3
    const normalizedDist = distance / windowChars;
    const multiplier = 1.0 - (normalizedDist * normalizedDist) * decayFactor;
    const finalScore = bp.score * multiplier;

    if (finalScore > bestScore) {
      bestScore = finalScore;
      bestPos = bp.pos;
    }
  }

  return bestPos;
}
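
// Illustrative chunking driver (a sketch only - the real driver lives in the
// indexing code, and `chunks` here is a hypothetical accumulator). It shows
// how the three helpers above compose: scan once, then cut at the best break
// before each size boundary, stepping back for the configured overlap.
//
//   const breakPoints = scanBreakPoints(doc);
//   const fences = findCodeFences(doc);
//   const chunks: string[] = [];
//   let start = 0;
//   while (start + CHUNK_SIZE_CHARS < doc.length) {
//     const cut = findBestCutoff(breakPoints, start + CHUNK_SIZE_CHARS,
//       CHUNK_WINDOW_CHARS, 0.7, fences);
//     chunks.push(doc.slice(start, cut));
//     start = cut - CHUNK_OVERLAP_CHARS; // step back ~15% for overlap
//   }
//   chunks.push(doc.slice(start));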

// Hybrid query: strong BM25 signal detection thresholds.
// Skip expensive LLM expansion when the top result is strong AND clearly
// separated from the runner-up.
export const STRONG_SIGNAL_MIN_SCORE = 0.85;
export const STRONG_SIGNAL_MIN_GAP = 0.15;
// Max candidates to pass to the reranker - balances quality vs. latency.
// 40 keeps ranks 31-40 visible to the reranker (matters for recall on broad queries).
export const RERANK_CANDIDATE_LIMIT = 40;

/**
 * A typed query expansion result. Decoupled from llm.ts's internal Queryable -
 * same shape, but store.ts owns its own public API type.
 *
 * - lex: keyword variant -> routes to FTS only
 * - vec: semantic variant -> routes to vector search only
 * - hyde: hypothetical document -> routes to vector search only
 */
export type ExpandedQuery = {
  type: 'lex' | 'vec' | 'hyde';
  text: string;
};

// =============================================================================
// Path utilities
// =============================================================================

export function homedir(): string {
  return HOME;
}

/**
 * Check if a path is absolute.
 * Supports:
 * - Unix paths: /path/to/file
 * - Windows native: C:\path or C:/path
 * - Git Bash: /c/path or /C/path (C-Z drives, excluding A/B floppy drives)
 *
 * Note: /c without a trailing slash is treated as a Unix path (a directory
 * named "c"), while /c/ or /c/path are treated as Git Bash paths (C: drive).
 */
export function isAbsolutePath(path: string): boolean {
  if (!path) return false;

  // Unix absolute path
  if (path.startsWith('/')) {
    // Check if it's a Git Bash style path like /c/ or /c/Users (C-Z only, not A or B).
    // Requires path[2] === '/' to distinguish from Unix paths like /c or /cache.
    if (path.length >= 3 && path[2] === '/') {
      const driveLetter = path[1];
      if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
        return true;
      }
    }
    // Any other path starting with / is Unix absolute
    return true;
  }

  // Windows native path: C:\ or C:/ (any letter A-Z)
  if (path.length >= 2 && /[a-zA-Z]/.test(path[0]!) && path[1] === ':') {
    return true;
  }

  return false;
}
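
// Illustrative expectations (derived from the checks above, not a test suite):
//
//   isAbsolutePath("/usr/local")    // true  - Unix absolute
//   isAbsolutePath("C:\\Users")     // true  - Windows native
//   isAbsolutePath("/c/Users")      // true  - Git Bash C: drive
//   isAbsolutePath("/cache")        // true  - plain Unix path, not a drive
//   isAbsolutePath("docs/readme")   // false - relative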

/**
 * Normalize path separators to forward slashes.
 * Converts Windows backslashes to forward slashes.
 */
export function normalizePathSeparators(path: string): string {
  return path.replace(/\\/g, '/');
}

/**
 * Get the relative path from a prefix.
 * Returns null if the path is not under the prefix.
 * Returns an empty string if the path equals the prefix.
 */
export function getRelativePathFromPrefix(path: string, prefix: string): string | null {
  // An empty prefix is invalid
  if (!prefix) {
    return null;
  }

  const normalizedPath = normalizePathSeparators(path);
  const normalizedPrefix = normalizePathSeparators(prefix);

  // Ensure the prefix ends with / for proper matching
  const prefixWithSlash = !normalizedPrefix.endsWith('/')
    ? normalizedPrefix + '/'
    : normalizedPrefix;

  // Exact match
  if (normalizedPath === normalizedPrefix) {
    return '';
  }

  // Check if the path starts with the prefix
  if (normalizedPath.startsWith(prefixWithSlash)) {
    return normalizedPath.slice(prefixWithSlash.length);
  }

  return null;
}

export function resolve(...paths: string[]): string {
  if (paths.length === 0) {
    throw new Error("resolve: at least one path segment is required");
  }

  // Normalize all paths to use forward slashes
  const normalizedPaths = paths.map(normalizePathSeparators);

  let result = '';
  let windowsDrive = '';

  // Check if the first path is absolute
  const firstPath = normalizedPaths[0]!;
  if (isAbsolutePath(firstPath)) {
    result = firstPath;

    // Extract the Windows drive letter if present
    if (firstPath.length >= 2 && /[a-zA-Z]/.test(firstPath[0]!) && firstPath[1] === ':') {
      windowsDrive = firstPath.slice(0, 2);
      result = firstPath.slice(2);
    } else if (firstPath.startsWith('/') && firstPath.length >= 3 && firstPath[2] === '/') {
      // Git Bash style: /c/ -> C: (C-Z drives only, not A or B)
      const driveLetter = firstPath[1];
      if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
        windowsDrive = driveLetter.toUpperCase() + ':';
        result = firstPath.slice(2);
      }
    }
  } else {
    // Start with PWD or cwd, then append the first relative path
    const pwd = normalizePathSeparators(Bun.env.PWD || process.cwd());

    // Extract the Windows drive from PWD if present
    if (pwd.length >= 2 && /[a-zA-Z]/.test(pwd[0]!) && pwd[1] === ':') {
      windowsDrive = pwd.slice(0, 2);
      result = pwd.slice(2) + '/' + firstPath;
    } else {
      result = pwd + '/' + firstPath;
    }
  }

  // Process the remaining paths
  for (let i = 1; i < normalizedPaths.length; i++) {
    const p = normalizedPaths[i]!;
    if (isAbsolutePath(p)) {
      // An absolute path replaces everything so far
      result = p;

      // Update the Windows drive if present
      if (p.length >= 2 && /[a-zA-Z]/.test(p[0]!) && p[1] === ':') {
        windowsDrive = p.slice(0, 2);
        result = p.slice(2);
      } else if (p.startsWith('/') && p.length >= 3 && p[2] === '/') {
        // Git Bash style (C-Z drives only, not A or B)
        const driveLetter = p[1];
        if (driveLetter && /[c-zC-Z]/.test(driveLetter)) {
          windowsDrive = driveLetter.toUpperCase() + ':';
          result = p.slice(2);
        } else {
          windowsDrive = '';
        }
      } else {
        windowsDrive = '';
      }
    } else {
      // Relative path - append
      result = result + '/' + p;
    }
  }

  // Normalize . and .. components
  const parts = result.split('/').filter(Boolean);
  const normalized: string[] = [];
  for (const part of parts) {
    if (part === '..') {
      normalized.pop();
    } else if (part !== '.') {
      normalized.push(part);
    }
  }

  // Build the final path
  const finalPath = '/' + normalized.join('/');

  // Prepend the Windows drive if present
  if (windowsDrive) {
    return windowsDrive + finalPath;
  }

  return finalPath;
}
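
// Illustrative results (a sketch; the relative case assumes PWD or cwd as shown):
//
//   resolve("/a/b", "../c")        // "/a/c"
//   resolve("/a", "/x", "y")       // "/x/y" - a later absolute segment resets the result
//   resolve("C:\\Users", "docs")   // "C:/Users/docs" - drive preserved, slashes normalized
//   resolve("notes.md")            // "<PWD>/notes.md"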

// Flag to indicate production mode (set by qmd.ts at startup)
let _productionMode = false;

export function enableProductionMode(): void {
  _productionMode = true;
}

export function getDefaultDbPath(indexName: string = "index"): string {
  // Always allow an override via INDEX_PATH (for testing)
  if (Bun.env.INDEX_PATH) {
    return Bun.env.INDEX_PATH;
  }

  // In non-production mode (tests), require an explicit path
  if (!_productionMode) {
    throw new Error(
      "Database path not set. Tests must set the INDEX_PATH env var or use createStore() with an explicit path. " +
      "This prevents tests from accidentally writing to the global index."
    );
  }

  const cacheDir = Bun.env.XDG_CACHE_HOME || resolve(homedir(), ".cache");
  const qmdCacheDir = resolve(cacheDir, "qmd");
  try { Bun.spawnSync(["mkdir", "-p", qmdCacheDir]); } catch { }
  return resolve(qmdCacheDir, `${indexName}.sqlite`);
}

export function getPwd(): string {
  return process.env.PWD || process.cwd();
}

export function getRealPath(path: string): string {
  try {
    return realpathSync(path);
  } catch {
    return resolve(path);
  }
}

// =============================================================================
// Virtual Path Utilities (qmd://)
// =============================================================================

export type VirtualPath = {
  collectionName: string;
  path: string; // relative path within the collection
};

/**
 * Normalize explicit virtual path formats to the standard qmd:// format.
 * Only handles paths that are already explicitly virtual:
 * - qmd://collection/path.md (already normalized)
 * - qmd:////collection/path.md (extra slashes - normalize)
 * - //collection/path.md (missing qmd: prefix - add it)
 *
 * Does NOT handle:
 * - collection/path.md (bare paths - could be filesystem-relative)
 * - :linenum suffix (should be parsed separately before calling this)
 */
export function normalizeVirtualPath(input: string): string {
  let path = input.trim();

  // Handle qmd:// with extra slashes: qmd:////collection/path -> qmd://collection/path
  if (path.startsWith('qmd:')) {
    // Remove the qmd: prefix and normalize slashes
    path = path.slice(4);
    // Remove leading slashes and re-add exactly two
    path = path.replace(/^\/+/, '');
    return `qmd://${path}`;
  }

  // Handle //collection/path (missing qmd: prefix)
  if (path.startsWith('//')) {
    path = path.replace(/^\/+/, '');
    return `qmd://${path}`;
  }

  // Return as-is for other cases (filesystem paths, docids, bare collection/path, etc.)
  return path;
}

/**
 * Parse a virtual path like "qmd://collection-name/path/to/file.md"
 * into its components.
 * Also supports the collection root: "qmd://collection-name/" or "qmd://collection-name"
 */
export function parseVirtualPath(virtualPath: string): VirtualPath | null {
  // Normalize the path first
  const normalized = normalizeVirtualPath(virtualPath);

  // Match: qmd://collection-name[/optional-path]
  // Allows: qmd://name, qmd://name/, qmd://name/path
  const match = normalized.match(/^qmd:\/\/([^\/]+)\/?(.*)$/);
  if (!match?.[1]) return null;
  return {
    collectionName: match[1],
    path: match[2] ?? '', // Empty string for the collection root
  };
}

/**
 * Build a virtual path from a collection name and relative path.
 */
export function buildVirtualPath(collectionName: string, path: string): string {
  return `qmd://${collectionName}/${path}`;
}
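
// Illustrative round-trips (derived from the regex above):
//
//   parseVirtualPath("qmd:////notes/daily/todo.md")
//     // -> { collectionName: "notes", path: "daily/todo.md" }
//   parseVirtualPath("//notes")
//     // -> { collectionName: "notes", path: "" } (collection root)
//   buildVirtualPath("notes", "daily/todo.md")
//     // -> "qmd://notes/daily/todo.md"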

/**
 * Check if a path is explicitly a virtual path.
 * Only recognizes explicit virtual path formats:
 * - qmd://collection/path.md
 * - //collection/path.md
 *
 * Does NOT consider a bare collection/path.md as virtual - that should be
 * handled separately by checking if the first component is a collection name.
 */
export function isVirtualPath(path: string): boolean {
  const trimmed = path.trim();

  // Explicit qmd: prefix (with any number of slashes)
  if (trimmed.startsWith('qmd:')) return true;

  // //collection/path format (missing qmd: prefix)
  if (trimmed.startsWith('//')) return true;

  return false;
}

/**
 * Resolve a virtual path to an absolute filesystem path.
 */
export function resolveVirtualPath(db: Database, virtualPath: string): string | null {
  const parsed = parseVirtualPath(virtualPath);
  if (!parsed) return null;

  const coll = getCollectionByName(db, parsed.collectionName);
  if (!coll) return null;

  return resolve(coll.pwd, parsed.path);
}

/**
 * Convert an absolute filesystem path to a virtual path.
 * Returns null if the file is not in any indexed collection.
 */
export function toVirtualPath(db: Database, absolutePath: string): string | null {
  // Get all collections from the YAML config
  const collections = collectionsListCollections();

  // Find which collection this absolute path belongs to
  for (const coll of collections) {
    if (absolutePath.startsWith(coll.path + '/') || absolutePath === coll.path) {
      // Extract the relative path
      const relativePath = absolutePath.startsWith(coll.path + '/')
        ? absolutePath.slice(coll.path.length + 1)
        : '';

      // Verify this document exists in the database
      const doc = db.prepare(`
        SELECT d.path
        FROM documents d
        WHERE d.collection = ? AND d.path = ? AND d.active = 1
        LIMIT 1
      `).get(coll.name, relativePath) as { path: string } | null;

      if (doc) {
        return buildVirtualPath(coll.name, relativePath);
      }
    }
  }

  return null;
}

// =============================================================================
// Database initialization
// =============================================================================

function setSQLiteFromBrewPrefixEnv(): void {
  const candidates: string[] = [];

  if (process.platform === "darwin") {
    // Use BREW_PREFIX for non-standard Homebrew installs (common on corporate Macs).
    const brewPrefix = Bun.env.BREW_PREFIX || Bun.env.HOMEBREW_PREFIX;
    if (brewPrefix) {
      // Homebrew can place SQLite in opt/sqlite (keg-only) or directly under the prefix.
      candidates.push(`${brewPrefix}/opt/sqlite/lib/libsqlite3.dylib`);
      candidates.push(`${brewPrefix}/lib/libsqlite3.dylib`);
    } else {
      candidates.push("/opt/homebrew/opt/sqlite/lib/libsqlite3.dylib");
      candidates.push("/usr/local/opt/sqlite/lib/libsqlite3.dylib");
    }
  }

  for (const candidate of candidates) {
    try {
      if (statSync(candidate).size > 0) {
        Database.setCustomSQLite(candidate);
        return;
      }
    } catch { }
  }
}

setSQLiteFromBrewPrefixEnv();

function createSqliteVecUnavailableError(reason: string): Error {
  return new Error(
    "sqlite-vec extension is unavailable. " +
    `${reason}. ` +
    "Install Homebrew SQLite so the sqlite-vec extension can be loaded, " +
    "and set BREW_PREFIX if Homebrew is installed in a non-standard location."
  );
}

function getErrorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

export function verifySqliteVecLoaded(db: Database): void {
  try {
    const row = db.prepare(`SELECT vec_version() AS version`).get() as { version?: string } | null;
    if (!row?.version || typeof row.version !== "string") {
      throw new Error("vec_version() returned no version");
    }
  } catch (err) {
    const message = getErrorMessage(err);
    throw createSqliteVecUnavailableError(`sqlite-vec probe failed (${message})`);
  }
}

function initializeDatabase(db: Database): void {
  try {
    sqliteVec.load(db);
    verifySqliteVecLoaded(db);
  } catch (err) {
    const message = getErrorMessage(err);

    if (message.includes("does not support dynamic extension loading")) {
      throw createSqliteVecUnavailableError("SQLite build does not support dynamic extension loading");
    }

    // Propagate everything else (including our own sqlite-vec unavailable error) unchanged
    throw err;
  }
  db.exec("PRAGMA journal_mode = WAL");
  db.exec("PRAGMA foreign_keys = ON");

  // Drop legacy tables that are now managed in YAML
  db.exec(`DROP TABLE IF EXISTS path_contexts`);
  db.exec(`DROP TABLE IF EXISTS collections`);

  // Content-addressable storage - the source of truth for document content
  db.exec(`
    CREATE TABLE IF NOT EXISTS content (
      hash TEXT PRIMARY KEY,
      doc TEXT NOT NULL,
      created_at TEXT NOT NULL
    )
  `);

  // Documents table - filesystem layer mapping virtual paths to content hashes.
  // Collections are now managed in ~/.config/qmd/index.yml
  db.exec(`
    CREATE TABLE IF NOT EXISTS documents (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      collection TEXT NOT NULL,
      path TEXT NOT NULL,
      title TEXT NOT NULL,
      hash TEXT NOT NULL,
      created_at TEXT NOT NULL,
      modified_at TEXT NOT NULL,
      active INTEGER NOT NULL DEFAULT 1,
      FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE,
      UNIQUE(collection, path)
    )
  `);

  db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_collection ON documents(collection, active)`);
  db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_hash ON documents(hash)`);
  db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_path ON documents(path, active)`);

  // Cache table for LLM API calls
  db.exec(`
    CREATE TABLE IF NOT EXISTS llm_cache (
      hash TEXT PRIMARY KEY,
      result TEXT NOT NULL,
      created_at TEXT NOT NULL
    )
  `);

  // Content vectors: rebuild if the table predates the seq column
  const cvInfo = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
  const hasSeqColumn = cvInfo.some(col => col.name === 'seq');
  if (cvInfo.length > 0 && !hasSeqColumn) {
    db.exec(`DROP TABLE IF EXISTS content_vectors`);
    db.exec(`DROP TABLE IF EXISTS vectors_vec`);
  }
  db.exec(`
    CREATE TABLE IF NOT EXISTS content_vectors (
      hash TEXT NOT NULL,
      seq INTEGER NOT NULL DEFAULT 0,
      pos INTEGER NOT NULL DEFAULT 0,
      model TEXT NOT NULL,
      embedded_at TEXT NOT NULL,
      PRIMARY KEY (hash, seq)
    )
  `);

  // FTS - index the filepath (collection/path), title, and content
  db.exec(`
    CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
      filepath, title, body,
      tokenize='porter unicode61'
    )
  `);

  // Triggers to keep FTS in sync
  db.exec(`
    CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents
    WHEN new.active = 1
    BEGIN
      INSERT INTO documents_fts(rowid, filepath, title, body)
      SELECT
        new.id,
        new.collection || '/' || new.path,
        new.title,
        (SELECT doc FROM content WHERE hash = new.hash)
      WHERE new.active = 1;
    END
  `);

  db.exec(`
    CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
      DELETE FROM documents_fts WHERE rowid = old.id;
    END
  `);

  db.exec(`
    CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents
    BEGIN
      -- Delete from FTS if no longer active
      DELETE FROM documents_fts WHERE rowid = old.id AND new.active = 0;

      -- Update FTS if still/newly active
      INSERT OR REPLACE INTO documents_fts(rowid, filepath, title, body)
      SELECT
        new.id,
        new.collection || '/' || new.path,
        new.title,
        (SELECT doc FROM content WHERE hash = new.hash)
      WHERE new.active = 1;
    END
  `);
}

function ensureVecTableInternal(db: Database, dimensions: number): void {
  const tableInfo = db.prepare(`SELECT sql FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get() as { sql: string } | null;
  if (tableInfo) {
    const match = tableInfo.sql.match(/float\[(\d+)\]/);
    const hasHashSeq = tableInfo.sql.includes('hash_seq');
    const hasCosine = tableInfo.sql.includes('distance_metric=cosine');
    const existingDims = match?.[1] ? parseInt(match[1], 10) : null;
    if (existingDims === dimensions && hasHashSeq && hasCosine) return;
    // Table exists but has the wrong schema - rebuild it
    db.exec("DROP TABLE IF EXISTS vectors_vec");
  }
  db.exec(`CREATE VIRTUAL TABLE vectors_vec USING vec0(hash_seq TEXT PRIMARY KEY, embedding float[${dimensions}] distance_metric=cosine)`);
}
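
// Note on the vec0 key format: each embedded chunk is keyed by a "hash_seq"
// string of the form `${hash}_${seq}` - cleanupOrphanedVectors() below
// reconstructs it in SQL as cv.hash || '_' || cv.seq. A sketch of the pairing
// with content_vectors (the INSERT shape shown here is an assumption; the
// real write path is insertEmbedding() at the bottom of this file):
//
//   db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`)
//     .run(`${hash}_${seq}`, embedding);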

// =============================================================================
// Store Factory
// =============================================================================

export type Store = {
  db: Database;
  dbPath: string;
  close: () => void;
  ensureVecTable: (dimensions: number) => void;

  // Index health
  getHashesNeedingEmbedding: () => number;
  getIndexHealth: () => IndexHealthInfo;
  getStatus: () => IndexStatus;

  // Caching
  getCacheKey: typeof getCacheKey;
  getCachedResult: (cacheKey: string) => string | null;
  setCachedResult: (cacheKey: string, result: string) => void;
  clearCache: () => void;

  // Cleanup and maintenance
  deleteLLMCache: () => number;
  deleteInactiveDocuments: () => number;
  cleanupOrphanedContent: () => number;
  cleanupOrphanedVectors: () => number;
  vacuumDatabase: () => void;

  // Context
  getContextForFile: (filepath: string) => string | null;
  getContextForPath: (collectionName: string, path: string) => string | null;
  getCollectionByName: (name: string) => { name: string; pwd: string; glob_pattern: string } | null;
  getCollectionsWithoutContext: () => { name: string; pwd: string; doc_count: number }[];
  getTopLevelPathsWithoutContext: (collectionName: string) => string[];

  // Virtual paths
  parseVirtualPath: typeof parseVirtualPath;
  buildVirtualPath: typeof buildVirtualPath;
  isVirtualPath: typeof isVirtualPath;
  resolveVirtualPath: (virtualPath: string) => string | null;
  toVirtualPath: (absolutePath: string) => string | null;

  // Search
  searchFTS: (query: string, limit?: number, collectionId?: number) => SearchResult[];
  searchVec: (query: string, model: string, limit?: number, collectionName?: string) => Promise<SearchResult[]>;

  // Query expansion & reranking
  expandQuery: (query: string, model?: string) => Promise<ExpandedQuery[]>;
  rerank: (query: string, documents: { file: string; text: string }[], model?: string) => Promise<{ file: string; score: number }[]>;

  // Document retrieval
  findDocument: (filename: string, options?: { includeBody?: boolean }) => DocumentResult | DocumentNotFound;
  getDocumentBody: (doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number) => string | null;
  findDocuments: (pattern: string, options?: { includeBody?: boolean; maxBytes?: number }) => { docs: MultiGetResult[]; errors: string[] };

  // Fuzzy matching and docid lookup
  findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => string[];
  matchFilesByGlob: (pattern: string) => { filepath: string; displayPath: string; bodyLength: number }[];
  findDocumentByDocid: (docid: string) => { filepath: string; hash: string } | null;

  // Document indexing operations
  insertContent: (hash: string, content: string, createdAt: string) => void;
  insertDocument: (collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => void;
  findActiveDocument: (collectionName: string, path: string) => { id: number; hash: string; title: string } | null;
  updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => void;
  updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => void;
  deactivateDocument: (collectionName: string, path: string) => void;
  getActiveDocumentPaths: (collectionName: string) => string[];

  // Vector/embedding operations
  getHashesForEmbedding: () => { hash: string; body: string; path: string }[];
  clearAllEmbeddings: () => void;
  insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => void;
};

/**
 * Create a new store instance with the given database path.
 * If no path is provided, uses the default path (~/.cache/qmd/index.sqlite).
 *
 * @param dbPath - Path to the SQLite database file
 * @returns Store instance with all methods bound to the database
 */
export function createStore(dbPath?: string): Store {
  const resolvedPath = dbPath || getDefaultDbPath();
  const db = new Database(resolvedPath);
  initializeDatabase(db);

  return {
    db,
    dbPath: resolvedPath,
    close: () => db.close(),
    ensureVecTable: (dimensions: number) => ensureVecTableInternal(db, dimensions),

    // Index health
    getHashesNeedingEmbedding: () => getHashesNeedingEmbedding(db),
    getIndexHealth: () => getIndexHealth(db),
    getStatus: () => getStatus(db),

    // Caching
    getCacheKey,
    getCachedResult: (cacheKey: string) => getCachedResult(db, cacheKey),
    setCachedResult: (cacheKey: string, result: string) => setCachedResult(db, cacheKey, result),
    clearCache: () => clearCache(db),

    // Cleanup and maintenance
    deleteLLMCache: () => deleteLLMCache(db),
    deleteInactiveDocuments: () => deleteInactiveDocuments(db),
    cleanupOrphanedContent: () => cleanupOrphanedContent(db),
    cleanupOrphanedVectors: () => cleanupOrphanedVectors(db),
    vacuumDatabase: () => vacuumDatabase(db),

    // Context
    getContextForFile: (filepath: string) => getContextForFile(db, filepath),
    getContextForPath: (collectionName: string, path: string) => getContextForPath(db, collectionName, path),
    getCollectionByName: (name: string) => getCollectionByName(db, name),
    getCollectionsWithoutContext: () => getCollectionsWithoutContext(db),
    getTopLevelPathsWithoutContext: (collectionName: string) => getTopLevelPathsWithoutContext(db, collectionName),

    // Virtual paths
    parseVirtualPath,
    buildVirtualPath,
    isVirtualPath,
    resolveVirtualPath: (virtualPath: string) => resolveVirtualPath(db, virtualPath),
    toVirtualPath: (absolutePath: string) => toVirtualPath(db, absolutePath),

    // Search
    searchFTS: (query: string, limit?: number, collectionId?: number) => searchFTS(db, query, limit, collectionId),
    searchVec: (query: string, model: string, limit?: number, collectionName?: string) => searchVec(db, query, model, limit, collectionName),

    // Query expansion & reranking
    expandQuery: (query: string, model?: string) => expandQuery(query, model, db),
    rerank: (query: string, documents: { file: string; text: string }[], model?: string) => rerank(query, documents, model, db),

    // Document retrieval
    findDocument: (filename: string, options?: { includeBody?: boolean }) => findDocument(db, filename, options),
    getDocumentBody: (doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number) => getDocumentBody(db, doc, fromLine, maxLines),
    findDocuments: (pattern: string, options?: { includeBody?: boolean; maxBytes?: number }) => findDocuments(db, pattern, options),

    // Fuzzy matching and docid lookup
    findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => findSimilarFiles(db, query, maxDistance, limit),
    matchFilesByGlob: (pattern: string) => matchFilesByGlob(db, pattern),
    findDocumentByDocid: (docid: string) => findDocumentByDocid(db, docid),

    // Document indexing operations
    insertContent: (hash: string, content: string, createdAt: string) => insertContent(db, hash, content, createdAt),
    insertDocument: (collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt),
    findActiveDocument: (collectionName: string, path: string) => findActiveDocument(db, collectionName, path),
    updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => updateDocumentTitle(db, documentId, title, modifiedAt),
    updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => updateDocument(db, documentId, title, hash, modifiedAt),
    deactivateDocument: (collectionName: string, path: string) => deactivateDocument(db, collectionName, path),
    getActiveDocumentPaths: (collectionName: string) => getActiveDocumentPaths(db, collectionName),

    // Vector/embedding operations
    getHashesForEmbedding: () => getHashesForEmbedding(db),
    clearAllEmbeddings: () => clearAllEmbeddings(db),
    insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt),
  };
}
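
// Example usage, mirroring the header comment (paths and the dimension value
// are illustrative; dimensions depend on the embedding model in use):
//
//   const store = createStore("/tmp/qmd-test.sqlite");
//   store.ensureVecTable(768);
//   const hits = store.searchFTS("release notes", 10);
//   const doc = store.findDocument("readme.md", { includeBody: true });
//   store.close();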

// =============================================================================
// Core Document Type
// =============================================================================

/**
 * Unified document result type with all metadata.
 * Body is optional - use getDocumentBody() to load it separately if needed.
 */
export type DocumentResult = {
  filepath: string;       // Full filesystem path
  displayPath: string;    // Short display path (e.g., "docs/readme.md")
  title: string;          // Document title (from first heading or filename)
  context: string | null; // Folder context description if configured
  hash: string;           // Content hash for caching/change detection
  docid: string;          // Short docid (first 6 chars of hash) for quick reference
  collectionName: string; // Parent collection name
  modifiedAt: string;     // Last modification timestamp
  bodyLength: number;     // Body length in bytes (useful before loading)
  body?: string;          // Document body (optional, load with getDocumentBody)
};

/**
 * Extract the short docid from a full hash (first 6 characters).
 */
export function getDocid(hash: string): string {
  return hash.slice(0, 6);
}

/**
 * Handelize a filename to be more token-friendly.
 * - Convert triple underscore `___` to `/` (folder separator)
 * - Convert to lowercase
 * - Replace sequences of non-word chars (except /) with a single dash
 * - Remove leading/trailing dashes from path segments
 * - Preserve folder structure (a/b/c/d.md stays structured)
 * - Preserve the file extension
 */
export function handelize(path: string): string {
  if (!path || path.trim() === '') {
    throw new Error('handelize: path cannot be empty');
  }

  // Allow route-style "$" filenames while still rejecting paths with no usable content.
  const segments = path.split('/').filter(Boolean);
  const lastSegment = segments[segments.length - 1] || '';
  const filenameWithoutExt = lastSegment.replace(/\.[^.]+$/, '');
  const hasValidContent = /[\p{L}\p{N}$]/u.test(filenameWithoutExt);
  if (!hasValidContent) {
    throw new Error(`handelize: path "${path}" has no valid filename content`);
  }

  const result = path
    .replace(/___/g, '/') // Triple underscore becomes a folder separator
    .toLowerCase()
    .split('/')
    .map((segment, idx, arr) => {
      const isLastSegment = idx === arr.length - 1;

      if (isLastSegment) {
        // For the filename (last segment), preserve the extension
        const extMatch = segment.match(/(\.[a-z0-9]+)$/i);
        const ext = extMatch ? extMatch[1] : '';
        const nameWithoutExt = ext ? segment.slice(0, -ext.length) : segment;

        const cleanedName = nameWithoutExt
          .replace(/[^\p{L}\p{N}$]+/gu, '-') // Keep the route marker "$", dash-separate other chars
          .replace(/^-+|-+$/g, '');          // Remove leading/trailing dashes

        return cleanedName + ext;
      } else {
        // For directories, just clean normally
        return segment
          .replace(/[^\p{L}\p{N}$]+/gu, '-')
          .replace(/^-+|-+$/g, '');
      }
    })
    .filter(Boolean)
    .join('/');

  if (!result) {
    throw new Error(`handelize: path "${path}" resulted in an empty string after processing`);
  }

  return result;
}
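
// Illustrative transformations (traced from the rules above):
//
//   handelize("My Notes/Draft #1.md")  // "my-notes/draft-1.md"
//   handelize("a___b___c.md")          // "a/b/c.md" (___ becomes a folder separator)
//   handelize("routes/$id.tsx")        // "routes/$id.tsx" ($ route marker preserved)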

/**
 * Search result: extends DocumentResult with score and source info.
 */
export type SearchResult = DocumentResult & {
  score: number;          // Relevance score (0-1)
  source: "fts" | "vec";  // Search source (full-text or vector)
  chunkPos?: number;      // Character position of the matching chunk (for vector search)
};

/**
 * Ranked result for RRF fusion (simplified, used internally)
 */
export type RankedResult = {
  file: string;
  displayPath: string;
  title: string;
  body: string;
  score: number;
};

/**
 * Error result returned when a document is not found
 */
export type DocumentNotFound = {
  error: "not_found";
  query: string;
  similarFiles: string[];
};

/**
 * Result from multi-get operations
 */
export type MultiGetResult = {
  doc: DocumentResult;
  skipped: false;
} | {
  doc: Pick<DocumentResult, "filepath" | "displayPath">;
  skipped: true;
  skipReason: string;
};

export type CollectionInfo = {
  name: string;
  path: string;
  pattern: string;
  documents: number;
  lastUpdated: string;
};

export type IndexStatus = {
  totalDocuments: number;
  needsEmbedding: number;
  hasVectorIndex: boolean;
  collections: CollectionInfo[];
};

// =============================================================================
// Index health
// =============================================================================

export function getHashesNeedingEmbedding(db: Database): number {
  const result = db.prepare(`
    SELECT COUNT(DISTINCT d.hash) as count
    FROM documents d
    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
    WHERE d.active = 1 AND v.hash IS NULL
  `).get() as { count: number };
  return result.count;
}

export type IndexHealthInfo = {
  needsEmbedding: number;
  totalDocs: number;
  daysStale: number | null;
};

export function getIndexHealth(db: Database): IndexHealthInfo {
  const needsEmbedding = getHashesNeedingEmbedding(db);
  const totalDocs = (db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number }).count;

  const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };
  let daysStale: number | null = null;
  if (mostRecent?.latest) {
    const lastUpdate = new Date(mostRecent.latest);
    daysStale = Math.floor((Date.now() - lastUpdate.getTime()) / (24 * 60 * 60 * 1000));
  }

  return { needsEmbedding, totalDocs, daysStale };
}

// =============================================================================
// Caching
// =============================================================================

export function getCacheKey(url: string, body: object): string {
  const hash = new Bun.CryptoHasher("sha256");
  hash.update(url);
  hash.update(JSON.stringify(body));
  return hash.digest("hex");
}

export function getCachedResult(db: Database, cacheKey: string): string | null {
  const row = db.prepare(`SELECT result FROM llm_cache WHERE hash = ?`).get(cacheKey) as { result: string } | null;
  return row?.result || null;
}

export function setCachedResult(db: Database, cacheKey: string, result: string): void {
  const now = new Date().toISOString();
  db.prepare(`INSERT OR REPLACE INTO llm_cache (hash, result, created_at) VALUES (?, ?, ?)`).run(cacheKey, result, now);
  // Probabilistic pruning: ~1% of writes trim the cache to the 1000 newest entries
  if (Math.random() < 0.01) {
    db.exec(`DELETE FROM llm_cache WHERE hash NOT IN (SELECT hash FROM llm_cache ORDER BY created_at DESC LIMIT 1000)`);
  }
}

export function clearCache(db: Database): void {
  db.exec(`DELETE FROM llm_cache`);
}
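
// Typical read-through pattern (a sketch; the URL, body, and the API-call
// helper are placeholders - the real LLM calls live in llm.ts):
//
//   const key = getCacheKey("http://localhost:8080/v1/embeddings", { input: "hello" });
//   let result = getCachedResult(db, key);
//   if (result === null) {
//     result = await callTheApiSomehow(); // hypothetical
//     setCachedResult(db, key, result);
//   }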

// =============================================================================
// Cleanup and maintenance operations
// =============================================================================

/**
 * Delete cached LLM API responses.
 * Returns the number of cached responses deleted.
 */
export function deleteLLMCache(db: Database): number {
  const result = db.prepare(`DELETE FROM llm_cache`).run();
  return result.changes;
}

/**
 * Remove inactive document records (active = 0).
 * Returns the number of inactive documents deleted.
 */
export function deleteInactiveDocuments(db: Database): number {
  const result = db.prepare(`DELETE FROM documents WHERE active = 0`).run();
  return result.changes;
}

/**
 * Remove orphaned content hashes that are not referenced by any active document.
 * Returns the number of orphaned content hashes deleted.
 */
export function cleanupOrphanedContent(db: Database): number {
  const result = db.prepare(`
    DELETE FROM content
    WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
  `).run();
  return result.changes;
}

/**
 * Remove orphaned vector embeddings that are not referenced by any active document.
 * Returns the number of orphaned embedding chunks deleted.
 */
export function cleanupOrphanedVectors(db: Database): number {
  // Check if the vectors_vec table exists
  const tableExists = db.prepare(`
    SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'
  `).get();

  if (!tableExists) {
    return 0;
  }

  // Count orphaned vectors first
  const countResult = db.prepare(`
    SELECT COUNT(*) as c FROM content_vectors cv
    WHERE NOT EXISTS (
      SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
    )
  `).get() as { c: number };

  if (countResult.c === 0) {
    return 0;
  }

  // Delete from vectors_vec first
  db.exec(`
    DELETE FROM vectors_vec WHERE hash_seq IN (
      SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
      WHERE NOT EXISTS (
        SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
      )
    )
  `);

  // Then delete from content_vectors
  db.exec(`
    DELETE FROM content_vectors WHERE hash NOT IN (
      SELECT hash FROM documents WHERE active = 1
    )
  `);

  return countResult.c;
}

/**
 * Run VACUUM to reclaim unused space in the database.
 * This operation rebuilds the database file to eliminate fragmentation.
 */
export function vacuumDatabase(db: Database): void {
  db.exec(`VACUUM`);
}

// =============================================================================
// Document helpers
// =============================================================================

export async function hashContent(content: string): Promise<string> {
  const hash = new Bun.CryptoHasher("sha256");
  hash.update(content);
  return hash.digest("hex");
}

const titleExtractors: Record<string, (content: string) => string | null> = {
  '.md': (content) => {
    const match = content.match(/^##?\s+(.+)$/m);
    if (match) {
      const title = (match[1] ?? "").trim();
      if (title === "📝 Notes" || title === "Notes") {
        const nextMatch = content.match(/^##\s+(.+)$/m);
        if (nextMatch?.[1]) return nextMatch[1].trim();
      }
      return title;
    }
    return null;
  },
  '.org': (content) => {
    const titleProp = content.match(/^#\+TITLE:\s*(.+)$/im);
    if (titleProp?.[1]) return titleProp[1].trim();
    const heading = content.match(/^\*+\s+(.+)$/m);
    if (heading?.[1]) return heading[1].trim();
    return null;
  },
};

export function extractTitle(content: string, filename: string): string {
  const ext = filename.slice(filename.lastIndexOf('.')).toLowerCase();
  const extractor = titleExtractors[ext];
  if (extractor) {
    const title = extractor(content);
    if (title) return title;
  }
  return filename.replace(/\.[^.]+$/, "").split("/").pop() || filename;
}
|
|
1283
|
+
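
// Examples (illustrative only) of how extractTitle falls through its strategies:
//
//   extractTitle("# Quarterly Plan\n...", "plan.md");        // => "Quarterly Plan"
//   extractTitle("#+TITLE: Roadmap\n...", "roadmap.org");    // => "Roadmap"
//   extractTitle("no headings here", "notes/2024/ideas.md"); // => "ideas" (filename fallback)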
// =============================================================================
// Document indexing operations
// =============================================================================

/**
 * Insert content into the content table (content-addressable storage).
 * Uses INSERT OR IGNORE so duplicate hashes are skipped.
 */
export function insertContent(db: Database, hash: string, content: string, createdAt: string): void {
  db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
    .run(hash, content, createdAt);
}

/**
 * Insert a new document into the documents table.
 */
export function insertDocument(
  db: Database,
  collectionName: string,
  path: string,
  title: string,
  hash: string,
  createdAt: string,
  modifiedAt: string
): void {
  db.prepare(`
    INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active)
    VALUES (?, ?, ?, ?, ?, ?, 1)
    ON CONFLICT(collection, path) DO UPDATE SET
      title = excluded.title,
      hash = excluded.hash,
      modified_at = excluded.modified_at,
      active = 1
  `).run(collectionName, path, title, hash, createdAt, modifiedAt);
}

/**
 * Find an active document by collection name and path.
 */
export function findActiveDocument(
  db: Database,
  collectionName: string,
  path: string
): { id: number; hash: string; title: string } | null {
  return db.prepare(`
    SELECT id, hash, title FROM documents
    WHERE collection = ? AND path = ? AND active = 1
  `).get(collectionName, path) as { id: number; hash: string; title: string } | null;
}

/**
 * Update the title and modified_at timestamp for a document.
 */
export function updateDocumentTitle(
  db: Database,
  documentId: number,
  title: string,
  modifiedAt: string
): void {
  db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`)
    .run(title, modifiedAt, documentId);
}

/**
 * Update an existing document's hash, title, and modified_at timestamp.
 * Used when content changes but the file path stays the same.
 */
export function updateDocument(
  db: Database,
  documentId: number,
  title: string,
  hash: string,
  modifiedAt: string
): void {
  db.prepare(`UPDATE documents SET title = ?, hash = ?, modified_at = ? WHERE id = ?`)
    .run(title, hash, modifiedAt, documentId);
}

/**
 * Deactivate a document (mark as inactive but don't delete).
 */
export function deactivateDocument(db: Database, collectionName: string, path: string): void {
  db.prepare(`UPDATE documents SET active = 0 WHERE collection = ? AND path = ? AND active = 1`)
    .run(collectionName, path);
}

/**
 * Get all active document paths for a collection.
 */
export function getActiveDocumentPaths(db: Database, collectionName: string): string[] {
  const rows = db.prepare(`
    SELECT path FROM documents WHERE collection = ? AND active = 1
  `).all(collectionName) as { path: string }[];
  return rows.map(r => r.path);
}
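
// Illustrative indexing flow (a sketch, not the package's actual indexer;
// "notes", absPath, and relPath are placeholders): content is stored once per
// hash and the documents row points at it, so re-indexing an unchanged file
// is a no-op.
//
//   const body = await Bun.file(absPath).text();
//   const hash = await hashContent(body);
//   const now = new Date().toISOString();
//   const existing = findActiveDocument(db, "notes", relPath);
//   if (!existing) {
//     insertContent(db, hash, body, now);
//     insertDocument(db, "notes", relPath, extractTitle(body, relPath), hash, now, now);
//   } else if (existing.hash !== hash) {
//     insertContent(db, hash, body, now);
//     updateDocument(db, existing.id, extractTitle(body, relPath), hash, now);
//   }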
export { formatQueryForEmbedding, formatDocForEmbedding };

export function chunkDocument(
  content: string,
  maxChars: number = CHUNK_SIZE_CHARS,
  overlapChars: number = CHUNK_OVERLAP_CHARS,
  windowChars: number = CHUNK_WINDOW_CHARS
): { text: string; pos: number }[] {
  if (content.length <= maxChars) {
    return [{ text: content, pos: 0 }];
  }

  // Pre-scan all break points and code fences once
  const breakPoints = scanBreakPoints(content);
  const codeFences = findCodeFences(content);

  const chunks: { text: string; pos: number }[] = [];
  let charPos = 0;

  while (charPos < content.length) {
    // Calculate target end position for this chunk
    const targetEndPos = Math.min(charPos + maxChars, content.length);

    let endPos = targetEndPos;

    // If not at the end, find the best break point
    if (endPos < content.length) {
      // Find best cutoff using scored algorithm
      const bestCutoff = findBestCutoff(
        breakPoints,
        targetEndPos,
        windowChars,
        0.7,
        codeFences
      );

      // Only use the cutoff if it's within our current chunk
      if (bestCutoff > charPos && bestCutoff <= targetEndPos) {
        endPos = bestCutoff;
      }
    }

    // Ensure we make progress
    if (endPos <= charPos) {
      endPos = Math.min(charPos + maxChars, content.length);
    }

    chunks.push({ text: content.slice(charPos, endPos), pos: charPos });

    // Move forward, but overlap with previous chunk.
    // For the last chunk, don't overlap (just go to the end).
    if (endPos >= content.length) {
      break;
    }
    charPos = endPos - overlapChars;
    const lastChunkPos = chunks.at(-1)!.pos;
    if (charPos <= lastChunkPos) {
      // Prevent infinite loop - move forward at least a bit
      charPos = endPos;
    }
  }

  return chunks;
}
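
// Worked example of the stepping above (numbers follow from the defaults, not
// new behavior): with maxChars = 3600 and overlapChars = 540, a 9000-char
// document with no better break points chunks roughly as
//
//   pos 0    .. 3600   (chunk 1)
//   pos 3060 .. 6660   (chunk 2: starts 540 chars before chunk 1 ended)
//   pos 6120 .. 9000   (chunk 3: final chunk runs to the end, no trailing overlap)
//
// findBestCutoff can pull each boundary earlier (toward a heading or blank
// line), which is why CHUNK_SIZE_TOKENS leaves headroom for natural breaks.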
/**
 * Chunk a document by actual token count using the LLM tokenizer.
 * More accurate than character-based chunking but requires async.
 */
export async function chunkDocumentByTokens(
  content: string,
  maxTokens: number = CHUNK_SIZE_TOKENS,
  overlapTokens: number = CHUNK_OVERLAP_TOKENS,
  windowTokens: number = CHUNK_WINDOW_TOKENS
): Promise<{ text: string; pos: number; tokens: number }[]> {
  const llm = getDefaultLlamaCpp();

  // Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3).
  // If chunks exceed the limit, they'll be re-split with the actual ratio.
  const avgCharsPerToken = 3;
  const maxChars = maxTokens * avgCharsPerToken;
  const overlapChars = overlapTokens * avgCharsPerToken;
  const windowChars = windowTokens * avgCharsPerToken;

  // Chunk in character space with conservative estimate
  const charChunks = chunkDocument(content, maxChars, overlapChars, windowChars);

  // Tokenize and split any chunks that still exceed limit
  const results: { text: string; pos: number; tokens: number }[] = [];

  for (const chunk of charChunks) {
    const tokens = await llm.tokenize(chunk.text);

    if (tokens.length <= maxTokens) {
      results.push({ text: chunk.text, pos: chunk.pos, tokens: tokens.length });
    } else {
      // Chunk is still too large - split it further.
      // Use actual token count to estimate a better char limit.
      const actualCharsPerToken = chunk.text.length / tokens.length;
      const safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95); // 5% safety margin

      const subChunks = chunkDocument(
        chunk.text,
        safeMaxChars,
        Math.floor(overlapChars * actualCharsPerToken / 2),
        Math.floor(windowChars * actualCharsPerToken / 2)
      );

      for (const subChunk of subChunks) {
        const subTokens = await llm.tokenize(subChunk.text);
        results.push({
          text: subChunk.text,
          pos: chunk.pos + subChunk.pos,
          tokens: subTokens.length,
        });
      }
    }
  }

  return results;
}
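
// Worked example of the re-split above: a 2700-char chunk that tokenizes to
// 1350 tokens (code-heavy, ~2 chars/token) exceeds maxTokens = 900. The
// actual ratio 2700 / 1350 = 2.0 gives safeMaxChars = floor(900 * 2.0 * 0.95)
// = 1710, so the second pass re-chunks in character space at a width that
// should land under the token budget; each sub-chunk is tokenized again to
// record its exact count.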
// =============================================================================
// Fuzzy matching
// =============================================================================

function levenshtein(a: string, b: string): number {
  const m = a.length, n = b.length;
  if (m === 0) return n;
  if (n === 0) return m;
  const dp: number[][] = Array.from({ length: m + 1 }, () => Array(n + 1).fill(0));
  for (let i = 0; i <= m; i++) dp[i]![0] = i;
  for (let j = 0; j <= n; j++) dp[0]![j] = j;
  for (let i = 1; i <= m; i++) {
    for (let j = 1; j <= n; j++) {
      const cost = a[i - 1] === b[j - 1] ? 0 : 1;
      dp[i]![j] = Math.min(
        dp[i - 1]![j]! + 1,
        dp[i]![j - 1]! + 1,
        dp[i - 1]![j - 1]! + cost
      );
    }
  }
  return dp[m]![n]!;
}

/**
 * Normalize a docid input by stripping surrounding quotes and leading #.
 * Handles: "#abc123", 'abc123', "abc123", #abc123, abc123
 * Returns the bare hex string.
 */
export function normalizeDocid(docid: string): string {
  let normalized = docid.trim();

  // Strip surrounding quotes (single or double)
  if ((normalized.startsWith('"') && normalized.endsWith('"')) ||
      (normalized.startsWith("'") && normalized.endsWith("'"))) {
    normalized = normalized.slice(1, -1);
  }

  // Strip leading # if present
  if (normalized.startsWith('#')) {
    normalized = normalized.slice(1);
  }

  return normalized;
}

/**
 * Check if a string looks like a docid reference.
 * Accepts: #abc123, abc123, "#abc123", "abc123", '#abc123', 'abc123'
 * Returns true if the normalized form is a valid hex string of 6+ chars.
 */
export function isDocid(input: string): boolean {
  const normalized = normalizeDocid(input);
  // Must be at least 6 hex characters
  return normalized.length >= 6 && /^[a-f0-9]+$/i.test(normalized);
}
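
// Examples (illustrative only) of the lenient docid handling above:
//
//   normalizeDocid('"#a1b2c3"'); // => "a1b2c3"  (quotes and # stripped)
//   isDocid("#a1b2c3");          // => true      (6 hex chars)
//   isDocid("abc12");            // => false     (too short)
//   isDocid("readme.md");        // => false     (not hex)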
/**
 * Find a document by its short docid (first 6 characters of hash).
 * Returns the document's virtual path if found, null otherwise.
 * If multiple documents match the same short hash (collision), returns the first one.
 *
 * Accepts lenient input: #abc123, abc123, "#abc123", "abc123"
 */
export function findDocumentByDocid(db: Database, docid: string): { filepath: string; hash: string } | null {
  const shortHash = normalizeDocid(docid);

  if (shortHash.length < 1) return null;

  // Look up documents where hash starts with the short hash
  const doc = db.prepare(`
    SELECT 'qmd://' || d.collection || '/' || d.path as filepath, d.hash
    FROM documents d
    WHERE d.hash LIKE ? AND d.active = 1
    LIMIT 1
  `).get(`${shortHash}%`) as { filepath: string; hash: string } | null;

  return doc;
}

export function findSimilarFiles(db: Database, query: string, maxDistance: number = 3, limit: number = 5): string[] {
  const allFiles = db.prepare(`
    SELECT d.path
    FROM documents d
    WHERE d.active = 1
  `).all() as { path: string }[];
  const queryLower = query.toLowerCase();
  const scored = allFiles
    .map(f => ({ path: f.path, dist: levenshtein(f.path.toLowerCase(), queryLower) }))
    .filter(f => f.dist <= maxDistance)
    .sort((a, b) => a.dist - b.dist)
    .slice(0, limit);
  return scored.map(f => f.path);
}

export function matchFilesByGlob(db: Database, pattern: string): { filepath: string; displayPath: string; bodyLength: number }[] {
  const allFiles = db.prepare(`
    SELECT
      'qmd://' || d.collection || '/' || d.path as virtual_path,
      LENGTH(content.doc) as body_length,
      d.path,
      d.collection
    FROM documents d
    JOIN content ON content.hash = d.hash
    WHERE d.active = 1
  `).all() as { virtual_path: string; body_length: number; path: string; collection: string }[];

  const glob = new Glob(pattern);
  return allFiles
    .filter(f => glob.match(f.virtual_path) || glob.match(f.path))
    .map(f => ({
      filepath: f.virtual_path, // Virtual path for precise lookup
      displayPath: f.path,      // Relative path for display
      bodyLength: f.body_length
    }));
}
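
// Example (illustrative; "notes" and "talks" are placeholder names): patterns
// are tested against both the virtual path and the collection-relative path,
// so either form works.
//
//   matchFilesByGlob(db, "qmd://notes/**/*.md"); // match on virtual path
//   matchFilesByGlob(db, "talks/2024/*.md");     // match on relative path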
// =============================================================================
// Context
// =============================================================================

/**
 * Get context for a file path using hierarchical inheritance.
 * Contexts are collection-scoped and inherit from parent directories.
 * For example, context at "/talks" applies to "/talks/2024/keynote.md".
 *
 * @param db Database instance (unused - kept for compatibility)
 * @param collectionName Collection name
 * @param path Relative path within the collection
 * @returns Context string or null if no context is defined
 */
export function getContextForPath(db: Database, collectionName: string, path: string): string | null {
  const config = collectionsLoadConfig();
  const coll = getCollection(collectionName);

  if (!coll) return null;

  // Collect ALL matching contexts (global + all path prefixes)
  const contexts: string[] = [];

  // Add global context if present
  if (config.global_context) {
    contexts.push(config.global_context);
  }

  // Add all matching path contexts (from most general to most specific)
  if (coll.context) {
    const normalizedPath = path.startsWith("/") ? path : `/${path}`;

    // Collect all matching prefixes
    const matchingContexts: { prefix: string; context: string }[] = [];
    for (const [prefix, context] of Object.entries(coll.context)) {
      const normalizedPrefix = prefix.startsWith("/") ? prefix : `/${prefix}`;
      if (normalizedPath.startsWith(normalizedPrefix)) {
        matchingContexts.push({ prefix: normalizedPrefix, context });
      }
    }

    // Sort by prefix length (shortest/most general first)
    matchingContexts.sort((a, b) => a.prefix.length - b.prefix.length);

    // Add all matching contexts
    for (const match of matchingContexts) {
      contexts.push(match.context);
    }
  }

  // Join all contexts with double newline
  return contexts.length > 0 ? contexts.join('\n\n') : null;
}
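
// Example (illustrative) of the inheritance order above: with a global
// context, a context on "/talks", and one on "/talks/2024" (placeholder
// prefixes), a file at "talks/2024/keynote.md" gets all three, joined
// general-to-specific with blank lines between:
//
//   <global context>
//
//   <context for /talks>
//
//   <context for /talks/2024>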
/**
 * Get context for a file path (virtual or filesystem).
 * Resolves the collection and relative path using the YAML collections config.
 */
export function getContextForFile(db: Database, filepath: string): string | null {
  // Handle undefined or null filepath
  if (!filepath) return null;

  // Get all collections from YAML config
  const collections = collectionsListCollections();
  const config = collectionsLoadConfig();

  // Parse virtual path format: qmd://collection/path
  let collectionName: string | null = null;
  let relativePath: string | null = null;

  const parsedVirtual = filepath.startsWith('qmd://') ? parseVirtualPath(filepath) : null;
  if (parsedVirtual) {
    collectionName = parsedVirtual.collectionName;
    relativePath = parsedVirtual.path;
  } else {
    // Filesystem path: find which collection this absolute path belongs to
    for (const coll of collections) {
      // Skip collections with missing paths
      if (!coll || !coll.path) continue;

      if (filepath.startsWith(coll.path + '/') || filepath === coll.path) {
        collectionName = coll.name;
        // Extract relative path
        relativePath = filepath.startsWith(coll.path + '/')
          ? filepath.slice(coll.path.length + 1)
          : '';
        break;
      }
    }

    if (!collectionName || relativePath === null) return null;
  }

  // Get the collection from config
  const coll = getCollection(collectionName);
  if (!coll) return null;

  // Verify this document exists in the database
  const doc = db.prepare(`
    SELECT d.path
    FROM documents d
    WHERE d.collection = ? AND d.path = ? AND d.active = 1
    LIMIT 1
  `).get(collectionName, relativePath) as { path: string } | null;

  if (!doc) return null;

  // Collect ALL matching contexts (global + all path prefixes)
  const contexts: string[] = [];

  // Add global context if present
  if (config.global_context) {
    contexts.push(config.global_context);
  }

  // Add all matching path contexts (from most general to most specific)
  if (coll.context) {
    const normalizedPath = relativePath.startsWith("/") ? relativePath : `/${relativePath}`;

    // Collect all matching prefixes
    const matchingContexts: { prefix: string; context: string }[] = [];
    for (const [prefix, context] of Object.entries(coll.context)) {
      const normalizedPrefix = prefix.startsWith("/") ? prefix : `/${prefix}`;
      if (normalizedPath.startsWith(normalizedPrefix)) {
        matchingContexts.push({ prefix: normalizedPrefix, context });
      }
    }

    // Sort by prefix length (shortest/most general first)
    matchingContexts.sort((a, b) => a.prefix.length - b.prefix.length);

    // Add all matching contexts
    for (const match of matchingContexts) {
      contexts.push(match.context);
    }
  }

  // Join all contexts with double newline
  return contexts.length > 0 ? contexts.join('\n\n') : null;
}

/**
 * Get collection by name from YAML config.
 * Returns collection metadata from ~/.config/qmd/index.yml
 */
export function getCollectionByName(db: Database, name: string): { name: string; pwd: string; glob_pattern: string } | null {
  const collection = getCollection(name);
  if (!collection) return null;

  return {
    name: collection.name,
    pwd: collection.path,
    glob_pattern: collection.pattern,
  };
}

/**
 * List all collections with document counts from database.
 * Merges YAML config with database statistics.
 */
export function listCollections(db: Database): { name: string; pwd: string; glob_pattern: string; doc_count: number; active_count: number; last_modified: string | null }[] {
  const collections = collectionsListCollections();

  // Get document counts from database for each collection
  const result = collections.map(coll => {
    const stats = db.prepare(`
      SELECT
        COUNT(d.id) as doc_count,
        SUM(CASE WHEN d.active = 1 THEN 1 ELSE 0 END) as active_count,
        MAX(d.modified_at) as last_modified
      FROM documents d
      WHERE d.collection = ?
    `).get(coll.name) as { doc_count: number; active_count: number; last_modified: string | null } | null;

    return {
      name: coll.name,
      pwd: coll.path,
      glob_pattern: coll.pattern,
      doc_count: stats?.doc_count || 0,
      active_count: stats?.active_count || 0,
      last_modified: stats?.last_modified || null,
    };
  });

  return result;
}

/**
 * Remove a collection and clean up its documents.
 * Uses collections.ts to remove from YAML config and cleans up database.
 */
export function removeCollection(db: Database, collectionName: string): { deletedDocs: number; cleanedHashes: number } {
  // Delete documents from database
  const docResult = db.prepare(`DELETE FROM documents WHERE collection = ?`).run(collectionName);

  // Clean up orphaned content hashes
  const cleanupResult = db.prepare(`
    DELETE FROM content
    WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
  `).run();

  // Remove from YAML config (returns true if found and removed)
  collectionsRemoveCollection(collectionName);

  return {
    deletedDocs: docResult.changes,
    cleanedHashes: cleanupResult.changes
  };
}

/**
 * Rename a collection.
 * Updates both YAML config and database documents table.
 */
export function renameCollection(db: Database, oldName: string, newName: string): void {
  // Update all documents with the new collection name in database
  db.prepare(`UPDATE documents SET collection = ? WHERE collection = ?`)
    .run(newName, oldName);

  // Rename in YAML config
  collectionsRenameCollection(oldName, newName);
}
// =============================================================================
// Context Management Operations
// =============================================================================

/**
 * Insert or update a context for a specific collection and path prefix.
 */
export function insertContext(db: Database, collectionId: number, pathPrefix: string, context: string): void {
  // Get collection name from ID
  const coll = db.prepare(`SELECT name FROM collections WHERE id = ?`).get(collectionId) as { name: string } | null;
  if (!coll) {
    throw new Error(`Collection with id ${collectionId} not found`);
  }

  // Use collections.ts to add context
  collectionsAddContext(coll.name, pathPrefix, context);
}

/**
 * Delete a context for a specific collection and path prefix.
 * Returns the number of contexts deleted.
 */
export function deleteContext(db: Database, collectionName: string, pathPrefix: string): number {
  // Use collections.ts to remove context
  const success = collectionsRemoveContext(collectionName, pathPrefix);
  return success ? 1 : 0;
}
/**
 * Delete all global contexts (contexts with empty path_prefix).
 * Returns the number of contexts deleted.
 */
export function deleteGlobalContexts(db: Database): number {
  let deletedCount = 0;

  // Remove global context (counted unconditionally, even if none was set)
  setGlobalContext(undefined);
  deletedCount++;

  // Remove root context (empty string) from all collections
  const collections = collectionsListCollections();
  for (const coll of collections) {
    const success = collectionsRemoveContext(coll.name, '');
    if (success) {
      deletedCount++;
    }
  }

  return deletedCount;
}
/**
 * List all contexts, grouped by collection.
 * Returns contexts ordered by collection name, then by path prefix length (longest first).
 */
export function listPathContexts(db: Database): { collection_name: string; path_prefix: string; context: string }[] {
  const allContexts = collectionsListAllContexts();

  // Convert to expected format and sort
  return allContexts.map(ctx => ({
    collection_name: ctx.collection,
    path_prefix: ctx.path,
    context: ctx.context,
  })).sort((a, b) => {
    // Sort by collection name first
    if (a.collection_name !== b.collection_name) {
      return a.collection_name.localeCompare(b.collection_name);
    }
    // Then by path prefix length (longest first)
    if (a.path_prefix.length !== b.path_prefix.length) {
      return b.path_prefix.length - a.path_prefix.length;
    }
    // Then alphabetically
    return a.path_prefix.localeCompare(b.path_prefix);
  });
}

/**
 * Get all collections (name only - from YAML config).
 */
export function getAllCollections(db: Database): { name: string }[] {
  const collections = collectionsListCollections();
  return collections.map(c => ({ name: c.name }));
}

/**
 * Check which collections don't have any context defined.
 * Returns collections that have no context entries at all (not even root context).
 */
export function getCollectionsWithoutContext(db: Database): { name: string; pwd: string; doc_count: number }[] {
  // Get all collections from YAML config
  const yamlCollections = collectionsListCollections();

  // Filter to those without context
  const collectionsWithoutContext: { name: string; pwd: string; doc_count: number }[] = [];

  for (const coll of yamlCollections) {
    // Check if collection has any context
    if (!coll.context || Object.keys(coll.context).length === 0) {
      // Get doc count from database
      const stats = db.prepare(`
        SELECT COUNT(d.id) as doc_count
        FROM documents d
        WHERE d.collection = ? AND d.active = 1
      `).get(coll.name) as { doc_count: number } | null;

      collectionsWithoutContext.push({
        name: coll.name,
        pwd: coll.path,
        doc_count: stats?.doc_count || 0,
      });
    }
  }

  return collectionsWithoutContext.sort((a, b) => a.name.localeCompare(b.name));
}

/**
 * Get top-level directories in a collection that don't have context.
 * Useful for suggesting where context might be needed.
 */
export function getTopLevelPathsWithoutContext(db: Database, collectionName: string): string[] {
  // Get all paths in the collection from database
  const paths = db.prepare(`
    SELECT DISTINCT path FROM documents
    WHERE collection = ? AND active = 1
  `).all(collectionName) as { path: string }[];

  // Get existing contexts for this collection from YAML
  const yamlColl = getCollection(collectionName);
  if (!yamlColl) return [];

  const contextPrefixes = new Set<string>();
  if (yamlColl.context) {
    for (const prefix of Object.keys(yamlColl.context)) {
      contextPrefixes.add(prefix);
    }
  }

  // Extract top-level directories (first path component)
  const topLevelDirs = new Set<string>();
  for (const { path } of paths) {
    const parts = path.split('/').filter(Boolean);
    if (parts.length > 1) {
      const dir = parts[0];
      if (dir) topLevelDirs.add(dir);
    }
  }

  // Filter out directories that already have context (exact or parent)
  const missing: string[] = [];
  for (const dir of topLevelDirs) {
    let hasContext = false;

    // Check if this dir or any parent has context
    for (const prefix of contextPrefixes) {
      if (prefix === '' || prefix === dir || dir.startsWith(prefix + '/')) {
        hasContext = true;
        break;
      }
    }

    if (!hasContext) {
      missing.push(dir);
    }
  }

  return missing.sort();
}
// =============================================================================
// FTS Search
// =============================================================================

function sanitizeFTS5Term(term: string): string {
  return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase();
}

function buildFTS5Query(query: string): string | null {
  const terms = query.split(/\s+/)
    .map(t => sanitizeFTS5Term(t))
    .filter(t => t.length > 0);
  if (terms.length === 0) return null;
  if (terms.length === 1) return `"${terms[0]}"*`;
  return terms.map(t => `"${t}"*`).join(' AND ');
}
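
// Examples (illustrative) of the FTS5 query construction above — terms are
// stripped to letters/digits/apostrophes, quoted, and prefix-matched:
//
//   buildFTS5Query("vector search"); // => '"vector"* AND "search"*'
//   buildFTS5Query("don't panic!");  // => '"don't"* AND "panic"*'
//   buildFTS5Query("!!! ???");       // => null (nothing survives sanitizing)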
export function searchFTS(db: Database, query: string, limit: number = 20, collectionId?: number): SearchResult[] {
  const ftsQuery = buildFTS5Query(query);
  if (!ftsQuery) return [];

  let sql = `
    SELECT
      'qmd://' || d.collection || '/' || d.path as filepath,
      d.collection || '/' || d.path as display_path,
      d.title,
      content.doc as body,
      d.hash,
      bm25(documents_fts, 10.0, 1.0) as bm25_score
    FROM documents_fts f
    JOIN documents d ON d.id = f.rowid
    JOIN content ON content.hash = d.hash
    WHERE documents_fts MATCH ? AND d.active = 1
  `;
  const params: (string | number)[] = [ftsQuery];

  if (collectionId) {
    // Note: collectionId is a legacy parameter that should be phased out.
    // Collections are now managed in YAML. For now, we interpret it as a collection name filter.
    // This code path is likely unused as collection filtering should be done at CLI level.
    sql += ` AND d.collection = ?`;
    params.push(String(collectionId));
  }

  // bm25 lower is better; sort ascending.
  sql += ` ORDER BY bm25_score ASC LIMIT ?`;
  params.push(limit);

  const rows = db.prepare(sql).all(...params) as { filepath: string; display_path: string; title: string; body: string; hash: string; bm25_score: number }[];
  return rows.map(row => {
    const collectionName = row.filepath.split('//')[1]?.split('/')[0] || "";
    // Convert bm25 (negative, lower is better) into a stable [0..1) score where higher is better.
    // FTS5 BM25 scores are negative (e.g., -10 is strong, -2 is weak).
    // |x| / (1 + |x|) maps: strong(-10)→0.91, medium(-2)→0.67, weak(-0.5)→0.33, none(0)→0.
    // Monotonic and query-independent — no per-query normalization needed.
    const score = Math.abs(row.bm25_score) / (1 + Math.abs(row.bm25_score));
    return {
      filepath: row.filepath,
      displayPath: row.display_path,
      title: row.title,
      hash: row.hash,
      docid: getDocid(row.hash),
      collectionName,
      modifiedAt: "", // Not available in FTS query
      bodyLength: row.body.length,
      body: row.body,
      context: getContextForFile(db, row.filepath),
      score,
      source: "fts" as const,
    };
  });
}
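
// Usage sketch (illustrative; the query string is a placeholder): FTS scores
// come back normalized to [0..1), so they can be fused with vector results
// via reciprocalRankFusion further down without per-query rescaling.
//
//   const hits = searchFTS(db, "release checklist", 10);
//   for (const h of hits) console.log(h.score.toFixed(2), h.displayPath);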
// =============================================================================
// Vector Search
// =============================================================================

export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession): Promise<SearchResult[]> {
  const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
  if (!tableExists) return [];

  const embedding = await getEmbedding(query, model, true, session);
  if (!embedding) return [];

  // IMPORTANT: We use a two-step query approach here because sqlite-vec virtual tables
  // hang indefinitely when combined with JOINs in the same query. Do NOT try to
  // "optimize" this by combining into a single query with JOINs - it will break.
  // See: https://github.com/tobi/qmd/pull/23

  // Step 1: Get vector matches from sqlite-vec (no JOINs allowed)
  const vecResults = db.prepare(`
    SELECT hash_seq, distance
    FROM vectors_vec
    WHERE embedding MATCH ? AND k = ?
  `).all(new Float32Array(embedding), limit * 3) as { hash_seq: string; distance: number }[];

  if (vecResults.length === 0) return [];

  // Step 2: Get chunk info and document data
  const hashSeqs = vecResults.map(r => r.hash_seq);
  const distanceMap = new Map(vecResults.map(r => [r.hash_seq, r.distance]));

  // Build query for document lookup
  const placeholders = hashSeqs.map(() => '?').join(',');
  let docSql = `
    SELECT
      cv.hash || '_' || cv.seq as hash_seq,
      cv.hash,
      cv.pos,
      'qmd://' || d.collection || '/' || d.path as filepath,
      d.collection || '/' || d.path as display_path,
      d.title,
      content.doc as body
    FROM content_vectors cv
    JOIN documents d ON d.hash = cv.hash AND d.active = 1
    JOIN content ON content.hash = d.hash
    WHERE cv.hash || '_' || cv.seq IN (${placeholders})
  `;
  const params: string[] = [...hashSeqs];

  if (collectionName) {
    docSql += ` AND d.collection = ?`;
    params.push(collectionName);
  }

  const docRows = db.prepare(docSql).all(...params) as {
    hash_seq: string; hash: string; pos: number; filepath: string;
    display_path: string; title: string; body: string;
  }[];

  // Combine with distances and dedupe by filepath
  const seen = new Map<string, { row: typeof docRows[0]; bestDist: number }>();
  for (const row of docRows) {
    const distance = distanceMap.get(row.hash_seq) ?? 1;
    const existing = seen.get(row.filepath);
    if (!existing || distance < existing.bestDist) {
      seen.set(row.filepath, { row, bestDist: distance });
    }
  }

  return Array.from(seen.values())
    .sort((a, b) => a.bestDist - b.bestDist)
    .slice(0, limit)
    .map(({ row, bestDist }) => {
      const collectionName = row.filepath.split('//')[1]?.split('/')[0] || "";
      return {
        filepath: row.filepath,
        displayPath: row.display_path,
        title: row.title,
        hash: row.hash,
        docid: getDocid(row.hash),
        collectionName,
        modifiedAt: "", // Not available in vec query
        bodyLength: row.body.length,
        body: row.body,
        context: getContextForFile(db, row.filepath),
        score: 1 - bestDist, // Cosine similarity = 1 - cosine distance
        source: "vec" as const,
        chunkPos: row.pos,
      };
    });
}
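
// Usage sketch (illustrative; the query string is a placeholder): semantic
// lookup against the default embedding model. Scores are cosine similarities
// (1 - distance), deduped to the best-matching chunk per file.
//
//   const hits = await searchVec(db, "how do we rotate signing keys?", DEFAULT_EMBED_MODEL, 10);
//   for (const h of hits) console.log(h.score.toFixed(3), h.displayPath);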
// =============================================================================
// Embeddings
// =============================================================================

async function getEmbedding(text: string, model: string, isQuery: boolean, session?: ILLMSession): Promise<number[] | null> {
  // Format text using the appropriate prompt template
  const formattedText = isQuery ? formatQueryForEmbedding(text) : formatDocForEmbedding(text);
  const result = session
    ? await session.embed(formattedText, { model, isQuery })
    : await getDefaultLlamaCpp().embed(formattedText, { model, isQuery });
  return result?.embedding || null;
}

/**
 * Get all unique content hashes that need embeddings (from active documents).
 * Returns hash, document body, and a sample path for display purposes.
 */
export function getHashesForEmbedding(db: Database): { hash: string; body: string; path: string }[] {
  return db.prepare(`
    SELECT d.hash, c.doc as body, MIN(d.path) as path
    FROM documents d
    JOIN content c ON d.hash = c.hash
    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
    WHERE d.active = 1 AND v.hash IS NULL
    GROUP BY d.hash
  `).all() as { hash: string; body: string; path: string }[];
}

/**
 * Clear all embeddings from the database (force re-index).
 * Deletes all rows from content_vectors and drops the vectors_vec table.
 */
export function clearAllEmbeddings(db: Database): void {
  db.exec(`DELETE FROM content_vectors`);
  db.exec(`DROP TABLE IF EXISTS vectors_vec`);
}

/**
 * Insert a single embedding into both content_vectors and vectors_vec tables.
 * The hash_seq key is formatted as "hash_seq" for the vectors_vec table.
 */
export function insertEmbedding(
  db: Database,
  hash: string,
  seq: number,
  pos: number,
  embedding: Float32Array,
  model: string,
  embeddedAt: string
): void {
  const hashSeq = `${hash}_${seq}`;
  const insertVecStmt = db.prepare(`INSERT OR REPLACE INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
  const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`);

  insertVecStmt.run(hashSeq, embedding);
  insertContentVectorStmt.run(hash, seq, pos, model, embeddedAt);
}
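
// Illustrative embedding pass (a sketch, not the package's actual indexer):
// pending hashes come from getHashesForEmbedding, each body is chunked, and
// every chunk is stored under (hash, seq) with its character offset.
//
//   for (const { hash, body } of getHashesForEmbedding(db)) {
//     const chunks = await chunkDocumentByTokens(body);
//     for (let seq = 0; seq < chunks.length; seq++) {
//       const chunk = chunks[seq]!;
//       const vec = await getEmbedding(chunk.text, DEFAULT_EMBED_MODEL, false);
//       if (!vec) continue;
//       insertEmbedding(db, hash, seq, chunk.pos, new Float32Array(vec),
//                       DEFAULT_EMBED_MODEL, new Date().toISOString());
//     }
//   }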
// =============================================================================
// Query expansion
// =============================================================================

export async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db: Database): Promise<ExpandedQuery[]> {
  // Check cache first — stored as JSON preserving types
  const cacheKey = getCacheKey("expandQuery", { query, model });
  const cached = getCachedResult(db, cacheKey);
  if (cached) {
    try {
      return JSON.parse(cached) as ExpandedQuery[];
    } catch {
      // Old cache format (pre-typed, newline-separated text) — re-expand
    }
  }

  const llm = getDefaultLlamaCpp();
  // Note: LlamaCpp uses a hardcoded model; the `model` parameter is currently ignored.
  const results = await llm.expandQuery(query);

  // Map Queryable[] → ExpandedQuery[] (same shape, decoupled from llm.ts internals).
  // Filter out entries that duplicate the original query text.
  const expanded: ExpandedQuery[] = results
    .filter(r => r.text !== query)
    .map(r => ({ type: r.type, text: r.text }));

  if (expanded.length > 0) {
    setCachedResult(db, cacheKey, JSON.stringify(expanded));
  }

  return expanded;
}
// =============================================================================
// Reranking
// =============================================================================

export async function rerank(query: string, documents: { file: string; text: string }[], model: string = DEFAULT_RERANK_MODEL, db: Database): Promise<{ file: string; score: number }[]> {
  const cachedResults: Map<string, number> = new Map();
  const uncachedDocs: RerankDocument[] = [];

  // Check cache for each document.
  // Cache key includes chunk text — different queries can select different chunks
  // from the same file, and the reranker score depends on which chunk was sent.
  for (const doc of documents) {
    const cacheKey = getCacheKey("rerank", { query, file: doc.file, model, chunk: doc.text });
    const cached = getCachedResult(db, cacheKey);
    if (cached !== null) {
      cachedResults.set(doc.file, parseFloat(cached));
    } else {
      uncachedDocs.push({ file: doc.file, text: doc.text });
    }
  }

  // Rerank uncached documents using LlamaCpp
  if (uncachedDocs.length > 0) {
    const llm = getDefaultLlamaCpp();
    const rerankResult = await llm.rerank(query, uncachedDocs, { model });

    // Cache results — use original doc.text for cache key (result.file lacks chunk text)
    const textByFile = new Map(documents.map(d => [d.file, d.text]));
    for (const result of rerankResult.results) {
      const cacheKey = getCacheKey("rerank", { query, file: result.file, model, chunk: textByFile.get(result.file) || "" });
      setCachedResult(db, cacheKey, result.score.toString());
      cachedResults.set(result.file, result.score);
    }
  }

  // Return all results sorted by score
  return documents
    .map(doc => ({ file: doc.file, score: cachedResults.get(doc.file) || 0 }))
    .sort((a, b) => b.score - a.score);
}
// =============================================================================
// Reciprocal Rank Fusion
// =============================================================================

export function reciprocalRankFusion(
  resultLists: RankedResult[][],
  weights: number[] = [],
  k: number = 60
): RankedResult[] {
  const scores = new Map<string, { result: RankedResult; rrfScore: number; topRank: number }>();

  for (let listIdx = 0; listIdx < resultLists.length; listIdx++) {
    const list = resultLists[listIdx];
    if (!list) continue;
    const weight = weights[listIdx] ?? 1.0;

    for (let rank = 0; rank < list.length; rank++) {
      const result = list[rank];
      if (!result) continue;
      const rrfContribution = weight / (k + rank + 1);
      const existing = scores.get(result.file);

      if (existing) {
        existing.rrfScore += rrfContribution;
        existing.topRank = Math.min(existing.topRank, rank);
      } else {
        scores.set(result.file, {
          result,
          rrfScore: rrfContribution,
          topRank: rank,
        });
      }
    }
  }

  // Top-rank bonus
  for (const entry of scores.values()) {
    if (entry.topRank === 0) {
      entry.rrfScore += 0.05;
    } else if (entry.topRank <= 2) {
      entry.rrfScore += 0.02;
    }
  }

  return Array.from(scores.values())
    .sort((a, b) => b.rrfScore - a.rrfScore)
    .map(e => ({ ...e.result, score: e.rrfScore }));
}
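
// Worked example (illustrative) with k = 60 and equal weights: a file ranked
// 1st by FTS (rank 0) and 3rd by vector search (rank 2) scores
//
//   1/(60+0+1) + 1/(60+2+1) = 1/61 + 1/63 ≈ 0.0323, plus the 0.05 top-rank
//   bonus ≈ 0.0823
//
// which beats a file ranked 2nd in both lists:
//
//   2 * 1/(60+1+1) = 2/62 ≈ 0.0323, plus the 0.02 top-3 bonus ≈ 0.0523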
// =============================================================================
// Document retrieval
// =============================================================================

type DbDocRow = {
  virtual_path: string;
  display_path: string;
  title: string;
  hash: string;
  collection: string;
  path: string;
  modified_at: string;
  body_length: number;
  body?: string;
};

/**
 * Find a document by filename/path, docid (#hash), or with fuzzy matching.
 * Returns document metadata without body by default.
 *
 * Supports:
 * - Virtual paths: qmd://collection/path/to/file.md
 * - Absolute paths: /path/to/file.md
 * - Relative paths: path/to/file.md
 * - Short docid: #abc123 (first 6 chars of hash)
 */
export function findDocument(db: Database, filename: string, options: { includeBody?: boolean } = {}): DocumentResult | DocumentNotFound {
  let filepath = filename;
  // Strip a trailing :line suffix (e.g. "notes.md:42") before resolving
  const colonMatch = filepath.match(/:(\d+)$/);
  if (colonMatch) {
    filepath = filepath.slice(0, -colonMatch[0].length);
  }

  // Check if this is a docid lookup (#abc123, abc123, "#abc123", "abc123", etc.)
  if (isDocid(filepath)) {
    const docidMatch = findDocumentByDocid(db, filepath);
    if (docidMatch) {
      filepath = docidMatch.filepath;
    } else {
      return { error: "not_found", query: filename, similarFiles: [] };
    }
  }

  if (filepath.startsWith('~/')) {
    filepath = homedir() + filepath.slice(1);
  }

  const bodyCol = options.includeBody ? `, content.doc as body` : ``;

  // Build computed columns.
  // Note: absoluteFilepath is computed from YAML collections after query.
  const selectCols = `
    'qmd://' || d.collection || '/' || d.path as virtual_path,
    d.collection || '/' || d.path as display_path,
    d.title,
    d.hash,
    d.collection,
    d.modified_at,
    LENGTH(content.doc) as body_length
    ${bodyCol}
  `;

  // Try to match by virtual path first
  let doc = db.prepare(`
    SELECT ${selectCols}
    FROM documents d
    JOIN content ON content.hash = d.hash
    WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
  `).get(filepath) as DbDocRow | null;

  // Try fuzzy match by virtual path
  if (!doc) {
    doc = db.prepare(`
      SELECT ${selectCols}
      FROM documents d
      JOIN content ON content.hash = d.hash
      WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? AND d.active = 1
      LIMIT 1
    `).get(`%${filepath}`) as DbDocRow | null;
  }

  // Try to match by absolute path (requires looking up collection paths from YAML)
  if (!doc && !filepath.startsWith('qmd://')) {
    const collections = collectionsListCollections();
    for (const coll of collections) {
      let relativePath: string | null = null;

      // If filepath is absolute and starts with collection path, extract relative part
      if (filepath.startsWith(coll.path + '/')) {
        relativePath = filepath.slice(coll.path.length + 1);
      }
      // Otherwise treat filepath as relative to collection
      else if (!filepath.startsWith('/')) {
        relativePath = filepath;
      }

      if (relativePath) {
        doc = db.prepare(`
          SELECT ${selectCols}
          FROM documents d
          JOIN content ON content.hash = d.hash
          WHERE d.collection = ? AND d.path = ? AND d.active = 1
        `).get(coll.name, relativePath) as DbDocRow | null;
        if (doc) break;
      }
    }
  }

  if (!doc) {
    const similar = findSimilarFiles(db, filepath, 5, 5);
    return { error: "not_found", query: filename, similarFiles: similar };
  }

  // Get context using virtual path
  const virtualPath = doc.virtual_path || `qmd://${doc.collection}/${doc.display_path}`;
  const context = getContextForFile(db, virtualPath);

  return {
    filepath: virtualPath,
    displayPath: doc.display_path,
    title: doc.title,
    context,
    hash: doc.hash,
    docid: getDocid(doc.hash),
    collectionName: doc.collection,
    modifiedAt: doc.modified_at,
    bodyLength: doc.body_length,
    ...(options.includeBody && doc.body !== undefined && { body: doc.body }),
  };
}
/**
 * Get the body content for a document.
 * Optionally slice by line range.
 */
export function getDocumentBody(db: Database, doc: DocumentResult | { filepath: string }, fromLine?: number, maxLines?: number): string | null {
  const filepath = doc.filepath;

  // Try to resolve document by filepath (absolute or virtual)
  let row: { body: string } | null = null;

  // Try virtual path first
  if (filepath.startsWith('qmd://')) {
    row = db.prepare(`
      SELECT content.doc as body
      FROM documents d
      JOIN content ON content.hash = d.hash
      WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
    `).get(filepath) as { body: string } | null;
  }

  // Try absolute path by looking up in YAML collections
  if (!row) {
    const collections = collectionsListCollections();
    for (const coll of collections) {
      if (filepath.startsWith(coll.path + '/')) {
        const relativePath = filepath.slice(coll.path.length + 1);
        row = db.prepare(`
          SELECT content.doc as body
          FROM documents d
          JOIN content ON content.hash = d.hash
          WHERE d.collection = ? AND d.path = ? AND d.active = 1
        `).get(coll.name, relativePath) as { body: string } | null;
        if (row) break;
      }
    }
  }

  if (!row) return null;

  let body = row.body;
  if (fromLine !== undefined || maxLines !== undefined) {
    const lines = body.split('\n');
    const start = (fromLine || 1) - 1;
    const end = maxLines !== undefined ? start + maxLines : lines.length;
    body = lines.slice(start, end).join('\n');
  }

  return body;
}
/**
 * Find multiple documents by glob pattern or comma-separated list.
 * Returns documents without body by default (use getDocumentBody to load).
 */
export function findDocuments(
  db: Database,
  pattern: string,
  options: { includeBody?: boolean; maxBytes?: number } = {}
): { docs: MultiGetResult[]; errors: string[] } {
  const isCommaSeparated = pattern.includes(',') && !pattern.includes('*') && !pattern.includes('?');
  const errors: string[] = [];
  const maxBytes = options.maxBytes ?? DEFAULT_MULTI_GET_MAX_BYTES;

  const bodyCol = options.includeBody ? `, content.doc as body` : ``;
  const selectCols = `
    'qmd://' || d.collection || '/' || d.path as virtual_path,
    d.collection || '/' || d.path as display_path,
    d.title,
    d.hash,
    d.collection,
    d.modified_at,
    LENGTH(content.doc) as body_length
    ${bodyCol}
  `;

  let fileRows: DbDocRow[];

  if (isCommaSeparated) {
    const names = pattern.split(',').map(s => s.trim()).filter(Boolean);
    fileRows = [];
    for (const name of names) {
      let doc = db.prepare(`
        SELECT ${selectCols}
        FROM documents d
        JOIN content ON content.hash = d.hash
        WHERE 'qmd://' || d.collection || '/' || d.path = ? AND d.active = 1
      `).get(name) as DbDocRow | null;
      if (!doc) {
        doc = db.prepare(`
          SELECT ${selectCols}
          FROM documents d
          JOIN content ON content.hash = d.hash
          WHERE 'qmd://' || d.collection || '/' || d.path LIKE ? AND d.active = 1
          LIMIT 1
        `).get(`%${name}`) as DbDocRow | null;
      }
      if (doc) {
        fileRows.push(doc);
      } else {
        const similar = findSimilarFiles(db, name, 5, 3);
        let msg = `File not found: ${name}`;
        if (similar.length > 0) {
          msg += ` (did you mean: ${similar.join(', ')}?)`;
        }
        errors.push(msg);
      }
    }
  } else {
    // Glob pattern match
    const matched = matchFilesByGlob(db, pattern);
    if (matched.length === 0) {
      errors.push(`No files matched pattern: ${pattern}`);
      return { docs: [], errors };
    }
    const virtualPaths = matched.map(m => m.filepath);
    const placeholders = virtualPaths.map(() => '?').join(',');
    fileRows = db.prepare(`
      SELECT ${selectCols}
      FROM documents d
      JOIN content ON content.hash = d.hash
      WHERE 'qmd://' || d.collection || '/' || d.path IN (${placeholders}) AND d.active = 1
    `).all(...virtualPaths) as DbDocRow[];
  }

  const results: MultiGetResult[] = [];

  for (const row of fileRows) {
    // Get context using virtual path
    const virtualPath = row.virtual_path || `qmd://${row.collection}/${row.display_path}`;
    const context = getContextForFile(db, virtualPath);

    if (row.body_length > maxBytes) {
      results.push({
        doc: { filepath: virtualPath, displayPath: row.display_path },
        skipped: true,
        skipReason: `File too large (${Math.round(row.body_length / 1024)}KB > ${Math.round(maxBytes / 1024)}KB)`,
      });
      continue;
    }

    results.push({
      doc: {
        filepath: virtualPath,
        displayPath: row.display_path,
        title: row.title || row.display_path.split('/').pop() || row.display_path,
        context,
        hash: row.hash,
        docid: getDocid(row.hash),
        collectionName: row.collection,
        modifiedAt: row.modified_at,
        bodyLength: row.body_length,
|
|
2632
|
+
...(options.includeBody && row.body !== undefined && { body: row.body }),
|
|
2633
|
+
},
|
|
2634
|
+
skipped: false,
|
|
2635
|
+
});
|
|
2636
|
+
}
|
|
2637
|
+
|
|
2638
|
+
return { docs: results, errors };
|
|
2639
|
+
}
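
// Usage sketch (hypothetical paths): comma-separated names resolve by exact
// virtual path or suffix; anything containing * or ? is treated as a glob.
//
//   const byName = findDocuments(db, "qmd://notes/a.md, b.md");
//   const byGlob = findDocuments(db, "qmd://notes/**/*.md", { includeBody: true });
//   for (const err of byGlob.errors) console.error(err);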

// =============================================================================
// Status
// =============================================================================

export function getStatus(db: Database): IndexStatus {
  // Load collections from YAML
  const yamlCollections = collectionsListCollections();

  // Get document counts and last update times for each collection
  const collections = yamlCollections.map(col => {
    const stats = db.prepare(`
      SELECT
        COUNT(*) as active_count,
        MAX(modified_at) as last_doc_update
      FROM documents
      WHERE collection = ? AND active = 1
    `).get(col.name) as { active_count: number; last_doc_update: string | null };

    return {
      name: col.name,
      path: col.path,
      pattern: col.pattern,
      documents: stats.active_count,
      lastUpdated: stats.last_doc_update || new Date().toISOString(),
    };
  });

  // Sort by last update time (most recent first)
  collections.sort((a, b) => {
    if (!a.lastUpdated) return 1;
    if (!b.lastUpdated) return -1;
    return new Date(b.lastUpdated).getTime() - new Date(a.lastUpdated).getTime();
  });

  const totalDocs = (db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get() as { c: number }).c;
  const needsEmbedding = getHashesNeedingEmbedding(db);
  const hasVectors = !!db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();

  return {
    totalDocuments: totalDocs,
    needsEmbedding,
    hasVectorIndex: hasVectors,
    collections,
  };
}
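
// Usage sketch: a minimal index health report, using only fields returned above.
//
//   const status = getStatus(db);
//   for (const c of status.collections) {
//     console.error(`${c.name}: ${c.documents} docs (updated ${c.lastUpdated})`);
//   }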

// =============================================================================
// Snippet extraction
// =============================================================================

export type SnippetResult = {
  line: number;         // 1-indexed line number of best match
  snippet: string;      // The snippet text with diff-style header
  linesBefore: number;  // Lines in document before snippet
  linesAfter: number;   // Lines in document after snippet
  snippetLines: number; // Number of lines in snippet
};

/**
 * Extract a short, query-relevant snippet from a document body.
 * Scores each line by how many query terms it contains and returns a small
 * window around the best line, prefixed with a diff-style location header.
 * When chunkPos is given, the search focuses on that chunk's region first.
 */
export function extractSnippet(body: string, query: string, maxLen = 500, chunkPos?: number, chunkLen?: number): SnippetResult {
  const totalLines = body.split('\n').length;
  let searchBody = body;
  let lineOffset = 0;

  if (chunkPos && chunkPos > 0) {
    // Search within the chunk region, with some padding for context.
    // Use provided chunkLen or fall back to max chunk size (covers variable-length chunks).
    const searchLen = chunkLen || CHUNK_SIZE_CHARS;
    const contextStart = Math.max(0, chunkPos - 100);
    const contextEnd = Math.min(body.length, chunkPos + searchLen + 100);
    searchBody = body.slice(contextStart, contextEnd);
    if (contextStart > 0) {
      lineOffset = body.slice(0, contextStart).split('\n').length - 1;
    }
  }

  const lines = searchBody.split('\n');
  const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 0);
  let bestLine = 0, bestScore = -1;

  for (let i = 0; i < lines.length; i++) {
    const lineLower = (lines[i] ?? "").toLowerCase();
    let score = 0;
    for (const term of queryTerms) {
      if (lineLower.includes(term)) score++;
    }
    if (score > bestScore) {
      bestScore = score;
      bestLine = i;
    }
  }

  const start = Math.max(0, bestLine - 1);
  const end = Math.min(lines.length, bestLine + 3);
  const snippetLines = lines.slice(start, end);
  let snippetText = snippetLines.join('\n');

  // If we focused on a chunk window and it produced an empty/whitespace-only snippet,
  // fall back to a full-document snippet so we always show something useful.
  if (chunkPos && chunkPos > 0 && snippetText.trim().length === 0) {
    return extractSnippet(body, query, maxLen, undefined);
  }

  if (snippetText.length > maxLen) snippetText = snippetText.substring(0, maxLen - 3) + "...";

  const absoluteStart = lineOffset + start + 1; // 1-indexed
  const snippetLineCount = snippetLines.length;
  const linesBefore = absoluteStart - 1;
  const linesAfter = totalLines - (absoluteStart + snippetLineCount - 1);

  // Format with diff-style header: @@ -start,count @@ (linesBefore before, linesAfter after)
  const header = `@@ -${absoluteStart},${snippetLineCount} @@ (${linesBefore} before, ${linesAfter} after)`;
  const snippet = `${header}\n${snippetText}`;

  return {
    line: lineOffset + bestLine + 1,
    snippet,
    linesBefore,
    linesAfter,
    snippetLines: snippetLineCount,
  };
}
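
// Worked example: in a 120-line document whose best-scoring line is 42, the
// window spans lines 41-44 (one line before the match, two after), so the
// returned snippet is prefixed with:
//
//   @@ -41,4 @@ (40 before, 76 after)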

// =============================================================================
// Shared helpers (used by both CLI and MCP)
// =============================================================================

/**
 * Add line numbers to text content.
 * Each line becomes: "{lineNum}: {content}"
 */
export function addLineNumbers(text: string, startLine: number = 1): string {
  const lines = text.split('\n');
  return lines.map((line, i) => `${startLine + i}: ${line}`).join('\n');
}
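
// Example: addLineNumbers("foo\nbar", 10) returns "10: foo\n11: bar".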

// =============================================================================
// Shared search orchestration
//
// hybridQuery() and vectorSearchQuery() are standalone functions (not Store
// methods) because they are orchestration over primitives — same rationale as
// reciprocalRankFusion(). They take a Store as first argument so both CLI
// and MCP can share the identical pipeline.
// =============================================================================

/**
 * Optional progress hooks for search orchestration.
 * CLI wires these to stderr for user feedback; MCP leaves them unset.
 */
export interface SearchHooks {
  /** BM25 probe found strong signal — expansion will be skipped */
  onStrongSignal?: (topScore: number) => void;
  /** Query expansion complete. Empty array = strong signal skip (no expansion). */
  onExpand?: (original: string, expanded: ExpandedQuery[]) => void;
  /** Reranking is about to start */
  onRerankStart?: (chunkCount: number) => void;
  /** Reranking finished */
  onRerankDone?: () => void;
}
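
// Sketch of how a CLI caller might wire these hooks to stderr (illustrative only):
//
//   const hooks: SearchHooks = {
//     onStrongSignal: (top) => console.error(`strong BM25 signal (${top.toFixed(2)}), skipping expansion`),
//     onExpand: (orig, exp) => console.error(`expanded "${orig}" into ${exp.length} variants`),
//     onRerankStart: (n) => console.error(`reranking ${n} chunks...`),
//     onRerankDone: () => console.error(`rerank done`),
//   };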

export interface HybridQueryOptions {
  collection?: string;
  limit?: number;          // default 10
  minScore?: number;       // default 0
  candidateLimit?: number; // default RERANK_CANDIDATE_LIMIT
  hooks?: SearchHooks;
}

export interface HybridQueryResult {
  file: string;           // internal filepath (qmd://collection/path)
  displayPath: string;
  title: string;
  body: string;           // full document body (for snippet extraction)
  bestChunk: string;      // best chunk text
  bestChunkPos: number;   // char offset of best chunk in body
  score: number;          // blended score (full precision)
  context: string | null; // user-set context
  docid: string;          // content hash prefix (6 chars)
}

/**
 * Hybrid search: BM25 + vector + query expansion + RRF + chunked reranking.
 *
 * Pipeline:
 * 1. BM25 probe → skip expansion if strong signal
 * 2. expandQuery() → typed query variants (lex/vec/hyde)
 * 3. Type-routed search: original→vector, lex→FTS, vec/hyde→vector
 * 4. RRF fusion → slice to candidateLimit
 * 5. chunkDocument() + keyword-best-chunk selection
 * 6. rerank on chunks (NOT full bodies — O(tokens) trap)
 * 7. Position-aware score blending (RRF rank × reranker score)
 * 8. Dedup by file, filter by minScore, slice to limit
 */
export async function hybridQuery(
  store: Store,
  query: string,
  options?: HybridQueryOptions
): Promise<HybridQueryResult[]> {
  const limit = options?.limit ?? 10;
  const minScore = options?.minScore ?? 0;
  const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
  const collection = options?.collection;
  const hooks = options?.hooks;

  const rankedLists: RankedResult[][] = [];
  const docidMap = new Map<string, string>(); // filepath -> docid
  const hasVectors = !!store.db.prepare(
    `SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`
  ).get();

  // Step 1: BM25 probe — strong signal skips expensive LLM expansion
  const initialFts = store.searchFTS(query, 20)
    .filter(r => !collection || r.collectionName === collection);
  const topScore = initialFts[0]?.score ?? 0;
  const secondScore = initialFts[1]?.score ?? 0;
  const hasStrongSignal = initialFts.length > 0
    && topScore >= STRONG_SIGNAL_MIN_SCORE
    && (topScore - secondScore) >= STRONG_SIGNAL_MIN_GAP;

  if (hasStrongSignal) hooks?.onStrongSignal?.(topScore);

  // Step 2: Expand query (or skip if strong signal)
  const expanded = hasStrongSignal
    ? []
    : await store.expandQuery(query);

  hooks?.onExpand?.(query, expanded);

  // Seed with initial FTS results (avoid re-running original query FTS)
  if (initialFts.length > 0) {
    for (const r of initialFts) docidMap.set(r.filepath, r.docid);
    rankedLists.push(initialFts.map(r => ({
      file: r.filepath, displayPath: r.displayPath,
      title: r.title, body: r.body || "", score: r.score,
    })));
  }

  // Step 3: Route searches by query type
  // Original query → vector search (FTS already covered by probe in step 1).
  // Vector searches run sequentially — node-llama-cpp's embed context
  // hangs on concurrent embed() calls (known limitation).
  if (hasVectors) {
    const vecResults = await store.searchVec(query, DEFAULT_EMBED_MODEL, 20, collection);
    if (vecResults.length > 0) {
      for (const r of vecResults) docidMap.set(r.filepath, r.docid);
      rankedLists.push(vecResults.map(r => ({
        file: r.filepath, displayPath: r.displayPath,
        title: r.title, body: r.body || "", score: r.score,
      })));
    }
  }

  // Expanded queries → route by type: lex→FTS only, vec/hyde→vector only.
  // This restores the CLI's query-type-aware routing that was lost in the initial refactor.
  for (const q of expanded) {
    if (q.type === 'lex') {
      const ftsResults = store.searchFTS(q.text, 20)
        .filter(r => !collection || r.collectionName === collection);
      if (ftsResults.length > 0) {
        for (const r of ftsResults) docidMap.set(r.filepath, r.docid);
        rankedLists.push(ftsResults.map(r => ({
          file: r.filepath, displayPath: r.displayPath,
          title: r.title, body: r.body || "", score: r.score,
        })));
      }
    } else {
      // vec or hyde → vector search only
      if (hasVectors) {
        const vecResults = await store.searchVec(q.text, DEFAULT_EMBED_MODEL, 20, collection);
        if (vecResults.length > 0) {
          for (const r of vecResults) docidMap.set(r.filepath, r.docid);
          rankedLists.push(vecResults.map(r => ({
            file: r.filepath, displayPath: r.displayPath,
            title: r.title, body: r.body || "", score: r.score,
          })));
        }
      }
    }
  }

  // Step 4: RRF fusion — first 2 lists (original FTS + first vec) get 2x weight
  const weights = rankedLists.map((_, i) => i < 2 ? 2.0 : 1.0);
  const fused = reciprocalRankFusion(rankedLists, weights);
  const candidates = fused.slice(0, candidateLimit);
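
  // For reference: reciprocal rank fusion scores each document as
  //   score(d) = Σ_i w_i / (k + rank_i(d))
  // summed over the lists it appears in, where k is reciprocalRankFusion's
  // smoothing constant (conventionally 60; see that function for the value
  // actually used). The 2x weights above double the two primary lists' votes.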

  if (candidates.length === 0) return [];

  // Step 5: Chunk documents, pick best chunk per doc for reranking.
  // Reranking full bodies is O(tokens) — the critical perf lesson that motivated this refactor.
  const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
  const chunksToRerank: { file: string; text: string }[] = [];
  const docChunkMap = new Map<string, { chunks: { text: string; pos: number }[]; bestIdx: number }>();

  for (const cand of candidates) {
    const chunks = chunkDocument(cand.body);
    if (chunks.length === 0) continue;

    // Pick chunk with most keyword overlap (fallback: first chunk)
    let bestIdx = 0;
    let bestScore = -1;
    for (let i = 0; i < chunks.length; i++) {
      const chunkLower = chunks[i]!.text.toLowerCase();
      const score = queryTerms.reduce((acc, term) => acc + (chunkLower.includes(term) ? 1 : 0), 0);
      if (score > bestScore) { bestScore = score; bestIdx = i; }
    }

    chunksToRerank.push({ file: cand.file, text: chunks[bestIdx]!.text });
    docChunkMap.set(cand.file, { chunks, bestIdx });
  }

  // Step 6: Rerank chunks (NOT full bodies)
  hooks?.onRerankStart?.(chunksToRerank.length);
  const reranked = await store.rerank(query, chunksToRerank);
  hooks?.onRerankDone?.();

  // Step 7: Blend RRF position score with reranker score.
  // Position-aware weights: top retrieval results get more protection from reranker disagreement.
  const candidateMap = new Map(candidates.map(c => [c.file, {
    displayPath: c.displayPath, title: c.title, body: c.body,
  }]));
  const rrfRankMap = new Map(candidates.map((c, i) => [c.file, i + 1]));

  const blended = reranked.map(r => {
    const rrfRank = rrfRankMap.get(r.file) || candidateLimit;
    let rrfWeight: number;
    if (rrfRank <= 3) rrfWeight = 0.75;
    else if (rrfRank <= 10) rrfWeight = 0.60;
    else rrfWeight = 0.40;
    const rrfScore = 1 / rrfRank;
    const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * r.score;
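    // Worked example: a doc at RRF rank 2 with reranker score 0.8 blends to
    //   0.75 * (1/2) + 0.25 * 0.8 = 0.575
    // while a doc at rank 20 with the same reranker score gets
    //   0.40 * (1/20) + 0.60 * 0.8 = 0.50
    // so strong retrieval positions survive moderate reranker disagreement.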

    const candidate = candidateMap.get(r.file);
    const chunkInfo = docChunkMap.get(r.file);
    const bestIdx = chunkInfo?.bestIdx ?? 0;
    const bestChunk = chunkInfo?.chunks[bestIdx]?.text || candidate?.body || "";
    const bestChunkPos = chunkInfo?.chunks[bestIdx]?.pos || 0;

    return {
      file: r.file,
      displayPath: candidate?.displayPath || "",
      title: candidate?.title || "",
      body: candidate?.body || "",
      bestChunk,
      bestChunkPos,
      score: blendedScore,
      context: store.getContextForFile(r.file),
      docid: docidMap.get(r.file) || "",
    };
  }).sort((a, b) => b.score - a.score);

  // Step 8: Dedup by file (safety net — prevents duplicate output)
  const seenFiles = new Set<string>();
  return blended
    .filter(r => {
      if (seenFiles.has(r.file)) return false;
      seenFiles.add(r.file);
      return true;
    })
    .filter(r => r.score >= minScore)
    .slice(0, limit);
}
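
// Usage sketch (hypothetical store and query; hook wiring is optional):
//
//   const results = await hybridQuery(store, "sqlite vector search", {
//     collection: "notes",
//     limit: 5,
//     hooks: { onRerankStart: (n) => console.error(`reranking ${n} chunks`) },
//   });
//   for (const r of results) {
//     console.log(`${r.score.toFixed(3)} ${r.displayPath} [${r.docid}]`);
//   }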

export interface VectorSearchOptions {
  collection?: string;
  limit?: number;    // default 10
  minScore?: number; // default 0.3
  hooks?: Pick<SearchHooks, 'onExpand'>;
}

export interface VectorSearchResult {
  file: string;
  displayPath: string;
  title: string;
  body: string;
  score: number;
  context: string | null;
  docid: string;
}

/**
 * Vector-only semantic search with query expansion.
 *
 * Pipeline:
 * 1. expandQuery() → typed variants, filter to vec/hyde only (lex irrelevant here)
 * 2. searchVec() for original + vec/hyde variants (sequential — node-llama-cpp embed limitation)
 * 3. Dedup by filepath (keep max score)
 * 4. Sort by score descending, filter by minScore, slice to limit
 */
export async function vectorSearchQuery(
  store: Store,
  query: string,
  options?: VectorSearchOptions
): Promise<VectorSearchResult[]> {
  const limit = options?.limit ?? 10;
  const minScore = options?.minScore ?? 0.3;
  const collection = options?.collection;

  const hasVectors = !!store.db.prepare(
    `SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`
  ).get();
  if (!hasVectors) return [];

  // Expand query — filter to vec/hyde only (lex queries target FTS, not vector)
  const allExpanded = await store.expandQuery(query);
  const vecExpanded = allExpanded.filter(q => q.type !== 'lex');
  options?.hooks?.onExpand?.(query, vecExpanded);

  // Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs
  const queryTexts = [query, ...vecExpanded.map(q => q.text)];
  const allResults = new Map<string, VectorSearchResult>();
  for (const q of queryTexts) {
    const vecResults = await store.searchVec(q, DEFAULT_EMBED_MODEL, limit, collection);
    for (const r of vecResults) {
      const existing = allResults.get(r.filepath);
      if (!existing || r.score > existing.score) {
        allResults.set(r.filepath, {
          file: r.filepath,
          displayPath: r.displayPath,
          title: r.title,
          body: r.body || "",
          score: r.score,
          context: store.getContextForFile(r.filepath),
          docid: r.docid,
        });
      }
    }
  }

  return Array.from(allResults.values())
    .sort((a, b) => b.score - a.score)
    .filter(r => r.score >= minScore)
    .slice(0, limit);
}
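
// Usage sketch (hypothetical query): vector-only search is useful when you want
// purely semantic matches without BM25 or reranking overhead.
//
//   const hits = await vectorSearchQuery(store, "how do embeddings get cached?", {
//     limit: 5,
//     minScore: 0.4,
//   });
//   for (const h of hits) console.log(`${h.score.toFixed(3)} ${h.displayPath}`);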