ctxpkg 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +661 -0
- package/README.md +282 -0
- package/bin/cli.js +8 -0
- package/bin/daemon.js +7 -0
- package/package.json +70 -0
- package/src/agent/AGENTS.md +249 -0
- package/src/agent/agent.prompts.ts +66 -0
- package/src/agent/agent.test-runner.schemas.ts +158 -0
- package/src/agent/agent.test-runner.ts +436 -0
- package/src/agent/agent.ts +371 -0
- package/src/agent/agent.types.ts +94 -0
- package/src/backend/AGENTS.md +112 -0
- package/src/backend/backend.protocol.ts +95 -0
- package/src/backend/backend.schemas.ts +123 -0
- package/src/backend/backend.services.ts +151 -0
- package/src/backend/backend.ts +111 -0
- package/src/backend/backend.types.ts +34 -0
- package/src/cli/AGENTS.md +213 -0
- package/src/cli/cli.agent.ts +197 -0
- package/src/cli/cli.chat.ts +369 -0
- package/src/cli/cli.client.ts +55 -0
- package/src/cli/cli.collections.ts +491 -0
- package/src/cli/cli.config.ts +252 -0
- package/src/cli/cli.daemon.ts +160 -0
- package/src/cli/cli.documents.ts +413 -0
- package/src/cli/cli.mcp.ts +177 -0
- package/src/cli/cli.ts +28 -0
- package/src/cli/cli.utils.ts +122 -0
- package/src/client/AGENTS.md +135 -0
- package/src/client/client.adapters.ts +279 -0
- package/src/client/client.ts +86 -0
- package/src/client/client.types.ts +17 -0
- package/src/collections/AGENTS.md +185 -0
- package/src/collections/collections.schemas.ts +195 -0
- package/src/collections/collections.ts +1160 -0
- package/src/config/config.ts +118 -0
- package/src/daemon/AGENTS.md +168 -0
- package/src/daemon/daemon.config.ts +23 -0
- package/src/daemon/daemon.manager.ts +215 -0
- package/src/daemon/daemon.schemas.ts +22 -0
- package/src/daemon/daemon.ts +205 -0
- package/src/database/AGENTS.md +211 -0
- package/src/database/database.ts +64 -0
- package/src/database/migrations/migrations.001-init.ts +56 -0
- package/src/database/migrations/migrations.002-fts5.ts +32 -0
- package/src/database/migrations/migrations.ts +20 -0
- package/src/database/migrations/migrations.types.ts +9 -0
- package/src/documents/AGENTS.md +301 -0
- package/src/documents/documents.schemas.ts +190 -0
- package/src/documents/documents.ts +734 -0
- package/src/embedder/embedder.ts +53 -0
- package/src/exports.ts +0 -0
- package/src/mcp/AGENTS.md +264 -0
- package/src/mcp/mcp.ts +105 -0
- package/src/tools/AGENTS.md +228 -0
- package/src/tools/agent/agent.ts +45 -0
- package/src/tools/documents/documents.ts +401 -0
- package/src/tools/tools.langchain.ts +37 -0
- package/src/tools/tools.mcp.ts +46 -0
- package/src/tools/tools.types.ts +35 -0
- package/src/utils/utils.services.ts +46 -0

package/src/documents/documents.ts
@@ -0,0 +1,734 @@
import { createHash, randomUUID } from 'node:crypto';
import { glob, readFile } from 'node:fs/promises';
import { resolve } from 'node:path';

import { TokenTextSplitter } from '@langchain/textsplitters';
import { type FeatureExtractionPipeline, pipeline, cos_sim } from '@huggingface/transformers';

import {
  searchChunkItemSchema,
  type ReferenceDocument,
  type SearchChunksOptions,
  type SearchChunkItem,
  type ListDocumentsParams,
  type ListDocumentsResult,
  type GetOutlineParams,
  type OutlineResult,
  type OutlineItem,
  type GetSectionParams,
  type SectionResult,
  type FindRelatedParams,
  type SearchBatchParams,
  type SearchBatchResult,
} from './documents.schemas.ts';

import type { Services } from '#root/utils/utils.services.ts';
import { DatabaseService, tableNames } from '#root/database/database.ts';
import { EmbedderService } from '#root/embedder/embedder.ts';

// Chunking configuration
const CHUNK_SIZE = 400;
const CHUNK_OVERLAP = 80;

// Search configuration
const RRF_K = 60; // Reciprocal Rank Fusion constant
const RERANK_CANDIDATES_MULTIPLIER = 3; // Fetch 3x candidates for re-ranking
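
// Illustration (editorial, not in the package source): with CHUNK_SIZE = 400
// and CHUNK_OVERLAP = 80, consecutive chunks advance by 320 tokens, so a
// 1,000-token document splits roughly into token ranges [0, 400), [320, 720),
// [640, 1000) — each chunk sharing 80 tokens with its neighbor so context
// survives chunk boundaries.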

class DocumentsService {
  #services: Services;
  #reranker?: Promise<FeatureExtractionPipeline>;

  constructor(services: Services) {
    this.#services = services;
  }

  /**
   * Lazily initialize the re-ranker model.
   * Uses a smaller, faster model for re-ranking candidates.
   */
  #getReranker = async (): Promise<FeatureExtractionPipeline> => {
    if (!this.#reranker) {
      // Use a smaller model for fast re-ranking
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const loadPipeline = pipeline as any;
      this.#reranker = loadPipeline(
        'feature-extraction',
        'Xenova/all-MiniLM-L6-v2',
      ) as Promise<FeatureExtractionPipeline>;
    }
    return this.#reranker;
  };

  /**
   * Extract document title from markdown content.
   */
  #extractTitle = (content: string, fallback: string): string => {
    const titleMatch = content.match(/^#\s+(.+)$/m);
    return titleMatch?.[1]?.trim() || fallback;
  };

  /**
   * Extract the nearest preceding heading for a chunk position.
   */
  #extractSectionHeading = (content: string, chunkStart: number): string | null => {
    const beforeChunk = content.slice(0, chunkStart);
    const headings = beforeChunk.match(/^#{1,6}\s+.+$/gm);
    return headings?.[headings.length - 1]?.replace(/^#+\s+/, '') || null;
  };

  public listCollections = async () => {
    const databaseService = this.#services.get(DatabaseService);
    const database = await databaseService.getInstance();

    // Get document counts per collection
    const docCounts = await database(tableNames.referenceDocuments)
      .select('collection', database.raw('COUNT(*) as document_count'))
      .groupBy('collection')
      .orderBy('collection', 'asc');

    // Get collection metadata (description, version) from collections table
    const collectionIds = docCounts.map((c) => c.collection);
    const collectionMeta = await database(tableNames.collections)
      .select('id', 'description', 'version')
      .whereIn('id', collectionIds);

    // Build a map of collection ID -> metadata
    const metaMap = new Map<string, { description: string | null; version: string | null }>();
    for (const meta of collectionMeta) {
      metaMap.set(meta.id, { description: meta.description, version: meta.version });
    }

    // Merge counts with metadata
    return docCounts.map((c) => {
      const meta = metaMap.get(c.collection);
      return {
        collection: c.collection,
        document_count: c.document_count,
        description: meta?.description ?? null,
        version: meta?.version ?? null,
      };
    });
  };

  public dropCollection = async (collection: string) => {
    const databaseService = this.#services.get(DatabaseService);
    const database = await databaseService.getInstance();

    await database.transaction(async (trx) => {
      await trx(tableNames.referenceDocumentChunks).delete().where({ collection });
      await trx(tableNames.referenceDocumentChunksFts).delete().where({ collection });
      await trx(tableNames.referenceDocuments).delete().where({ collection });
    });
  };

  public updateCollectionFromGlob = async (options: { pattern: string; cwd: string; collection?: string }) => {
    const { pattern, collection, cwd } = options;
    for await (const file of glob(pattern, { cwd })) {
      const fullPath = resolve(cwd, file);
      const content = await readFile(fullPath, 'utf8');
      await this.updateDocument({
        collection: collection || cwd,
        id: file,
        content,
      });
    }
  };
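
  // Note (editorial, not in the package source): glob() from
  // 'node:fs/promises' is a recent Node.js addition (introduced in v22 and
  // still experimental at the time of writing), so the method above assumes
  // a runtime that ships it.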

  public updateDocument = async (document: ReferenceDocument) => {
    const databaseService = this.#services.get(DatabaseService);
    const database = await databaseService.getInstance();
    const hash = createHash('sha256').update(document.content).digest('hex');
    const [current] = await database(tableNames.referenceDocuments)
      .where({ collection: document.collection, id: document.id })
      .limit(1);

    if (current && current.hash === hash) {
      return;
    }

    await database.transaction(async (trx) => {
      // Clean up existing chunks (both vector and FTS)
      if (current) {
        await trx(tableNames.referenceDocumentChunks).delete().where({
          collection: document.collection,
          document: document.id,
        });
        await trx(tableNames.referenceDocumentChunksFts).delete().where({
          collection: document.collection,
          document: document.id,
        });
        await trx(tableNames.referenceDocuments)
          .update({
            hash,
            content: document.content,
          })
          .where({
            collection: document.collection,
            id: document.id,
          });
      } else {
        await trx(tableNames.referenceDocuments).insert({
          collection: document.collection,
          id: document.id,
          hash,
          content: document.content,
        });
      }

      // Create chunks with improved settings
      const splitter = new TokenTextSplitter({
        encodingName: 'cl100k_base',
        chunkSize: CHUNK_SIZE,
        chunkOverlap: CHUNK_OVERLAP,
      });
      const chunks = (await splitter.createDocuments([document.content])) as {
        pageContent: string;
        metadata: { loc: { lines: { from: number; to: number } } };
      }[];

      // Extract document title for context
      const title = this.#extractTitle(document.content, document.id);

      // Create contextualized chunks with document and section context
      const contextualizedChunks = chunks.map((chunk) => {
        // Find the character position approximately (using line info if available)
        const lines = document.content.split('\n');
        let charPos = 0;
        const startLine = chunk.metadata?.loc?.lines?.from ?? 0;
        for (let i = 0; i < startLine && i < lines.length; i++) {
          charPos += lines[i].length + 1;
        }

        const sectionHeading = this.#extractSectionHeading(document.content, charPos);

        // Build context prefix
        let contextPrefix = `Document: ${title}`;
        if (sectionHeading && sectionHeading !== title) {
          contextPrefix += `\nSection: ${sectionHeading}`;
        }

        return {
          // Embed with context for better semantic understanding
          textForEmbedding: `${contextPrefix}\n\n${chunk.pageContent}`,
          // Store original content for display
          originalContent: chunk.pageContent,
        };
      });
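
      // Illustration (editorial, not in the package source): a chunk taken
      // from the "Installation" section of a document titled "ctxpkg" is
      // embedded as:
      //
      //   Document: ctxpkg
      //   Section: Installation
      //
      //   <original chunk text>
      //
      // while only <original chunk text> is stored for display.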

      // Create embeddings using document embedding method (no query instruction)
      const embedder = this.#services.get(EmbedderService);
      const embeddings = await embedder.createDocumentEmbeddings(contextualizedChunks.map((c) => c.textForEmbedding));

      // Insert chunks into vector table
      const chunkRecords = embeddings.map((embedding, i) => ({
        id: randomUUID(),
        collection: document.collection,
        document: document.id,
        content: contextualizedChunks[i].originalContent,
        embedding: JSON.stringify(embedding),
      }));

      await trx(tableNames.referenceDocumentChunks).insert(chunkRecords);

      // Insert into FTS5 table for hybrid search
      await trx(tableNames.referenceDocumentChunksFts).insert(
        chunkRecords.map((record) => ({
          id: record.id,
          collection: record.collection,
          document: record.document,
          content: record.content,
        })),
      );
    });
  };

  public search = async (options: SearchChunksOptions): Promise<SearchChunkItem[]> => {
    const { query, collections, limit = 10, maxDistance, hybridSearch = true, rerank = false } = options;

    const databaseService = this.#services.get(DatabaseService);
    const database = await databaseService.getInstance();

    // Determine how many candidates to fetch
    const candidateLimit = rerank ? limit * RERANK_CANDIDATES_MULTIPLIER : limit;

    // 1. Vector similarity search using cosine distance
    const embedder = this.#services.get(EmbedderService);
    const queryEmbedding = await embedder.createQueryEmbedding(query);

    // Build vector search query
    // Note: maxDistance is filtered after the query runs (see below), since
    // SQLite doesn't support HAVING without GROUP BY
    let vectorQuery = database(tableNames.referenceDocumentChunks)
      .select('id', 'collection', 'document', 'content')
      .select(database.raw('vec_distance_cosine(?, embedding) as distance', [JSON.stringify(queryEmbedding)]));

    if (collections) {
      vectorQuery = vectorQuery.whereIn('collection', collections);
    }

    vectorQuery = vectorQuery.orderBy('distance', 'asc').limit(candidateLimit);

    let vectorResults = await vectorQuery;

    // Filter by maxDistance if specified (done in JS since SQLite doesn't support HAVING on computed columns)
    if (maxDistance !== undefined) {
      vectorResults = vectorResults.filter((row) => row.distance <= maxDistance);
    }

    // 2. Keyword search using FTS5 (if hybrid search enabled)
    let keywordResults: { id: string; collection: string; document: string; content: string; rank: number }[] = [];

    if (hybridSearch) {
      // Escape special FTS5 characters and create search query
      const ftsQuery = query
        .replace(/['"(){}[\]*:^~\\]/g, ' ')
        .split(/\s+/)
        .filter((term) => term.length > 0)
        .map((term) => `"${term}"`)
        .join(' OR ');
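
      // Illustration (editorial, not in the package source): the query
      //   how do "embeddings" work?
      // becomes the FTS5 MATCH expression
      //   "how" OR "do" OR "embeddings" OR "work?"
      // — FTS5 operator characters are stripped and each remaining term is
      // quoted, so user input cannot inject FTS5 syntax.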

      if (ftsQuery) {
        let ftsDbQuery = database(tableNames.referenceDocumentChunksFts)
          .select('id', 'collection', 'document', 'content')
          .select(database.raw('rank as rank'))
          .whereRaw(`${tableNames.referenceDocumentChunksFts} MATCH ?`, [ftsQuery]);

        if (collections) {
          ftsDbQuery = ftsDbQuery.whereIn('collection', collections);
        }

        ftsDbQuery = ftsDbQuery.orderBy('rank', 'asc').limit(candidateLimit);

        try {
          keywordResults = await ftsDbQuery;
        } catch {
          // FTS query might fail for edge cases, fall back to vector-only
          keywordResults = [];
        }
      }
    }

    // 3. Merge results using Reciprocal Rank Fusion (RRF)
    let mergedResults: SearchChunkItem[];

    if (hybridSearch && keywordResults.length > 0) {
      mergedResults = this.#reciprocalRankFusion(vectorResults, keywordResults, candidateLimit);
    } else {
      // Vector-only results
      mergedResults = vectorResults.map((row) => ({
        id: row.id,
        document: row.document,
        collection: row.collection,
        content: row.content,
        distance: row.distance,
        score: 1 / (RRF_K + 1), // Single source score
      }));
    }

    // 4. Re-rank with the secondary embedding model (if enabled)
    if (rerank && mergedResults.length > 0) {
      mergedResults = await this.#rerankResults(query, mergedResults);
    }

    // 5. Apply final limit and return
    return mergedResults.slice(0, limit).map((row) => searchChunkItemSchema.parse(row));
  };
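
  // Editorial note (not in the package source): with the defaults above,
  // search({ query, limit: 10, rerank: true }) pulls
  // 10 * RERANK_CANDIDATES_MULTIPLIER = 30 candidates from each backend,
  // fuses them, re-scores with the secondary model, and returns the top 10.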

  /**
   * Merge vector and keyword search results using Reciprocal Rank Fusion.
   * RRF score = sum(1 / (k + rank)) for each result across all rankings.
   */
  #reciprocalRankFusion = (
    vectorResults: { id: string; collection: string; document: string; content: string; distance: number }[],
    keywordResults: { id: string; collection: string; document: string; content: string; rank: number }[],
    limit: number,
  ): SearchChunkItem[] => {
    const scoreMap = new Map<string, { item: SearchChunkItem; score: number }>();

    // Add vector results with RRF scores
    vectorResults.forEach((item, rank) => {
      const rrfScore = 1 / (RRF_K + rank + 1);
      scoreMap.set(item.id, {
        item: {
          id: item.id,
          document: item.document,
          collection: item.collection,
          content: item.content,
          distance: item.distance,
        },
        score: rrfScore,
      });
    });

    // Add keyword results with RRF scores
    keywordResults.forEach((item, rank) => {
      const rrfScore = 1 / (RRF_K + rank + 1);
      const existing = scoreMap.get(item.id);

      if (existing) {
        // Combine scores if item appears in both result sets
        existing.score += rrfScore;
      } else {
        scoreMap.set(item.id, {
          item: {
            id: item.id,
            document: item.document,
            collection: item.collection,
            content: item.content,
            distance: 1, // Default distance for keyword-only results
          },
          score: rrfScore,
        });
      }
    });

    // Sort by combined RRF score (higher is better)
    const sorted = Array.from(scoreMap.values())
      .sort((a, b) => b.score - a.score)
      .slice(0, limit);

    return sorted.map(({ item, score }) => ({ ...item, score }));
  };
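
  // Worked example (editorial, not in the package source): with RRF_K = 60,
  // a chunk ranked 1st in the vector results and 3rd in the keyword results
  // scores 1/(60 + 1) + 1/(60 + 3) ≈ 0.0164 + 0.0159 ≈ 0.0323, beating a
  // chunk ranked 1st in only one list (≈ 0.0164) — appearing in both
  // rankings dominates a single high placement.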

  /**
   * Re-rank results using a secondary embedding model for higher precision.
   * Uses cosine similarity with a different model to diversify ranking signals.
   */
  #rerankResults = async (query: string, results: SearchChunkItem[]): Promise<SearchChunkItem[]> => {
    if (results.length === 0) return results;

    const reranker = await this.#getReranker();

    // Get embeddings for query and all result contents
    const queryEmbedding = await reranker(query, { pooling: 'mean', normalize: true });
    const contentEmbeddings = await reranker(
      results.map((r) => r.content),
      { pooling: 'mean', normalize: true },
    );

    // Compute cosine similarity scores
    const queryVec = queryEmbedding.tolist()[0];
    const contentVecs = contentEmbeddings.tolist();

    const scored = results.map((result, i) => {
      const similarity = cos_sim(queryVec, contentVecs[i]);
      return { ...result, score: similarity };
    });

    // Sort by re-ranker score (higher similarity is better)
    return scored.sort((a, b) => (b.score ?? 0) - (a.score ?? 0));
  };

  public getDocument = async (collection: string, id: string): Promise<ReferenceDocument | null> => {
    const databaseService = this.#services.get(DatabaseService);
    const database = await databaseService.getInstance();

    const [document] = await database(tableNames.referenceDocuments)
      .select('collection', 'id', 'content')
      .where({ collection, id })
      .limit(1);

    if (!document) {
      return null;
    }

    return {
      collection: document.collection,
      id: document.id,
      content: document.content,
    };
  };

  /**
   * Get all document IDs and hashes in a collection.
   */
  public getDocumentIds = async (collection: string): Promise<{ id: string; hash: string }[]> => {
    const databaseService = this.#services.get(DatabaseService);
    const database = await databaseService.getInstance();

    const documents = await database(tableNames.referenceDocuments).select('id', 'hash').where({ collection });

    return documents.map((doc) => ({ id: doc.id, hash: doc.hash }));
  };

  /**
   * Delete a specific document from a collection.
   */
  public deleteDocument = async (collection: string, id: string): Promise<void> => {
    const databaseService = this.#services.get(DatabaseService);
    const database = await databaseService.getInstance();

    await database.transaction(async (trx) => {
      await trx(tableNames.referenceDocumentChunks).delete().where({
        collection,
        document: id,
      });
      await trx(tableNames.referenceDocumentChunksFts).delete().where({
        collection,
        document: id,
      });
      await trx(tableNames.referenceDocuments).delete().where({
        collection,
        id,
      });
    });
  };

  /**
   * Delete multiple documents from a collection.
   */
  public deleteDocuments = async (collection: string, ids: string[]): Promise<void> => {
    if (ids.length === 0) return;

    const databaseService = this.#services.get(DatabaseService);
    const database = await databaseService.getInstance();

    await database.transaction(async (trx) => {
      await trx(tableNames.referenceDocumentChunks).delete().where({ collection }).whereIn('document', ids);
      await trx(tableNames.referenceDocumentChunksFts).delete().where({ collection }).whereIn('document', ids);
      await trx(tableNames.referenceDocuments).delete().where({ collection }).whereIn('id', ids);
    });
  };

  // === New methods for MCP tools v2 ===

  /**
   * List documents in a collection with pagination.
   */
  public listDocuments = async (params: ListDocumentsParams): Promise<ListDocumentsResult> => {
    const { collection, limit = 100, offset = 0 } = params;

    const databaseService = this.#services.get(DatabaseService);
    const database = await databaseService.getInstance();

    // Get total count
    const [{ count: total }] = await database(tableNames.referenceDocuments).where({ collection }).count('* as count');

    // Get documents with pagination
    const documents = await database(tableNames.referenceDocuments)
      .select('id', 'content')
      .where({ collection })
      .orderBy('id', 'asc')
      .limit(limit)
      .offset(offset);

    const documentInfos = documents.map((doc) => ({
      id: doc.id,
      title: this.#extractTitle(doc.content, doc.id),
      size: doc.content.length,
    }));

    return {
      documents: documentInfos,
      total: Number(total),
      hasMore: offset + documents.length < Number(total),
    };
  };

  /**
   * Get the outline (heading structure) of a document.
   */
  public getOutline = async (params: GetOutlineParams): Promise<OutlineResult | null> => {
    const { collection, document: documentId, maxDepth = 3 } = params;

    const doc = await this.getDocument(collection, documentId);
    if (!doc) {
      return null;
    }

    const title = this.#extractTitle(doc.content, documentId);
    const outline = this.#parseOutline(doc.content, maxDepth);

    return { title, outline };
  };

  /**
   * Parse markdown content to extract heading outline.
   */
  #parseOutline = (content: string, maxDepth: number): OutlineItem[] => {
    const lines = content.split('\n');
    const outline: OutlineItem[] = [];

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      const match = line.match(/^(#{1,6})\s+(.+)$/);
      if (match) {
        const level = match[1].length;
        if (level <= maxDepth) {
          outline.push({
            level,
            text: match[2].trim(),
            line: i + 1, // 1-indexed line numbers
          });
        }
      }
    }

    return outline;
  };

  /**
   * Get a specific section of a document by heading.
   */
  public getSection = async (params: GetSectionParams): Promise<SectionResult | null> => {
    const { collection, document: documentId, section, includeSubsections = true } = params;

    const doc = await this.getDocument(collection, documentId);
    if (!doc) {
      return null;
    }

    const lines = doc.content.split('\n');
    let startLine = -1;
    let endLine = lines.length;
    let matchedHeading = '';
    let headingLevel = 0;

    // Find the section heading (case-insensitive substring match)
    const sectionLower = section.toLowerCase();
    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      const match = line.match(/^(#{1,6})\s+(.+)$/);
      if (match) {
        const level = match[1].length;
        const text = match[2].trim();

        if (startLine === -1) {
          // Looking for the start
          if (text.toLowerCase().includes(sectionLower)) {
            startLine = i;
            matchedHeading = text;
            headingLevel = level;
          }
        } else {
          // Looking for the end
          if (includeSubsections) {
            // Stop at same or higher level heading
            if (level <= headingLevel) {
              endLine = i;
              break;
            }
          } else {
            // Stop at any heading
            endLine = i;
            break;
          }
        }
      }
    }

    if (startLine === -1) {
      return null;
    }

    const sectionContent = lines.slice(startLine, endLine).join('\n');

    return {
      section: matchedHeading,
      level: headingLevel,
      content: sectionContent,
      startLine: startLine + 1, // 1-indexed
      endLine: endLine, // 1-indexed (exclusive)
    };
  };

  /**
   * Find content related to a document or chunk.
   */
  public findRelated = async (params: FindRelatedParams): Promise<SearchChunkItem[]> => {
    const { collection, document: documentId, chunk, limit = 5, sameDocument = false } = params;

    const databaseService = this.#services.get(DatabaseService);
    const database = await databaseService.getInstance();
    const embedder = this.#services.get(EmbedderService);

    let queryEmbedding: number[];

    if (chunk) {
      // Embed the provided chunk
      queryEmbedding = await embedder.createQueryEmbedding(chunk);
    } else {
      // Compute centroid of document's chunk embeddings
      const chunks = await database(tableNames.referenceDocumentChunks)
        .select('embedding')
        .where({ collection, document: documentId });

      if (chunks.length === 0) {
        return [];
      }

      // Parse embeddings and compute mean
      const embeddings = chunks.map((c) => JSON.parse(c.embedding) as number[]);
      const dimensions = embeddings[0].length;
      const centroid = new Array(dimensions).fill(0);

      for (const emb of embeddings) {
        for (let i = 0; i < dimensions; i++) {
          centroid[i] += emb[i];
        }
      }

      for (let i = 0; i < dimensions; i++) {
        centroid[i] /= embeddings.length;
      }

      queryEmbedding = centroid;
    }

    // Search for similar chunks
    let query = database(tableNames.referenceDocumentChunks)
      .select('id', 'collection', 'document', 'content')
      .select(database.raw('vec_distance_cosine(?, embedding) as distance', [JSON.stringify(queryEmbedding)]));

    // Exclude source document unless sameDocument is true
    if (!sameDocument) {
      // Use explicit whereNot with function to ensure correct SQL generation
      query = query.whereNot(function () {
        this.where('collection', collection).andWhere('document', documentId);
      });
    }
    // When sameDocument is true, we include all chunks (no exclusion)

    query = query.orderBy('distance', 'asc').limit(limit);

    const results = await query;

    return results.map((row) => ({
      id: row.id,
      document: row.document,
      collection: row.collection,
      content: row.content,
      distance: row.distance,
      score: 1 - row.distance, // Convert distance to similarity score
    }));
  };

  /**
   * Execute multiple search queries in batch.
   */
  public searchBatch = async (params: SearchBatchParams): Promise<SearchBatchResult> => {
    const { queries, limit = 5, maxDistance, hybridSearch = true } = params;

    const results = [];

    for (const q of queries) {
      const searchResults = await this.search({
        query: q.query,
        collections: q.collections,
        limit,
        maxDistance,
        hybridSearch,
        rerank: false, // Don't rerank in batch for performance
      });

      results.push({
        query: q.query,
        results: searchResults,
      });
    }

    return { results };
  };
}

export { DocumentsService };
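
For orientation, a minimal usage sketch of the service above. This is editorial, not part of the published package: it assumes a `Services` registry (from `utils.services.ts`) has been wired up elsewhere with `DatabaseService` and `EmbedderService` registered — how that registry is constructed is not shown in this file.

```ts
// Editorial sketch — not shipped in ctxpkg. The Services instance is assumed
// to be provided by the application's own wiring.
import type { Services } from '#root/utils/utils.services.ts';
import { DocumentsService } from '#root/documents/documents.ts';

declare const services: Services; // assumed to be constructed elsewhere

const documents = new DocumentsService(services);

// Index every markdown file under ./docs into the 'docs' collection;
// unchanged files are skipped via the SHA-256 hash check in updateDocument.
await documents.updateCollectionFromGlob({ pattern: '**/*.md', cwd: './docs', collection: 'docs' });

// Hybrid (vector + FTS5) search, re-scored by the secondary model.
const hits = await documents.search({
  query: 'how does hybrid search work?',
  collections: ['docs'],
  limit: 5,
  rerank: true,
});
```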