ctxpkg 0.0.1

Files changed (61)
  1. package/LICENSE +661 -0
  2. package/README.md +282 -0
  3. package/bin/cli.js +8 -0
  4. package/bin/daemon.js +7 -0
  5. package/package.json +70 -0
  6. package/src/agent/AGENTS.md +249 -0
  7. package/src/agent/agent.prompts.ts +66 -0
  8. package/src/agent/agent.test-runner.schemas.ts +158 -0
  9. package/src/agent/agent.test-runner.ts +436 -0
  10. package/src/agent/agent.ts +371 -0
  11. package/src/agent/agent.types.ts +94 -0
  12. package/src/backend/AGENTS.md +112 -0
  13. package/src/backend/backend.protocol.ts +95 -0
  14. package/src/backend/backend.schemas.ts +123 -0
  15. package/src/backend/backend.services.ts +151 -0
  16. package/src/backend/backend.ts +111 -0
  17. package/src/backend/backend.types.ts +34 -0
  18. package/src/cli/AGENTS.md +213 -0
  19. package/src/cli/cli.agent.ts +197 -0
  20. package/src/cli/cli.chat.ts +369 -0
  21. package/src/cli/cli.client.ts +55 -0
  22. package/src/cli/cli.collections.ts +491 -0
  23. package/src/cli/cli.config.ts +252 -0
  24. package/src/cli/cli.daemon.ts +160 -0
  25. package/src/cli/cli.documents.ts +413 -0
  26. package/src/cli/cli.mcp.ts +177 -0
  27. package/src/cli/cli.ts +28 -0
  28. package/src/cli/cli.utils.ts +122 -0
  29. package/src/client/AGENTS.md +135 -0
  30. package/src/client/client.adapters.ts +279 -0
  31. package/src/client/client.ts +86 -0
  32. package/src/client/client.types.ts +17 -0
  33. package/src/collections/AGENTS.md +185 -0
  34. package/src/collections/collections.schemas.ts +195 -0
  35. package/src/collections/collections.ts +1160 -0
  36. package/src/config/config.ts +118 -0
  37. package/src/daemon/AGENTS.md +168 -0
  38. package/src/daemon/daemon.config.ts +23 -0
  39. package/src/daemon/daemon.manager.ts +215 -0
  40. package/src/daemon/daemon.schemas.ts +22 -0
  41. package/src/daemon/daemon.ts +205 -0
  42. package/src/database/AGENTS.md +211 -0
  43. package/src/database/database.ts +64 -0
  44. package/src/database/migrations/migrations.001-init.ts +56 -0
  45. package/src/database/migrations/migrations.002-fts5.ts +32 -0
  46. package/src/database/migrations/migrations.ts +20 -0
  47. package/src/database/migrations/migrations.types.ts +9 -0
  48. package/src/documents/AGENTS.md +301 -0
  49. package/src/documents/documents.schemas.ts +190 -0
  50. package/src/documents/documents.ts +734 -0
  51. package/src/embedder/embedder.ts +53 -0
  52. package/src/exports.ts +0 -0
  53. package/src/mcp/AGENTS.md +264 -0
  54. package/src/mcp/mcp.ts +105 -0
  55. package/src/tools/AGENTS.md +228 -0
  56. package/src/tools/agent/agent.ts +45 -0
  57. package/src/tools/documents/documents.ts +401 -0
  58. package/src/tools/tools.langchain.ts +37 -0
  59. package/src/tools/tools.mcp.ts +46 -0
  60. package/src/tools/tools.types.ts +35 -0
  61. package/src/utils/utils.services.ts +46 -0
@@ -0,0 +1,734 @@
+ import { createHash, randomUUID } from 'node:crypto';
+ import { glob, readFile } from 'node:fs/promises';
+ import { resolve } from 'node:path';
+
+ import { TokenTextSplitter } from '@langchain/textsplitters';
+ import { type FeatureExtractionPipeline, pipeline, cos_sim } from '@huggingface/transformers';
+
+ import {
+ searchChunkItemSchema,
+ type ReferenceDocument,
+ type SearchChunksOptions,
+ type SearchChunkItem,
+ type ListDocumentsParams,
+ type ListDocumentsResult,
+ type GetOutlineParams,
+ type OutlineResult,
+ type OutlineItem,
+ type GetSectionParams,
+ type SectionResult,
+ type FindRelatedParams,
+ type SearchBatchParams,
+ type SearchBatchResult,
+ } from './documents.schemas.ts';
+
+ import type { Services } from '#root/utils/utils.services.ts';
+ import { DatabaseService, tableNames } from '#root/database/database.ts';
+ import { EmbedderService } from '#root/embedder/embedder.ts';
+
+ // Chunking configuration
+ const CHUNK_SIZE = 400;
+ const CHUNK_OVERLAP = 80;
+
+ // Search configuration
+ const RRF_K = 60; // Reciprocal Rank Fusion constant
+ const RERANK_CANDIDATES_MULTIPLIER = 3; // Fetch 3x candidates for re-ranking
+
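Note that CHUNK_SIZE and CHUNK_OVERLAP are measured in cl100k_base tokens rather than characters: each chunk spans roughly 400 tokens, and 80 tokens are repeated across adjacent chunks so content near a boundary is retrievable from either side. A minimal standalone sketch of the same splitter settings (the sample text is made up):

    import { TokenTextSplitter } from '@langchain/textsplitters';

    // Same configuration as documents.ts: ~400-token windows, 80-token overlap.
    const splitter = new TokenTextSplitter({
      encodingName: 'cl100k_base',
      chunkSize: 400,
      chunkOverlap: 80,
    });

    // splitText() yields plain string chunks; updateDocument() below uses
    // createDocuments() instead, which also carries per-chunk line metadata.
    const sample = '# Title\n\n' + 'Some documentation text. '.repeat(500);
    const chunks = await splitter.splitText(sample);
    console.log(chunks.length); // several overlapping ~400-token chunks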
+ class DocumentsService {
+ #services: Services;
+ #reranker?: Promise<FeatureExtractionPipeline>;
+
+ constructor(services: Services) {
+ this.#services = services;
+ }
+
+ /**
+ * Lazily initialize the re-ranker model.
+ * Uses a smaller, faster model for re-ranking candidates.
+ */
+ #getReranker = async (): Promise<FeatureExtractionPipeline> => {
+ if (!this.#reranker) {
+ // Use a smaller model for fast re-ranking
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
+ const loadPipeline = pipeline as any;
+ this.#reranker = loadPipeline(
+ 'feature-extraction',
+ 'Xenova/all-MiniLM-L6-v2',
+ ) as Promise<FeatureExtractionPipeline>;
+ }
+ return this.#reranker;
+ };
+
+ /**
+ * Extract document title from markdown content.
+ */
+ #extractTitle = (content: string, fallback: string): string => {
+ const titleMatch = content.match(/^#\s+(.+)$/m);
+ return titleMatch?.[1]?.trim() || fallback;
+ };
+
+ /**
+ * Extract the nearest preceding heading for a chunk position.
+ */
+ #extractSectionHeading = (content: string, chunkStart: number): string | null => {
+ const beforeChunk = content.slice(0, chunkStart);
+ const headings = beforeChunk.match(/^#{1,6}\s+.+$/gm);
+ return headings?.[headings.length - 1]?.replace(/^#+\s+/, '') || null;
+ };
+
+ public listCollections = async () => {
+ const databaseService = this.#services.get(DatabaseService);
+ const database = await databaseService.getInstance();
+
+ // Get document counts per collection
+ const docCounts = await database(tableNames.referenceDocuments)
+ .select('collection', database.raw('COUNT(*) as document_count'))
+ .groupBy('collection')
+ .orderBy('collection', 'asc');
+
+ // Get collection metadata (description, version) from collections table
+ const collectionIds = docCounts.map((c) => c.collection);
+ const collectionMeta = await database(tableNames.collections)
+ .select('id', 'description', 'version')
+ .whereIn('id', collectionIds);
+
+ // Build a map of collection ID -> metadata
+ const metaMap = new Map<string, { description: string | null; version: string | null }>();
+ for (const meta of collectionMeta) {
+ metaMap.set(meta.id, { description: meta.description, version: meta.version });
+ }
+
+ // Merge counts with metadata
+ return docCounts.map((c) => {
+ const meta = metaMap.get(c.collection);
+ return {
+ collection: c.collection,
+ document_count: c.document_count,
+ description: meta?.description ?? null,
+ version: meta?.version ?? null,
+ };
+ });
+ };
+
+ public dropCollection = async (collection: string) => {
+ const databaseService = this.#services.get(DatabaseService);
+ const database = await databaseService.getInstance();
+
+ await database.transaction(async (trx) => {
+ await trx(tableNames.referenceDocumentChunks).delete().where({ collection });
+ await trx(tableNames.referenceDocumentChunksFts).delete().where({ collection });
+ await trx(tableNames.referenceDocuments).delete().where({ collection });
+ });
+ };
+
+ public updateCollectionFromGlob = async (options: { pattern: string; cwd: string; collection?: string }) => {
+ const { pattern, collection, cwd } = options;
+ for await (const file of glob(pattern, { cwd })) {
+ const fullPath = resolve(cwd, file);
+ const content = await readFile(fullPath, 'utf8');
+ await this.updateDocument({
+ collection: collection || cwd,
+ id: file,
+ content,
+ });
+ }
+ };
+
+ public updateDocument = async (document: ReferenceDocument) => {
+ const databaseService = this.#services.get(DatabaseService);
+ const database = await databaseService.getInstance();
+ const hash = createHash('sha256').update(document.content).digest('hex');
+ const [current] = await database(tableNames.referenceDocuments)
+ .where({ collection: document.collection, id: document.id })
+ .limit(1);
+
+ if (current && current.hash === hash) {
+ return;
+ }
+
+ await database.transaction(async (trx) => {
+ // Clean up existing chunks (both vector and FTS)
+ if (current) {
+ await trx(tableNames.referenceDocumentChunks).delete().where({
+ collection: document.collection,
+ document: document.id,
+ });
+ await trx(tableNames.referenceDocumentChunksFts).delete().where({
+ collection: document.collection,
+ document: document.id,
+ });
+ await trx(tableNames.referenceDocuments)
+ .update({
+ hash,
+ content: document.content,
+ })
+ .where({
+ collection: document.collection,
+ id: document.id,
+ });
+ } else {
+ await trx(tableNames.referenceDocuments).insert({
+ collection: document.collection,
+ id: document.id,
+ hash,
+ content: document.content,
+ });
+ }
+
+ // Create chunks with improved settings
+ const splitter = new TokenTextSplitter({
+ encodingName: 'cl100k_base',
+ chunkSize: CHUNK_SIZE,
+ chunkOverlap: CHUNK_OVERLAP,
+ });
+ const chunks = (await splitter.createDocuments([document.content])) as {
+ pageContent: string;
+ metadata: { loc: { lines: { from: number; to: number } } };
+ }[];
+
+ // Extract document title for context
+ const title = this.#extractTitle(document.content, document.id);
+
+ // Create contextualized chunks with document and section context
+ const contextualizedChunks = chunks.map((chunk) => {
+ // Find the character position approximately (using line info if available)
+ const lines = document.content.split('\n');
+ let charPos = 0;
+ const startLine = chunk.metadata?.loc?.lines?.from ?? 0;
+ for (let i = 0; i < startLine && i < lines.length; i++) {
+ charPos += lines[i].length + 1;
+ }
+
+ const sectionHeading = this.#extractSectionHeading(document.content, charPos);
+
+ // Build context prefix
+ let contextPrefix = `Document: ${title}`;
+ if (sectionHeading && sectionHeading !== title) {
+ contextPrefix += `\nSection: ${sectionHeading}`;
+ }
+
+ return {
+ // Embed with context for better semantic understanding
+ textForEmbedding: `${contextPrefix}\n\n${chunk.pageContent}`,
+ // Store original content for display
+ originalContent: chunk.pageContent,
+ };
+ });
+
+ // Create embeddings using document embedding method (no query instruction)
+ const embedder = this.#services.get(EmbedderService);
+ const embeddings = await embedder.createDocumentEmbeddings(contextualizedChunks.map((c) => c.textForEmbedding));
+
+ // Insert chunks into vector table
+ const chunkRecords = embeddings.map((embedding, i) => ({
+ id: randomUUID(),
+ collection: document.collection,
+ document: document.id,
+ content: contextualizedChunks[i].originalContent,
+ embedding: JSON.stringify(embedding),
+ }));
+
+ await trx(tableNames.referenceDocumentChunks).insert(chunkRecords);
+
+ // Insert into FTS5 table for hybrid search
+ await trx(tableNames.referenceDocumentChunksFts).insert(
+ chunkRecords.map((record) => ({
+ id: record.id,
+ collection: record.collection,
+ document: record.document,
+ content: record.content,
+ })),
+ );
+ });
+ };
+
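Note how updateDocument embeds a different string than it stores: the embedding input is the chunk text prefixed with the document title and nearest section heading, while only the raw chunk text lands in the content column. A worked illustration with made-up document and section names:

    // Shape of textForEmbedding for one chunk (names are illustrative):
    const textForEmbedding = [
      'Document: Getting Started',  // first '# ' heading, via #extractTitle
      'Section: Installation',      // nearest preceding heading, via #extractSectionHeading
      '',
      '...raw chunk text produced by TokenTextSplitter...',
    ].join('\n');
    // Only the raw chunk text is stored and returned by search; the prefix
    // exists purely to give the embedding model document-level context.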
+ public search = async (options: SearchChunksOptions): Promise<SearchChunkItem[]> => {
+ const { query, collections, limit = 10, maxDistance, hybridSearch = true, rerank = false } = options;
+
+ const databaseService = this.#services.get(DatabaseService);
+ const database = await databaseService.getInstance();
+
+ // Determine how many candidates to fetch
+ const candidateLimit = rerank ? limit * RERANK_CANDIDATES_MULTIPLIER : limit;
+
+ // 1. Vector similarity search using cosine distance
+ const embedder = this.#services.get(EmbedderService);
+ const queryEmbedding = await embedder.createQueryEmbedding(query);
+
+ // Build vector search query
+ // Note: the maxDistance filter is applied in JS after the query runs,
+ // since SQLite cannot reference the computed distance alias in WHERE
+ let vectorQuery = database(tableNames.referenceDocumentChunks)
+ .select('id', 'collection', 'document', 'content')
+ .select(database.raw('vec_distance_cosine(?, embedding) as distance', [JSON.stringify(queryEmbedding)]));
+
+ if (collections) {
+ vectorQuery = vectorQuery.whereIn('collection', collections);
+ }
+
+ vectorQuery = vectorQuery.orderBy('distance', 'asc').limit(candidateLimit);
+
+ let vectorResults = await vectorQuery;
+
+ // Filter by maxDistance if specified (done in JS since SQLite doesn't support HAVING on computed columns)
+ if (maxDistance !== undefined) {
+ vectorResults = vectorResults.filter((row) => row.distance <= maxDistance);
+ }
+
+ // 2. Keyword search using FTS5 (if hybrid search enabled)
+ let keywordResults: { id: string; collection: string; document: string; content: string; rank: number }[] = [];
+
+ if (hybridSearch) {
+ // Escape special FTS5 characters and create search query
+ const ftsQuery = query
+ .replace(/['"(){}[\]*:^~\\]/g, ' ')
+ .split(/\s+/)
+ .filter((term) => term.length > 0)
+ .map((term) => `"${term}"`)
+ .join(' OR ');
+
+ if (ftsQuery) {
+ let ftsDbQuery = database(tableNames.referenceDocumentChunksFts)
+ .select('id', 'collection', 'document', 'content')
+ .select(database.raw('rank as rank'))
+ .whereRaw(`${tableNames.referenceDocumentChunksFts} MATCH ?`, [ftsQuery]);
+
+ if (collections) {
+ ftsDbQuery = ftsDbQuery.whereIn('collection', collections);
+ }
+
+ ftsDbQuery = ftsDbQuery.orderBy('rank', 'asc').limit(candidateLimit);
+
+ try {
+ keywordResults = await ftsDbQuery;
+ } catch {
+ // FTS query might fail for edge cases, fall back to vector-only
+ keywordResults = [];
+ }
+ }
+ }
+
+ // 3. Merge results using Reciprocal Rank Fusion (RRF)
+ let mergedResults: SearchChunkItem[];
+
+ if (hybridSearch && keywordResults.length > 0) {
+ mergedResults = this.#reciprocalRankFusion(vectorResults, keywordResults, candidateLimit);
+ } else {
+ // Vector-only results
+ mergedResults = vectorResults.map((row) => ({
+ id: row.id,
+ document: row.document,
+ collection: row.collection,
+ content: row.content,
+ distance: row.distance,
+ score: 1 / (RRF_K + 1), // Single source score
+ }));
+ }
+
+ // 4. Re-rank using a secondary embedding model (if enabled)
+ if (rerank && mergedResults.length > 0) {
+ mergedResults = await this.#rerankResults(query, mergedResults);
+ }
+
+ // 5. Apply final limit and return
+ return mergedResults.slice(0, limit).map((row) => searchChunkItemSchema.parse(row));
+ };
+
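A hedged usage sketch of the search path above; the Services wiring is elided because it depends on the rest of the package, and the collection name is illustrative:

    // Assumes `services` is an initialized Services container (setup elided).
    const documents = new DocumentsService(services);

    const hits = await documents.search({
      query: 'how do I configure the daemon?',
      collections: ['docs'],  // illustrative collection name
      limit: 5,
      hybridSearch: true,     // vector + FTS5 keyword search, fused below
      rerank: true,           // fetches 3x candidates, re-ranks with MiniLM
    });

    for (const hit of hits) {
      console.log(hit.document, hit.distance.toFixed(3), hit.score?.toFixed(4));
    }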
+ /**
+ * Merge vector and keyword search results using Reciprocal Rank Fusion.
+ * RRF score = sum(1 / (k + rank)) for each result across all rankings.
+ */
+ #reciprocalRankFusion = (
+ vectorResults: { id: string; collection: string; document: string; content: string; distance: number }[],
+ keywordResults: { id: string; collection: string; document: string; content: string; rank: number }[],
+ limit: number,
+ ): SearchChunkItem[] => {
+ const scoreMap = new Map<string, { item: SearchChunkItem; score: number }>();
+
+ // Add vector results with RRF scores
+ vectorResults.forEach((item, rank) => {
+ const rrfScore = 1 / (RRF_K + rank + 1);
+ scoreMap.set(item.id, {
+ item: {
+ id: item.id,
+ document: item.document,
+ collection: item.collection,
+ content: item.content,
+ distance: item.distance,
+ },
+ score: rrfScore,
+ });
+ });
+
+ // Add keyword results with RRF scores
+ keywordResults.forEach((item, rank) => {
+ const rrfScore = 1 / (RRF_K + rank + 1);
+ const existing = scoreMap.get(item.id);
+
+ if (existing) {
+ // Combine scores if item appears in both result sets
+ existing.score += rrfScore;
+ } else {
+ scoreMap.set(item.id, {
+ item: {
+ id: item.id,
+ document: item.document,
+ collection: item.collection,
+ content: item.content,
+ distance: 1, // Default distance for keyword-only results
+ },
+ score: rrfScore,
+ });
+ }
+ });
+
+ // Sort by combined RRF score (higher is better)
+ const sorted = Array.from(scoreMap.values())
+ .sort((a, b) => b.score - a.score)
+ .slice(0, limit);
+
+ return sorted.map(({ item, score }) => ({ ...item, score }));
+ };
+
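With RRF_K = 60 the fusion arithmetic is easy to check by hand: a chunk ranked first in both lists scores 1/61 + 1/61 ≈ 0.0328, while a chunk ranked first in only one list scores 1/61 ≈ 0.0164, so agreement between the two retrievers roughly doubles a chunk's score while exact rank positions matter only mildly. The same formula in isolation:

    const RRF_K = 60;

    // rank is 0-based, matching the forEach index above.
    const rrfScore = (rank: number): number => 1 / (RRF_K + rank + 1);

    console.log(rrfScore(0) + rrfScore(0)); // top of both lists: ~0.0328
    console.log(rrfScore(0));               // top of one list only: ~0.0164
    console.log(rrfScore(9));               // rank 10 in one list: ~0.0143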
+ /**
+ * Re-rank results using a secondary embedding model for higher precision.
+ * Uses cosine similarity with a different model to diversify ranking signals.
+ */
+ #rerankResults = async (query: string, results: SearchChunkItem[]): Promise<SearchChunkItem[]> => {
+ if (results.length === 0) return results;
+
+ const reranker = await this.#getReranker();
+
+ // Get embeddings for query and all result contents
+ const queryEmbedding = await reranker(query, { pooling: 'mean', normalize: true });
+ const contentEmbeddings = await reranker(
+ results.map((r) => r.content),
+ { pooling: 'mean', normalize: true },
+ );
+
+ // Compute cosine similarity scores
+ const queryVec = queryEmbedding.tolist()[0];
+ const contentVecs = contentEmbeddings.tolist();
+
+ const scored = results.map((result, i) => {
+ const similarity = cos_sim(queryVec, contentVecs[i]);
+ return { ...result, score: similarity };
+ });
+
+ // Sort by re-ranker score (higher similarity is better)
+ return scored.sort((a, b) => (b.score ?? 0) - (a.score ?? 0));
+ };
+
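Note that this is a bi-encoder re-rank: the query and each candidate are embedded independently and compared with cosine similarity, rather than scored jointly as a pair the way a true cross-encoder would. The cos_sim helper imported at the top of the file operates on plain number arrays:

    import { cos_sim } from '@huggingface/transformers';

    // Cosine similarity of two unit-length vectors (toy values):
    const a = [1, 0, 0];
    const b = [0.6, 0.8, 0];
    console.log(cos_sim(a, b)); // 0.6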
+ public getDocument = async (collection: string, id: string): Promise<ReferenceDocument | null> => {
+ const databaseService = this.#services.get(DatabaseService);
+ const database = await databaseService.getInstance();
+
+ const [document] = await database(tableNames.referenceDocuments)
+ .select('collection', 'id', 'content')
+ .where({ collection, id })
+ .limit(1);
+
+ if (!document) {
+ return null;
+ }
+
+ return {
+ collection: document.collection,
+ id: document.id,
+ content: document.content,
+ };
+ };
+
+ /**
+ * Get all document IDs and hashes in a collection.
+ */
+ public getDocumentIds = async (collection: string): Promise<{ id: string; hash: string }[]> => {
+ const databaseService = this.#services.get(DatabaseService);
+ const database = await databaseService.getInstance();
+
+ const documents = await database(tableNames.referenceDocuments).select('id', 'hash').where({ collection });
+
+ return documents.map((doc) => ({ id: doc.id, hash: doc.hash }));
+ };
+
+ /**
+ * Delete a specific document from a collection.
+ */
+ public deleteDocument = async (collection: string, id: string): Promise<void> => {
+ const databaseService = this.#services.get(DatabaseService);
+ const database = await databaseService.getInstance();
+
+ await database.transaction(async (trx) => {
+ await trx(tableNames.referenceDocumentChunks).delete().where({
+ collection,
+ document: id,
+ });
+ await trx(tableNames.referenceDocumentChunksFts).delete().where({
+ collection,
+ document: id,
+ });
+ await trx(tableNames.referenceDocuments).delete().where({
+ collection,
+ id,
+ });
+ });
+ };
+
+ /**
+ * Delete multiple documents from a collection.
+ */
+ public deleteDocuments = async (collection: string, ids: string[]): Promise<void> => {
+ if (ids.length === 0) return;
+
+ const databaseService = this.#services.get(DatabaseService);
+ const database = await databaseService.getInstance();
+
+ await database.transaction(async (trx) => {
+ await trx(tableNames.referenceDocumentChunks).delete().where({ collection }).whereIn('document', ids);
+ await trx(tableNames.referenceDocumentChunksFts).delete().where({ collection }).whereIn('document', ids);
+ await trx(tableNames.referenceDocuments).delete().where({ collection }).whereIn('id', ids);
+ });
+ };
+
+ // === New methods for MCP tools v2 ===
+
+ /**
+ * List documents in a collection with pagination.
+ */
+ public listDocuments = async (params: ListDocumentsParams): Promise<ListDocumentsResult> => {
+ const { collection, limit = 100, offset = 0 } = params;
+
+ const databaseService = this.#services.get(DatabaseService);
+ const database = await databaseService.getInstance();
+
+ // Get total count
+ const [{ count: total }] = await database(tableNames.referenceDocuments).where({ collection }).count('* as count');
+
+ // Get documents with pagination
+ const documents = await database(tableNames.referenceDocuments)
+ .select('id', 'content')
+ .where({ collection })
+ .orderBy('id', 'asc')
+ .limit(limit)
+ .offset(offset);
+
+ const documentInfos = documents.map((doc) => ({
+ id: doc.id,
+ title: this.#extractTitle(doc.content, doc.id),
+ size: doc.content.length,
+ }));
+
+ return {
+ documents: documentInfos,
+ total: Number(total),
+ hasMore: offset + documents.length < Number(total),
+ };
+ };
+
+ /**
+ * Get the outline (heading structure) of a document.
+ */
+ public getOutline = async (params: GetOutlineParams): Promise<OutlineResult | null> => {
+ const { collection, document: documentId, maxDepth = 3 } = params;
+
+ const doc = await this.getDocument(collection, documentId);
+ if (!doc) {
+ return null;
+ }
+
+ const title = this.#extractTitle(doc.content, documentId);
+ const outline = this.#parseOutline(doc.content, maxDepth);
+
+ return { title, outline };
+ };
+
+ /**
+ * Parse markdown content to extract heading outline.
+ */
+ #parseOutline = (content: string, maxDepth: number): OutlineItem[] => {
+ const lines = content.split('\n');
+ const outline: OutlineItem[] = [];
+
+ for (let i = 0; i < lines.length; i++) {
+ const line = lines[i];
+ const match = line.match(/^(#{1,6})\s+(.+)$/);
+ if (match) {
+ const level = match[1].length;
+ if (level <= maxDepth) {
+ outline.push({
+ level,
+ text: match[2].trim(),
+ line: i + 1, // 1-indexed line numbers
+ });
+ }
+ }
+ }
+
+ return outline;
+ };
+
+ /**
+ * Get a specific section of a document by heading.
+ */
+ public getSection = async (params: GetSectionParams): Promise<SectionResult | null> => {
+ const { collection, document: documentId, section, includeSubsections = true } = params;
+
+ const doc = await this.getDocument(collection, documentId);
+ if (!doc) {
+ return null;
+ }
+
+ const lines = doc.content.split('\n');
+ let startLine = -1;
+ let endLine = lines.length;
+ let matchedHeading = '';
+ let headingLevel = 0;
+
+ // Find the section heading (case-insensitive substring match)
+ const sectionLower = section.toLowerCase();
+ for (let i = 0; i < lines.length; i++) {
+ const line = lines[i];
+ const match = line.match(/^(#{1,6})\s+(.+)$/);
+ if (match) {
+ const level = match[1].length;
+ const text = match[2].trim();
+
+ if (startLine === -1) {
+ // Looking for the start
+ if (text.toLowerCase().includes(sectionLower)) {
+ startLine = i;
+ matchedHeading = text;
+ headingLevel = level;
+ }
+ } else {
+ // Looking for the end
+ if (includeSubsections) {
+ // Stop at same or higher level heading
+ if (level <= headingLevel) {
+ endLine = i;
+ break;
+ }
+ } else {
+ // Stop at any heading
+ endLine = i;
+ break;
+ }
+ }
+ }
+ }
+
+ if (startLine === -1) {
+ return null;
+ }
+
+ const sectionContent = lines.slice(startLine, endLine).join('\n');
+
+ return {
+ section: matchedHeading,
+ level: headingLevel,
+ content: sectionContent,
+ startLine: startLine + 1, // 1-indexed
+ endLine: endLine, // 1-indexed (exclusive)
+ };
+ };
+
+ /**
+ * Find content related to a document or chunk.
+ */
+ public findRelated = async (params: FindRelatedParams): Promise<SearchChunkItem[]> => {
+ const { collection, document: documentId, chunk, limit = 5, sameDocument = false } = params;
+
+ const databaseService = this.#services.get(DatabaseService);
+ const database = await databaseService.getInstance();
+ const embedder = this.#services.get(EmbedderService);
+
+ let queryEmbedding: number[];
+
+ if (chunk) {
+ // Embed the provided chunk
+ queryEmbedding = await embedder.createQueryEmbedding(chunk);
+ } else {
+ // Compute centroid of document's chunk embeddings
+ const chunks = await database(tableNames.referenceDocumentChunks)
+ .select('embedding')
+ .where({ collection, document: documentId });
+
+ if (chunks.length === 0) {
+ return [];
+ }
+
+ // Parse embeddings and compute mean
+ const embeddings = chunks.map((c) => JSON.parse(c.embedding) as number[]);
+ const dimensions = embeddings[0].length;
+ const centroid = new Array(dimensions).fill(0);
+
+ for (const emb of embeddings) {
+ for (let i = 0; i < dimensions; i++) {
+ centroid[i] += emb[i];
+ }
+ }
+
+ for (let i = 0; i < dimensions; i++) {
+ centroid[i] /= embeddings.length;
+ }
+
+ queryEmbedding = centroid;
+ }
+
+ // Search for similar chunks
+ let query = database(tableNames.referenceDocumentChunks)
+ .select('id', 'collection', 'document', 'content')
+ .select(database.raw('vec_distance_cosine(?, embedding) as distance', [JSON.stringify(queryEmbedding)]));
+
+ // Exclude source document unless sameDocument is true
+ if (!sameDocument) {
+ // Use explicit whereNot with function to ensure correct SQL generation
+ query = query.whereNot(function () {
+ this.where('collection', collection).andWhere('document', documentId);
+ });
+ }
+ // When sameDocument is true, we include all chunks (no exclusion)
+
+ query = query.orderBy('distance', 'asc').limit(limit);
+
+ const results = await query;
+
+ return results.map((row) => ({
+ id: row.id,
+ document: row.document,
+ collection: row.collection,
+ content: row.content,
+ distance: row.distance,
+ score: 1 - row.distance, // Convert distance to similarity score
+ }));
+ };
+
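When no chunk text is given, findRelated represents the source document as the centroid (element-wise mean) of its chunk embeddings and searches against that vector. A usage sketch reusing the `documents` instance from the earlier example (collection and document IDs are illustrative):

    // Chunks related to an entire document, excluding the document itself:
    const related = await documents.findRelated({
      collection: 'docs',           // illustrative
      document: 'guides/daemon.md', // illustrative
      limit: 5,
      sameDocument: false,
    });
    console.log(related.map((r) => `${r.document}: ${r.score?.toFixed(4)}`));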
+ /**
+ * Execute multiple search queries in batch.
+ */
+ public searchBatch = async (params: SearchBatchParams): Promise<SearchBatchResult> => {
+ const { queries, limit = 5, maxDistance, hybridSearch = true } = params;
+
+ const results = [];
+
+ for (const q of queries) {
+ const searchResults = await this.search({
+ query: q.query,
+ collections: q.collections,
+ limit,
+ maxDistance,
+ hybridSearch,
+ rerank: false, // Don't rerank in batch for performance
+ });
+
+ results.push({
+ query: q.query,
+ results: searchResults,
+ });
+ }
+
+ return { results };
+ };
+ }
+
+ export { DocumentsService };
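Taken together, a plausible end-to-end flow is to index a directory of markdown and then query it. A sketch under the same assumptions as the earlier examples (Services wiring elided; the paths and collection name are illustrative):

    // Index every markdown file under ./docs into a 'docs' collection.
    // Unchanged files are skipped via the sha256 hash check in updateDocument.
    await documents.updateCollectionFromGlob({
      pattern: '**/*.md',
      cwd: './docs',
      collection: 'docs',
    });

    // Hybrid search across the freshly indexed collection.
    const results = await documents.search({ query: 'embedding model', limit: 3 });
    console.log(results.map((r) => r.document));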