@arabold/docs-mcp-server 1.18.0 → 1.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1917 @@
1
+ import fs from "node:fs";
2
+ import path from "node:path";
3
+ import envPaths from "env-paths";
4
+ import Fuse from "fuse.js";
5
+ import semver__default from "semver";
6
+ import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
7
+ import remarkGfm from "remark-gfm";
8
+ import remarkHtml from "remark-html";
9
+ import remarkParse from "remark-parse";
10
+ import TurndownService from "turndown";
11
+ import { unified } from "unified";
12
+ import { l as logger, c as createJSDOM, V as VECTOR_DIMENSION, S as StoreError, D as DimensionError, a as applyMigrations, C as ConnectionError, d as denormalizeVersionName, n as normalizeVersionName, E as EMBEDDING_BATCH_CHARS, b as EMBEDDING_BATCH_SIZE, m as mapDbDocumentToDocument, g as getProjectRoot, e as SPLITTER_PREFERRED_CHUNK_SIZE, f as SPLITTER_MAX_CHUNK_SIZE, L as LibraryNotFoundError, h as VersionNotFoundError, i as SPLITTER_MIN_CHUNK_SIZE } from "./index.js";
13
+ import "cheerio";
14
+ import "node:vm";
15
+ import "jsdom";
16
+ import "playwright";
17
+ import "@joplin/turndown-plugin-gfm";
18
+ import "iconv-lite";
19
+ import Database from "better-sqlite3";
20
+ import * as sqliteVec from "sqlite-vec";
21
/**
 * Base class for all errors raised by the content splitters, allowing
 * callers to catch any splitting failure via `instanceof SplitterError`.
 */
class SplitterError extends Error {
  constructor(...args) {
    super(...args);
    // Use the concrete subclass name so logs and stack traces identify the
    // actual error type instead of the generic "Error".
    this.name = this.constructor.name;
  }
}
23
/**
 * Raised when a single indivisible unit (a line, table row, or word) is
 * already larger than the maximum allowed chunk size, so no further
 * splitting is possible.
 */
class MinimumChunkSizeError extends SplitterError {
  constructor(size, maxSize) {
    super(
      `Cannot split content any further. Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`
    );
    // Expose the offending sizes so callers can react programmatically
    // instead of parsing the message string.
    this.size = size;
    this.maxSize = maxSize;
  }
}
30
/**
 * Raised when splitting a section's content fails for a reason other than an
 * oversized indivisible unit (MinimumChunkSizeError, which callers handle
 * with a fallback splitter instead).
 */
class ContentSplitterError extends SplitterError {
}
32
/**
 * Wraps a base document splitter and greedily merges its output into chunks
 * that respect a minimum and a preferred size, while still breaking at major
 * (H1/H2) section boundaries to preserve document structure.
 */
class GreedySplitter {
  baseSplitter;
  minChunkSize;
  preferredChunkSize;

  /**
   * @param baseSplitter Performs the initial semantic split.
   * @param minChunkSize Chunks below this size are merge candidates.
   * @param preferredChunkSize Upper bound for a merged chunk's content length.
   */
  constructor(baseSplitter, minChunkSize, preferredChunkSize) {
    this.baseSplitter = baseSplitter;
    this.minChunkSize = minChunkSize;
    this.preferredChunkSize = preferredChunkSize;
  }

  /**
   * Splits `markdown` via the base splitter, then greedily concatenates
   * adjacent chunks until they reach the minimum size, flushing early when
   * the preferred size would be exceeded or a major section starts.
   */
  async splitText(markdown) {
    const pieces = await this.baseSplitter.splitText(markdown);
    const merged = [];
    let pending = null;
    for (const piece of pieces) {
      if (pending === null) {
        pending = this.cloneChunk(piece);
        continue;
      }
      // Flush when appending would overflow, or when the pending chunk is
      // already big enough and the next piece opens a major section.
      const mustFlush =
        this.wouldExceedMaxSize(pending, piece) ||
        (pending.content.length >= this.minChunkSize &&
          this.startsNewMajorSection(piece));
      if (mustFlush) {
        merged.push(pending);
        pending = this.cloneChunk(piece);
        continue;
      }
      pending.content = `${pending.content}\n${piece.content}`;
      pending.section = this.mergeSectionInfo(pending, piece);
      pending.types = this.mergeTypes(pending.types, piece.types);
    }
    if (pending !== null) {
      merged.push(pending);
    }
    return merged;
  }

  /** Copies a chunk so merging never mutates the base splitter's output. */
  cloneChunk(chunk) {
    const { level, path } = chunk.section;
    return {
      types: chunk.types.slice(),
      content: chunk.content,
      section: { level, path: path.slice() },
    };
  }

  /** H1/H2 headings mark major conceptual breaks that should stay separate. */
  startsNewMajorSection(chunk) {
    return [1, 2].includes(chunk.section.level);
  }

  /** True when appending `nextChunk` would push past the preferred size. */
  wouldExceedMaxSize(currentChunk, nextChunk) {
    if (!currentChunk) {
      return false;
    }
    const combined = currentChunk.content.length + nextChunk.content.length;
    return combined > this.preferredChunkSize;
  }

  /** True when `parentPath` is a strict prefix of `childPath`. */
  isPathIncluded(parentPath, childPath) {
    return (
      parentPath.length < childPath.length &&
      parentPath.every((segment, i) => segment === childPath[i])
    );
  }

  /**
   * Merges section metadata for two chunks being concatenated:
   * - level: the lowest (most general) of the two;
   * - path: the deeper path for parent/child pairs, otherwise the common
   *   prefix (root `[]` when the paths share nothing).
   */
  mergeSectionInfo(currentChunk, nextChunk) {
    const a = currentChunk.section;
    const b = nextChunk.section;
    const level = Math.min(a.level, b.level);
    const samePath =
      a.path.length === b.path.length && a.path.every((p, i) => p === b.path[i]);
    if (a.level === b.level && samePath) {
      return a;
    }
    if (this.isPathIncluded(a.path, b.path)) {
      return { path: b.path, level };
    }
    if (this.isPathIncluded(b.path, a.path)) {
      return { path: a.path, level };
    }
    return { path: this.findCommonPrefix(a.path, b.path), level };
  }

  /** Order-preserving union of both type lists, duplicates removed. */
  mergeTypes(currentTypes, nextTypes) {
    return Array.from(new Set([...currentTypes, ...nextTypes]));
  }

  /** Returns the longest shared leading segment sequence of two paths. */
  findCommonPrefix(path1, path2) {
    const limit = Math.min(path1.length, path2.length);
    const prefix = [];
    for (let i = 0; i < limit && path1[i] === path2[i]; i++) {
      prefix.push(path1[i]);
    }
    return prefix;
  }
}
167
/** Strips all leading and trailing whitespace (spaces, tabs, CR/LF, etc.). */
const fullTrim = (str) => str.replace(/^\s+|\s+$/g, "");
170
/**
 * Splits fenced markdown code blocks line-by-line into chunks no larger than
 * `options.chunkSize`, re-wrapping every chunk in the original code fence so
 * each chunk remains valid fenced markdown.
 */
class CodeContentSplitter {
  constructor(options) {
    this.options = options;
  }

  /**
   * Splits `content` (a fenced code block) into size-bounded chunks.
   * @throws {MinimumChunkSizeError} when a single wrapped line exceeds the limit.
   */
  async split(content) {
    const language = content.match(/^```(\w+)\n/)?.[1];
    const body = content.replace(/^```(\w*)\n/, "").replace(/```\s*$/, "");
    const chunks = [];
    let buffer = [];
    for (const line of body.split("\n")) {
      // A single line that cannot fit even alone is unsplittable.
      const wrappedLine = this.wrap(line, language);
      if (wrappedLine.length > this.options.chunkSize) {
        throw new MinimumChunkSizeError(wrappedLine.length, this.options.chunkSize);
      }
      buffer.push(line);
      const candidate = this.wrap(buffer.join("\n"), language);
      if (candidate.length > this.options.chunkSize && buffer.length > 1) {
        // Overflow: emit everything before this line, start a new buffer.
        const overflow = buffer.pop();
        chunks.push(this.wrap(buffer.join("\n"), language));
        buffer = [overflow];
      }
    }
    if (buffer.length > 0) {
      chunks.push(this.wrap(buffer.join("\n"), language));
    }
    return chunks;
  }

  /** Re-wraps `content` in a code fence, dropping trailing newlines. */
  wrap(content, language) {
    return `\`\`\`${language || ""}\n${content.replace(/\n+$/, "")}\n\`\`\``;
  }
}
205
/**
 * Splits a markdown table into size-bounded chunks that each repeat the
 * header and separator rows, so every chunk remains a valid, self-describing
 * markdown table.
 */
class TableContentSplitter {
  constructor(options) {
    this.options = options;
  }

  /**
   * Splits the table row-by-row; content that does not parse as a table is
   * returned unchanged as a single chunk.
   * @throws {MinimumChunkSizeError} when one row plus header exceeds the limit.
   */
  async split(content) {
    const parsed = this.parseTable(content);
    if (parsed === null) {
      return [content];
    }
    const { headers, rows } = parsed;
    const chunks = [];
    let buffered = [];
    for (const row of rows) {
      // A row that cannot fit even alone (with header) is unsplittable.
      const aloneSize = this.wrap(row, headers).length;
      if (aloneSize > this.options.chunkSize) {
        throw new MinimumChunkSizeError(aloneSize, this.options.chunkSize);
      }
      const candidate = this.wrap([...buffered, row].join("\n"), headers);
      if (candidate.length > this.options.chunkSize && buffered.length > 0) {
        chunks.push(this.wrap(buffered.join("\n"), headers));
        buffered = [row];
      } else {
        buffered.push(row);
      }
    }
    if (buffered.length > 0) {
      chunks.push(this.wrap(buffered.join("\n"), headers));
    }
    return chunks;
  }

  /** Re-attaches the header and separator rows above `content`. */
  wrap(content, headers) {
    const headerLine = `| ${headers.join(" | ")} |`;
    const dividerLine = `|${headers.map(() => "---").join("|")}|`;
    return [headerLine, dividerLine, content].join("\n");
  }

  /** Parses header, separator, and data rows; null when not a valid table. */
  parseTable(content) {
    const lines = content.trim().split("\n");
    if (lines.length < 3) {
      return null;
    }
    const headers = this.parseRow(lines[0]);
    if (!headers) {
      return null;
    }
    const separator = lines[1];
    if (!this.isValidSeparator(separator)) {
      return null;
    }
    const rows = lines.slice(2).filter((row) => row.trim() !== "");
    return { headers, separator, rows };
  }

  /** Extracts trimmed, non-empty cell values from a pipe-delimited row. */
  parseRow(row) {
    if (!row.includes("|")) {
      return null;
    }
    return row
      .split("|")
      .map((cell) => cell.trim())
      .filter((cell) => cell !== "");
  }

  /** A separator row contains pipes and only dashes, pipes, and whitespace. */
  isValidSeparator(separator) {
    return separator.includes("|") && /^\|?[\s-|]+\|?$/.test(separator);
  }
}
268
/**
 * Splits plain text into chunks no larger than `options.chunkSize`,
 * preferring paragraph boundaries, then line boundaries, and falling back to
 * word-level splitting only as a last resort.
 */
class TextContentSplitter {
  constructor(options) {
    this.options = options;
  }

  /**
   * Splits `content` while preserving the most meaningful boundaries possible.
   * @throws {MinimumChunkSizeError} when a single word exceeds the chunk size.
   */
  async split(content) {
    const text = fullTrim(content);
    if (text.length <= this.options.chunkSize) {
      return [text];
    }
    // A single word longer than the limit can never fit into any chunk.
    const longestWord = text
      .split(/\s+/)
      .reduce((best, word) => (word.length > best.length ? word : best));
    if (longestWord.length > this.options.chunkSize) {
      throw new MinimumChunkSizeError(longestWord.length, this.options.chunkSize);
    }
    const paragraphs = this.splitByParagraphs(text);
    if (this.areChunksValid(paragraphs)) {
      return paragraphs;
    }
    const lines = this.splitByLines(text);
    if (this.areChunksValid(lines)) {
      return this.mergeChunks(lines, "\n");
    }
    const words = await this.splitByWords(text);
    return this.mergeChunks(words, " ");
  }

  /** True when every chunk fits within the configured size. */
  areChunksValid(chunks) {
    return chunks.every((chunk) => chunk.length <= this.options.chunkSize);
  }

  /** Splits on blank lines; drops fragments of 2 characters or fewer. */
  splitByParagraphs(text) {
    return text
      .split(/\n\s*\n/)
      .map((p) => fullTrim(p))
      .filter(Boolean)
      .filter((chunk) => chunk.length > 2);
  }

  /** Splits on single newlines; drops fragments of 1 character or fewer. */
  splitByLines(text) {
    return text
      .split(/\n/)
      .map((line) => fullTrim(line))
      .filter(Boolean)
      .filter((chunk) => chunk.length > 1);
  }

  /** Last resort: LangChain's recursive splitter at word granularity. */
  async splitByWords(text) {
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: this.options.chunkSize,
      chunkOverlap: 0
    });
    return splitter.splitText(text);
  }

  /**
   * Greedily joins adjacent chunks with `separator` while the combined
   * length stays within the configured chunk size, reducing fragmentation.
   */
  mergeChunks(chunks, separator) {
    const merged = [];
    let current = null;
    for (const next of chunks) {
      if (current === null) {
        current = next;
      } else if (
        this.getChunkSize(current) + this.getChunkSize(next) + separator.length <=
        this.options.chunkSize
      ) {
        current = `${current}${separator}${next}`;
      } else {
        merged.push(current);
        current = next;
      }
    }
    if (current) {
      merged.push(current);
    }
    return merged;
  }

  /** Chunk size metric: raw string length. */
  getChunkSize(chunk) {
    return chunk.length;
  }

  /** Text chunks need no wrapper; returned unchanged. */
  wrap(content) {
    return content;
  }
}
363
/**
 * Splits markdown documents into semantically coherent, size-bounded chunks.
 *
 * Pipeline: markdown -> HTML (remark) -> DOM (JSDOM) -> heading/code/table
 * sections -> per-type content splitters. Text uses the preferred chunk size;
 * code and tables use the (larger) max chunk size since they are harder to
 * split without breaking structure.
 */
class SemanticMarkdownSplitter {
  /**
   * @param preferredChunkSize Target size for text chunks.
   * @param maxChunkSize Hard upper bound, used for code/table chunks and the
   *   fallback splitter.
   */
  constructor(preferredChunkSize, maxChunkSize) {
    this.preferredChunkSize = preferredChunkSize;
    this.maxChunkSize = maxChunkSize;
    this.turndownService = new TurndownService({
      headingStyle: "atx",
      hr: "---",
      bulletListMarker: "-",
      codeBlockStyle: "fenced",
      emDelimiter: "_",
      strongDelimiter: "**",
      linkStyle: "inlined"
    });
    // Custom table rule: turndown's default table handling is replaced with a
    // plain pipe-markdown rendering built from th/td text content.
    this.turndownService.addRule("table", {
      filter: ["table"],
      replacement: (_content, node) => {
        // NOTE(review): `node` is assumed to be a table element with
        // querySelectorAll support — turndown passes DOM nodes here.
        const table = node;
        const headers = Array.from(table.querySelectorAll("th")).map(
          (th) => th.textContent?.trim() || ""
        );
        // Data rows are all <tr> elements that contain no <th>.
        const rows = Array.from(table.querySelectorAll("tr")).filter(
          (tr) => !tr.querySelector("th")
        );
        if (headers.length === 0 && rows.length === 0) return "";
        let markdown = "\n";
        if (headers.length > 0) {
          markdown += `| ${headers.join(" | ")} |
`;
          markdown += `|${headers.map(() => "---").join("|")}|
`;
        }
        for (const row of rows) {
          const cells = Array.from(row.querySelectorAll("td")).map(
            (td) => td.textContent?.trim() || ""
          );
          markdown += `| ${cells.join(" | ")} |
`;
        }
        return markdown;
      }
    });
    this.textSplitter = new TextContentSplitter({
      chunkSize: this.preferredChunkSize
    });
    this.codeSplitter = new CodeContentSplitter({
      chunkSize: this.maxChunkSize
    });
    this.tableSplitter = new TableContentSplitter({
      chunkSize: this.maxChunkSize
    });
  }
  turndownService;
  textSplitter;
  codeSplitter;
  tableSplitter;
  /**
   * Main entry point for splitting markdown content.
   * @returns Chunks with `types`, `content`, and `section` metadata.
   */
  async splitText(markdown) {
    const html = await this.markdownToHtml(markdown);
    const dom = await this.parseHtml(html);
    const sections = await this.splitIntoSections(dom);
    return this.splitSectionContent(sections);
  }
  /**
   * Step 1: Split document into sections based on H1-H6 headings,
   * as well as code blocks and tables. Each section records its heading
   * level and the path of heading titles leading to it.
   */
  async splitIntoSections(dom) {
    const body = dom.querySelector("body");
    if (!body) {
      throw new Error("Invalid HTML structure: no body element found");
    }
    let currentSection = this.createRootSection();
    const sections = [];
    // Stack of open heading sections; index 0 is the synthetic root.
    const stack = [currentSection];
    for (const element of Array.from(body.children)) {
      const headingMatch = element.tagName.match(/H([1-6])/);
      if (headingMatch) {
        const level = Number.parseInt(headingMatch[1], 10);
        const title = fullTrim(element.textContent || "");
        // Pop headings at the same or deeper level; they are now closed.
        while (stack.length > 1 && stack[stack.length - 1].level >= level) {
          stack.pop();
        }
        currentSection = {
          level,
          // Path = last title of each still-open ancestor, plus this title.
          path: [
            ...stack.slice(1).reduce((acc, s) => {
              const lastPath = s.path[s.path.length - 1];
              if (lastPath) acc.push(lastPath);
              return acc;
            }, []),
            title
          ],
          content: [
            {
              type: "heading",
              text: `${"#".repeat(level)} ${title}`
            }
          ]
        };
        sections.push(currentSection);
        stack.push(currentSection);
      } else if (element.tagName === "PRE") {
        // Code block: reconstruct a fenced markdown block from the <pre>/<code>.
        const code = element.querySelector("code");
        const language = code?.className.replace("language-", "") || "";
        const content = code?.textContent || element.textContent || "";
        const markdown = `${"```"}${language}
${content}
${"```"}`;
        currentSection = {
          level: currentSection.level,
          path: currentSection.path,
          content: [
            {
              type: "code",
              text: markdown
            }
          ]
        };
        sections.push(currentSection);
      } else if (element.tagName === "TABLE") {
        // Table: convert via the custom turndown table rule above.
        const markdown = fullTrim(this.turndownService.turndown(element.outerHTML));
        currentSection = {
          level: currentSection.level,
          path: currentSection.path,
          content: [
            {
              type: "table",
              text: markdown
            }
          ]
        };
        sections.push(currentSection);
      } else {
        // Any other element becomes plain text markdown (skipped if empty).
        const markdown = fullTrim(this.turndownService.turndown(element.innerHTML));
        if (markdown) {
          currentSection = {
            level: currentSection.level,
            path: currentSection.path,
            content: [
              {
                type: "text",
                text: markdown
              }
            ]
          };
          sections.push(currentSection);
        }
      }
    }
    return sections;
  }
  /**
   * Step 2: Split section content into smaller chunks using the per-type
   * splitters. When a splitter reports an unsplittable unit
   * (MinimumChunkSizeError), falls back to an aggressive character-level
   * splitter; any other error is wrapped in ContentSplitterError.
   */
  async splitSectionContent(sections) {
    const chunks = [];
    for (const section of sections) {
      for (const content of section.content) {
        let splitContent = [];
        try {
          switch (content.type) {
            case "heading":
            case "text": {
              splitContent = await this.textSplitter.split(content.text);
              break;
            }
            case "code": {
              splitContent = await this.codeSplitter.split(content.text);
              break;
            }
            case "table": {
              splitContent = await this.tableSplitter.split(content.text);
              break;
            }
          }
        } catch (err) {
          if (err instanceof MinimumChunkSizeError) {
            logger.warn(
              `⚠ Cannot split ${content.type} chunk normally, using RecursiveCharacterTextSplitter: ${err.message}`
            );
            const splitter = new RecursiveCharacterTextSplitter({
              chunkSize: this.maxChunkSize,
              chunkOverlap: Math.min(20, Math.floor(this.maxChunkSize * 0.1)),
              // Use more aggressive separators including empty string as last resort
              separators: [
                "\n\n",
                "\n",
                " ",
                " ",
                ".",
                ",",
                ";",
                ":",
                "-",
                "(",
                ")",
                "[",
                "]",
                "{",
                "}",
                ""
              ]
            });
            const chunks2 = await splitter.splitText(content.text);
            if (chunks2.length === 0) {
              // Absolute last resort: hard-truncate to the maximum size.
              splitContent = [content.text.substring(0, this.maxChunkSize)];
            } else {
              splitContent = chunks2;
            }
          } else {
            const errMessage = err instanceof Error ? err.message : String(err);
            throw new ContentSplitterError(
              `Failed to split ${content.type} content: ${errMessage}`
            );
          }
        }
        // Every resulting chunk carries its section's level and path.
        chunks.push(
          ...splitContent.map(
            (text) => ({
              types: [content.type],
              content: text,
              section: {
                level: section.level,
                path: section.path
              }
            })
          )
        );
      }
    }
    return chunks;
  }
  /**
   * Helper to create the root section (level 0, empty path, no content).
   */
  createRootSection() {
    return {
      level: 0,
      path: [],
      content: []
    };
  }
  /**
   * Convert markdown to HTML using remark (with GFM support), wrapped in a
   * minimal HTML document so a <body> element always exists.
   */
  async markdownToHtml(markdown) {
    const html = await unified().use(remarkParse).use(remarkGfm).use(remarkHtml).process(markdown);
    return `<!DOCTYPE html>
<html>
<body>
${String(html)}
</body>
</html>`;
  }
  /**
   * Parse HTML into a DOM document via JSDOM.
   */
  async parseHtml(html) {
    const { window } = createJSDOM(html);
    return window.document;
  }
}
627
// Maximum number of child chunks pulled in when expanding a search hit.
const CHILD_LIMIT = 5;
// Maximum number of sibling chunks pulled in per side (preceding/subsequent).
const SIBLING_LIMIT = 2;
629
/**
 * Searches the document store and widens each hit with surrounding context
 * (parent, sibling, and child chunks), then aggregates the fetched chunks
 * per source URL.
 */
class DocumentRetrieverService {
  documentStore;

  constructor(documentStore) {
    this.documentStore = documentStore;
  }

  /**
   * Gathers the ids of every chunk related to an initial search hit: the hit
   * itself, its parent, up to `siblingLimit` siblings on either side, and up
   * to `childLimit` children.
   * @returns An object with url, hitId, relatedIds (Set), and score.
   */
  async getRelatedChunkIds(library, version, doc, siblingLimit = SIBLING_LIMIT, childLimit = CHILD_LIMIT) {
    const hitId = doc.id;
    const { url, score } = doc.metadata;
    const relatedIds = new Set([hitId]);
    const parent = await this.documentStore.findParentChunk(library, version, hitId);
    if (parent) {
      relatedIds.add(parent.id);
    }
    const preceding = await this.documentStore.findPrecedingSiblingChunks(
      library,
      version,
      hitId,
      siblingLimit
    );
    const children = await this.documentStore.findChildChunks(
      library,
      version,
      hitId,
      childLimit
    );
    const subsequent = await this.documentStore.findSubsequentSiblingChunks(
      library,
      version,
      hitId,
      siblingLimit
    );
    for (const related of [...preceding, ...children, ...subsequent]) {
      relatedIds.add(related.id);
    }
    return { url, hitId, relatedIds, score };
  }

  /**
   * Buckets related-chunk info by URL, merging id sets and tracking the
   * highest score seen for each URL.
   */
  groupAndPrepareFetch(relatedInfos) {
    const urlMap = new Map();
    for (const info of relatedInfos) {
      let entry = urlMap.get(info.url);
      if (entry === undefined) {
        entry = { uniqueChunkIds: new Set(), maxScore: info.score };
        urlMap.set(info.url, entry);
      }
      for (const id of info.relatedIds) {
        entry.uniqueChunkIds.add(id);
      }
      if (info.score > entry.maxScore) {
        entry.maxScore = info.score;
      }
    }
    return urlMap;
  }

  /**
   * Loads all chunks for one URL group and joins their content into a single
   * result entry carrying the group's best score.
   */
  async finalizeResult(library, version, url, uniqueChunkIds, maxScore) {
    const docs = await this.documentStore.findChunksByIds(
      library,
      version,
      Array.from(uniqueChunkIds)
    );
    const content = docs.map((doc) => doc.pageContent).join("\n\n");
    return { url, content, score: maxScore };
  }

  /**
   * Searches for documents and expands the context around the matches.
   * @param library The library name.
   * @param version The library version (empty/undefined matches unversioned docs).
   * @param query The search query.
   * @param limit Optional cap on the number of initial hits (defaults to 10).
   * @returns One aggregated result per distinct URL.
   */
  async search(library, version, query, limit) {
    const normalizedVersion = (version ?? "").toLowerCase();
    const hits = await this.documentStore.findByContent(
      library,
      normalizedVersion,
      query,
      limit ?? 10
    );
    const relatedInfos = await Promise.all(
      hits.map((doc) => this.getRelatedChunkIds(library, normalizedVersion, doc))
    );
    const results = [];
    for (const [url, { uniqueChunkIds, maxScore }] of this.groupAndPrepareFetch(relatedInfos)) {
      results.push(
        await this.finalizeResult(library, normalizedVersion, url, uniqueChunkIds, maxScore)
      );
    }
    return results;
  }
}
748
+ class DocumentStore {
749
+ db;
750
+ embeddings;
751
+ dbDimension = VECTOR_DIMENSION;
752
+ modelDimension;
753
+ statements;
754
+ /**
755
+ * Calculates Reciprocal Rank Fusion score for a result
756
+ */
757
+ calculateRRF(vecRank, ftsRank, k = 60) {
758
+ let rrf = 0;
759
+ if (vecRank !== void 0) {
760
+ rrf += 1 / (k + vecRank);
761
+ }
762
+ if (ftsRank !== void 0) {
763
+ rrf += 1 / (k + ftsRank);
764
+ }
765
+ return rrf;
766
+ }
767
+ /**
768
+ * Assigns ranks to search results based on their scores
769
+ */
770
+ assignRanks(results) {
771
+ const vecRanks = /* @__PURE__ */ new Map();
772
+ const ftsRanks = /* @__PURE__ */ new Map();
773
+ results.filter((r) => r.vec_score !== void 0).sort((a, b) => (b.vec_score ?? 0) - (a.vec_score ?? 0)).forEach((result, index) => {
774
+ vecRanks.set(Number(result.id), index + 1);
775
+ });
776
+ results.filter((r) => r.fts_score !== void 0).sort((a, b) => (b.fts_score ?? 0) - (a.fts_score ?? 0)).forEach((result, index) => {
777
+ ftsRanks.set(Number(result.id), index + 1);
778
+ });
779
+ return results.map((result) => ({
780
+ ...result,
781
+ vec_rank: vecRanks.get(Number(result.id)),
782
+ fts_rank: ftsRanks.get(Number(result.id)),
783
+ rrf_score: this.calculateRRF(
784
+ vecRanks.get(Number(result.id)),
785
+ ftsRanks.get(Number(result.id))
786
+ )
787
+ }));
788
+ }
789
+ constructor(dbPath) {
790
+ if (!dbPath) {
791
+ throw new StoreError("Missing required database path");
792
+ }
793
+ this.db = new Database(dbPath);
794
+ }
795
+ /**
796
+ * Sets up prepared statements for database queries
797
+ */
798
+ prepareStatements() {
799
+ const statements = {
800
+ getById: this.db.prepare("SELECT * FROM documents WHERE id = ?"),
801
+ insertDocument: this.db.prepare(
802
+ "INSERT INTO documents (library_id, version_id, url, content, metadata, sort_order, indexed_at) VALUES (?, ?, ?, ?, ?, ?, ?)"
803
+ ),
804
+ insertEmbedding: this.db.prepare(
805
+ "INSERT INTO documents_vec (rowid, library_id, version_id, embedding) VALUES (?, ?, ?, ?)"
806
+ ),
807
+ insertLibrary: this.db.prepare(
808
+ "INSERT INTO libraries (name) VALUES (?) ON CONFLICT(name) DO NOTHING"
809
+ ),
810
+ getLibraryIdByName: this.db.prepare(
811
+ "SELECT id FROM libraries WHERE name = ?"
812
+ ),
813
+ // New version-related statements
814
+ insertVersion: this.db.prepare(
815
+ "INSERT INTO versions (library_id, name, status) VALUES (?, ?, 'not_indexed') ON CONFLICT(library_id, name) DO NOTHING"
816
+ ),
817
+ resolveVersionId: this.db.prepare(
818
+ "SELECT id FROM versions WHERE library_id = ? AND name IS ?"
819
+ ),
820
+ getVersionById: this.db.prepare("SELECT * FROM versions WHERE id = ?"),
821
+ queryVersionsByLibraryId: this.db.prepare(
822
+ "SELECT * FROM versions WHERE library_id = ? ORDER BY name"
823
+ ),
824
+ deleteLibraryDocuments: this.db.prepare(
825
+ `DELETE FROM documents
826
+ WHERE library_id = (SELECT id FROM libraries WHERE name = ?)
827
+ AND version_id = (
828
+ SELECT v.id FROM versions v
829
+ WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
830
+ AND COALESCE(v.name, '') = COALESCE(?, '')
831
+ )`
832
+ ),
833
+ deleteDocuments: this.db.prepare(
834
+ `DELETE FROM documents
835
+ WHERE library_id = (SELECT id FROM libraries WHERE name = ?)
836
+ AND version_id = (
837
+ SELECT v.id FROM versions v
838
+ WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
839
+ AND COALESCE(v.name, '') = COALESCE(?, '')
840
+ )`
841
+ ),
842
+ deleteDocumentsByUrl: this.db.prepare(
843
+ `DELETE FROM documents
844
+ WHERE url = ?
845
+ AND library_id = (SELECT id FROM libraries WHERE name = ?)
846
+ AND version_id = (
847
+ SELECT v.id FROM versions v
848
+ WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
849
+ AND COALESCE(v.name, '') = COALESCE(?, '')
850
+ )`
851
+ ),
852
+ getDocumentBySort: this.db.prepare(
853
+ `SELECT d.id
854
+ FROM documents d
855
+ JOIN versions v ON d.version_id = v.id
856
+ JOIN libraries l ON v.library_id = l.id
857
+ WHERE l.name = ?
858
+ AND COALESCE(v.name, '') = COALESCE(?, '')
859
+ LIMIT 1`
860
+ ),
861
+ queryVersions: this.db.prepare(
862
+ `SELECT DISTINCT v.name
863
+ FROM versions v
864
+ JOIN libraries l ON v.library_id = l.id
865
+ WHERE l.name = ?
866
+ ORDER BY v.name`
867
+ ),
868
+ checkExists: this.db.prepare(
869
+ `SELECT d.id FROM documents d
870
+ JOIN versions v ON d.version_id = v.id
871
+ JOIN libraries l ON v.library_id = l.id
872
+ WHERE l.name = ?
873
+ AND COALESCE(v.name, '') = COALESCE(?, '')
874
+ LIMIT 1`
875
+ ),
876
+ // Library/version aggregation including versions without documents and status/progress fields
877
+ queryLibraryVersions: this.db.prepare(
878
+ `SELECT
879
+ l.name as library,
880
+ COALESCE(v.name, '') as version,
881
+ v.id as versionId,
882
+ v.status as status,
883
+ v.progress_pages as progressPages,
884
+ v.progress_max_pages as progressMaxPages,
885
+ v.source_url as sourceUrl,
886
+ MIN(d.indexed_at) as indexedAt,
887
+ COUNT(d.id) as documentCount,
888
+ COUNT(DISTINCT d.url) as uniqueUrlCount
889
+ FROM versions v
890
+ JOIN libraries l ON v.library_id = l.id
891
+ LEFT JOIN documents d ON d.version_id = v.id
892
+ GROUP BY v.id
893
+ ORDER BY l.name, version`
894
+ ),
895
+ getChildChunks: this.db.prepare(`
896
+ SELECT d.* FROM documents d
897
+ JOIN versions v ON d.version_id = v.id
898
+ JOIN libraries l ON v.library_id = l.id
899
+ WHERE l.name = ?
900
+ AND COALESCE(v.name, '') = COALESCE(?, '')
901
+ AND d.url = ?
902
+ AND json_array_length(json_extract(d.metadata, '$.path')) = ?
903
+ AND json_extract(d.metadata, '$.path') LIKE ? || '%'
904
+ AND d.sort_order > (SELECT sort_order FROM documents WHERE id = ?)
905
+ ORDER BY d.sort_order
906
+ LIMIT ?
907
+ `),
908
+ getPrecedingSiblings: this.db.prepare(`
909
+ SELECT d.* FROM documents d
910
+ JOIN versions v ON d.version_id = v.id
911
+ JOIN libraries l ON v.library_id = l.id
912
+ WHERE l.name = ?
913
+ AND COALESCE(v.name, '') = COALESCE(?, '')
914
+ AND d.url = ?
915
+ AND d.sort_order < (SELECT sort_order FROM documents WHERE id = ?)
916
+ AND json_extract(d.metadata, '$.path') = ?
917
+ ORDER BY d.sort_order DESC
918
+ LIMIT ?
919
+ `),
920
+ getSubsequentSiblings: this.db.prepare(`
921
+ SELECT d.* FROM documents d
922
+ JOIN versions v ON d.version_id = v.id
923
+ JOIN libraries l ON v.library_id = l.id
924
+ WHERE l.name = ?
925
+ AND COALESCE(v.name, '') = COALESCE(?, '')
926
+ AND d.url = ?
927
+ AND d.sort_order > (SELECT sort_order FROM documents WHERE id = ?)
928
+ AND json_extract(d.metadata, '$.path') = ?
929
+ ORDER BY d.sort_order
930
+ LIMIT ?
931
+ `),
932
+ getParentChunk: this.db.prepare(`
933
+ SELECT d.* FROM documents d
934
+ JOIN versions v ON d.version_id = v.id
935
+ JOIN libraries l ON v.library_id = l.id
936
+ WHERE l.name = ?
937
+ AND COALESCE(v.name, '') = COALESCE(?, '')
938
+ AND d.url = ?
939
+ AND json_extract(d.metadata, '$.path') = ?
940
+ AND d.sort_order < (SELECT sort_order FROM documents WHERE id = ?)
941
+ ORDER BY d.sort_order DESC
942
+ LIMIT 1
943
+ `),
944
+ // Status tracking statements
945
+ updateVersionStatus: this.db.prepare(
946
+ "UPDATE versions SET status = ?, error_message = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
947
+ ),
948
+ updateVersionProgress: this.db.prepare(
949
+ "UPDATE versions SET progress_pages = ?, progress_max_pages = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
950
+ ),
951
+ getVersionsByStatus: this.db.prepare(
952
+ "SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.status IN (SELECT value FROM json_each(?))"
953
+ ),
954
+ // Scraper options statements
955
+ updateVersionScraperOptions: this.db.prepare(
956
+ "UPDATE versions SET source_url = ?, scraper_options = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
957
+ ),
958
+ getVersionWithOptions: this.db.prepare(
959
+ "SELECT * FROM versions WHERE id = ?"
960
+ ),
961
+ getVersionsBySourceUrl: this.db.prepare(
962
+ "SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.source_url = ? ORDER BY v.created_at DESC"
963
+ )
964
+ };
965
+ this.statements = statements;
966
+ }
967
+ /**
968
+ * Pads a vector to the fixed database dimension by appending zeros.
969
+ * Throws an error if the input vector is longer than the database dimension.
970
+ */
971
+ padVector(vector) {
972
+ if (vector.length > this.dbDimension) {
973
+ throw new Error(
974
+ `Vector dimension ${vector.length} exceeds database dimension ${this.dbDimension}`
975
+ );
976
+ }
977
+ if (vector.length === this.dbDimension) {
978
+ return vector;
979
+ }
980
+ return [...vector, ...new Array(this.dbDimension - vector.length).fill(0)];
981
+ }
982
+ /**
983
+ * Initializes embeddings client using environment variables for configuration.
984
+ *
985
+ * The embedding model is configured using DOCS_MCP_EMBEDDING_MODEL environment variable.
986
+ * Format: "provider:model_name" (e.g., "google:text-embedding-004") or just "model_name"
987
+ * for OpenAI (default).
988
+ *
989
+ * Supported providers and their required environment variables:
990
+ * - openai: OPENAI_API_KEY (and optionally OPENAI_API_BASE, OPENAI_ORG_ID)
991
+ * - google: GOOGLE_APPLICATION_CREDENTIALS (path to service account JSON)
992
+ * - aws: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION (or BEDROCK_AWS_REGION)
993
+ * - microsoft: Azure OpenAI credentials (AZURE_OPENAI_API_*)
994
+ */
995
+ async initializeEmbeddings() {
996
+ const modelSpec = process.env.DOCS_MCP_EMBEDDING_MODEL || "text-embedding-3-small";
997
+ const { createEmbeddingModel } = await import("./EmbeddingFactory-CElwVk3X.js");
998
+ this.embeddings = createEmbeddingModel(modelSpec);
999
+ const testVector = await this.embeddings.embedQuery("test");
1000
+ this.modelDimension = testVector.length;
1001
+ if (this.modelDimension > this.dbDimension) {
1002
+ throw new DimensionError(modelSpec, this.modelDimension, this.dbDimension);
1003
+ }
1004
+ }
1005
+ /**
1006
+ * Escapes a query string for use with SQLite FTS5 MATCH operator.
1007
+ * Wraps the query in double quotes and escapes internal double quotes.
1008
+ */
1009
+ escapeFtsQuery(query) {
1010
+ const escapedQuotes = query.replace(/"/g, '""');
1011
+ return `"${escapedQuotes}"`;
1012
+ }
1013
+ /**
1014
+ * Initializes database connection and ensures readiness
1015
+ */
1016
+ async initialize() {
1017
+ try {
1018
+ sqliteVec.load(this.db);
1019
+ applyMigrations(this.db);
1020
+ this.prepareStatements();
1021
+ await this.initializeEmbeddings();
1022
+ } catch (error) {
1023
+ if (error instanceof StoreError) {
1024
+ throw error;
1025
+ }
1026
+ throw new ConnectionError("Failed to initialize database connection", error);
1027
+ }
1028
+ }
1029
+ /**
1030
+ * Gracefully closes database connections
1031
+ */
1032
+ async shutdown() {
1033
+ this.db.close();
1034
+ }
1035
+ /**
1036
+ * Resolves a library name and version string to library_id and version_id.
1037
+ * Creates library and version records if they don't exist.
1038
+ */
1039
+ async resolveLibraryAndVersionIds(library, version) {
1040
+ const normalizedLibrary = library.toLowerCase();
1041
+ const normalizedVersion = denormalizeVersionName(version.toLowerCase());
1042
+ this.statements.insertLibrary.run(normalizedLibrary);
1043
+ const libraryIdRow = this.statements.getLibraryIdByName.get(normalizedLibrary);
1044
+ if (!libraryIdRow || typeof libraryIdRow.id !== "number") {
1045
+ throw new StoreError(`Failed to resolve library_id for library: ${library}`);
1046
+ }
1047
+ const libraryId = libraryIdRow.id;
1048
+ this.statements.insertVersion.run(libraryId, normalizedVersion);
1049
+ const versionIdRow = this.statements.resolveVersionId.get(
1050
+ libraryId,
1051
+ normalizedVersion === null ? "" : normalizedVersion
1052
+ );
1053
+ if (!versionIdRow || typeof versionIdRow.id !== "number") {
1054
+ throw new StoreError(
1055
+ `Failed to resolve version_id for library: ${library}, version: ${version}`
1056
+ );
1057
+ }
1058
+ return { libraryId, versionId: versionIdRow.id };
1059
+ }
1060
+ /**
1061
+ * Retrieves all unique versions for a specific library
1062
+ */
1063
+ async queryUniqueVersions(library) {
1064
+ try {
1065
+ const rows = this.statements.queryVersions.all(library.toLowerCase());
1066
+ return rows.map((row) => normalizeVersionName(row.name));
1067
+ } catch (error) {
1068
+ throw new ConnectionError("Failed to query versions", error);
1069
+ }
1070
+ }
1071
+ /**
1072
+ * Updates the status of a version record in the database.
1073
+ * @param versionId The version ID to update
1074
+ * @param status The new status to set
1075
+ * @param errorMessage Optional error message for failed statuses
1076
+ */
1077
+ async updateVersionStatus(versionId, status, errorMessage) {
1078
+ try {
1079
+ this.statements.updateVersionStatus.run(status, errorMessage ?? null, versionId);
1080
+ } catch (error) {
1081
+ throw new StoreError(`Failed to update version status: ${error}`);
1082
+ }
1083
+ }
1084
+ /**
1085
+ * Updates the progress counters for a version being indexed.
1086
+ * @param versionId The version ID to update
1087
+ * @param pages Current number of pages processed
1088
+ * @param maxPages Total number of pages to process
1089
+ */
1090
+ async updateVersionProgress(versionId, pages, maxPages) {
1091
+ try {
1092
+ this.statements.updateVersionProgress.run(pages, maxPages, versionId);
1093
+ } catch (error) {
1094
+ throw new StoreError(`Failed to update version progress: ${error}`);
1095
+ }
1096
+ }
1097
+ /**
1098
+ * Retrieves versions by their status.
1099
+ * @param statuses Array of statuses to filter by
1100
+ * @returns Array of version records matching the statuses
1101
+ */
1102
+ async getVersionsByStatus(statuses) {
1103
+ try {
1104
+ const statusJson = JSON.stringify(statuses);
1105
+ const rows = this.statements.getVersionsByStatus.all(
1106
+ statusJson
1107
+ );
1108
+ return rows;
1109
+ } catch (error) {
1110
+ throw new StoreError(`Failed to get versions by status: ${error}`);
1111
+ }
1112
+ }
1113
+ /**
1114
+ * Stores scraper options for a version to enable reproducible indexing.
1115
+ * @param versionId The version ID to update
1116
+ * @param options Complete scraper options used for indexing
1117
+ */
1118
+ async storeScraperOptions(versionId, options) {
1119
+ try {
1120
+ const { url: source_url, library, version, signal, ...scraper_options } = options;
1121
+ const optionsJson = JSON.stringify(scraper_options);
1122
+ this.statements.updateVersionScraperOptions.run(source_url, optionsJson, versionId);
1123
+ } catch (error) {
1124
+ throw new StoreError(`Failed to store scraper options: ${error}`);
1125
+ }
1126
+ }
1127
+ /**
1128
+ * Retrieves stored scraping configuration (source URL and options) for a version.
1129
+ * Returns null when no source URL is recorded (not re-indexable).
1130
+ */
1131
+ async getScraperOptions(versionId) {
1132
+ try {
1133
+ const row = this.statements.getVersionWithOptions.get(versionId);
1134
+ if (!row?.source_url) {
1135
+ return null;
1136
+ }
1137
+ let parsed = {};
1138
+ if (row.scraper_options) {
1139
+ try {
1140
+ parsed = JSON.parse(row.scraper_options);
1141
+ } catch (e) {
1142
+ logger.warn(`âš ī¸ Invalid scraper_options JSON for version ${versionId}: ${e}`);
1143
+ parsed = {};
1144
+ }
1145
+ }
1146
+ return { sourceUrl: row.source_url, options: parsed };
1147
+ } catch (error) {
1148
+ throw new StoreError(`Failed to get scraper options: ${error}`);
1149
+ }
1150
+ }
1151
+ /**
1152
+ * Finds versions that were indexed from the same source URL.
1153
+ * Useful for finding similar configurations or detecting duplicates.
1154
+ * @param url Source URL to search for
1155
+ * @returns Array of versions with the same source URL
1156
+ */
1157
+ async findVersionsBySourceUrl(url) {
1158
+ try {
1159
+ const rows = this.statements.getVersionsBySourceUrl.all(
1160
+ url
1161
+ );
1162
+ return rows;
1163
+ } catch (error) {
1164
+ throw new StoreError(`Failed to find versions by source URL: ${error}`);
1165
+ }
1166
+ }
1167
+ /**
1168
+ * Verifies existence of documents for a specific library version
1169
+ */
1170
+ async checkDocumentExists(library, version) {
1171
+ try {
1172
+ const normalizedVersion = version.toLowerCase();
1173
+ const result = this.statements.checkExists.get(
1174
+ library.toLowerCase(),
1175
+ normalizedVersion
1176
+ );
1177
+ return result !== void 0;
1178
+ } catch (error) {
1179
+ throw new ConnectionError("Failed to check document existence", error);
1180
+ }
1181
+ }
1182
+ /**
1183
+ * Retrieves a mapping of all libraries to their available versions with details.
1184
+ */
1185
+ async queryLibraryVersions() {
1186
+ try {
1187
+ const rows = this.statements.queryLibraryVersions.all();
1188
+ const libraryMap = /* @__PURE__ */ new Map();
1189
+ for (const row of rows) {
1190
+ const library = row.library;
1191
+ if (!libraryMap.has(library)) {
1192
+ libraryMap.set(library, []);
1193
+ }
1194
+ const indexedAtISO = row.indexedAt ? new Date(row.indexedAt).toISOString() : null;
1195
+ libraryMap.get(library)?.push({
1196
+ version: row.version,
1197
+ versionId: row.versionId,
1198
+ // Preserve raw string status here; DocumentManagementService will cast to VersionStatus
1199
+ status: row.status,
1200
+ progressPages: row.progressPages,
1201
+ progressMaxPages: row.progressMaxPages,
1202
+ sourceUrl: row.sourceUrl,
1203
+ documentCount: row.documentCount,
1204
+ uniqueUrlCount: row.uniqueUrlCount,
1205
+ indexedAt: indexedAtISO
1206
+ });
1207
+ }
1208
+ for (const versions of libraryMap.values()) {
1209
+ versions.sort((a, b) => {
1210
+ if (a.version === "" && b.version !== "") {
1211
+ return -1;
1212
+ }
1213
+ if (a.version !== "" && b.version === "") {
1214
+ return 1;
1215
+ }
1216
+ if (a.version === "" && b.version === "") {
1217
+ return 0;
1218
+ }
1219
+ try {
1220
+ return semver__default.compare(a.version, b.version);
1221
+ } catch (_error) {
1222
+ return a.version.localeCompare(b.version);
1223
+ }
1224
+ });
1225
+ }
1226
+ return libraryMap;
1227
+ } catch (error) {
1228
+ throw new ConnectionError("Failed to query library versions", error);
1229
+ }
1230
+ }
1231
+ /**
1232
+ * Stores documents with library and version metadata, generating embeddings
1233
+ * for vector similarity search. Automatically removes any existing documents
1234
+ * for the same URLs before adding new ones to prevent UNIQUE constraint violations.
1235
+ */
1236
+ async addDocuments(library, version, documents) {
1237
+ try {
1238
+ if (documents.length === 0) {
1239
+ return;
1240
+ }
1241
+ const urls = /* @__PURE__ */ new Set();
1242
+ for (const doc of documents) {
1243
+ const url = doc.metadata.url;
1244
+ if (!url || typeof url !== "string" || !url.trim()) {
1245
+ throw new StoreError("Document metadata must include a valid URL");
1246
+ }
1247
+ urls.add(url);
1248
+ }
1249
+ const texts = documents.map((doc) => {
1250
+ const header = `<title>${doc.metadata.title}</title>
1251
+ <url>${doc.metadata.url}</url>
1252
+ <path>${doc.metadata.path.join(" / ")}</path>
1253
+ `;
1254
+ return `${header}${doc.pageContent}`;
1255
+ });
1256
+ const maxBatchChars = Number(process.env.DOCS_MCP_EMBEDDING_BATCH_CHARS) || EMBEDDING_BATCH_CHARS;
1257
+ const rawEmbeddings = [];
1258
+ let currentBatch = [];
1259
+ let currentBatchSize = 0;
1260
+ let batchCount = 0;
1261
+ for (const text of texts) {
1262
+ const textSize = text.length;
1263
+ if (currentBatchSize + textSize > maxBatchChars && currentBatch.length > 0) {
1264
+ batchCount++;
1265
+ logger.debug(
1266
+ `🔄 Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
1267
+ );
1268
+ const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
1269
+ rawEmbeddings.push(...batchEmbeddings);
1270
+ currentBatch = [];
1271
+ currentBatchSize = 0;
1272
+ }
1273
+ currentBatch.push(text);
1274
+ currentBatchSize += textSize;
1275
+ if (currentBatch.length >= EMBEDDING_BATCH_SIZE) {
1276
+ batchCount++;
1277
+ logger.debug(
1278
+ `🔄 Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
1279
+ );
1280
+ const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
1281
+ rawEmbeddings.push(...batchEmbeddings);
1282
+ currentBatch = [];
1283
+ currentBatchSize = 0;
1284
+ }
1285
+ }
1286
+ if (currentBatch.length > 0) {
1287
+ batchCount++;
1288
+ logger.debug(
1289
+ `🔄 Processing final embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
1290
+ );
1291
+ const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
1292
+ rawEmbeddings.push(...batchEmbeddings);
1293
+ }
1294
+ const paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
1295
+ const { libraryId, versionId } = await this.resolveLibraryAndVersionIds(
1296
+ library,
1297
+ version
1298
+ );
1299
+ for (const url of urls) {
1300
+ const deletedCount = await this.deleteDocumentsByUrl(library, version, url);
1301
+ if (deletedCount > 0) {
1302
+ logger.debug(`đŸ—‘ī¸ Deleted ${deletedCount} existing documents for URL: ${url}`);
1303
+ }
1304
+ }
1305
+ const transaction = this.db.transaction((docs) => {
1306
+ for (let i = 0; i < docs.length; i++) {
1307
+ const doc = docs[i];
1308
+ const url = doc.metadata.url;
1309
+ const result = this.statements.insertDocument.run(
1310
+ BigInt(libraryId),
1311
+ BigInt(versionId),
1312
+ url,
1313
+ doc.pageContent,
1314
+ JSON.stringify(doc.metadata),
1315
+ i,
1316
+ (/* @__PURE__ */ new Date()).toISOString()
1317
+ // Pass current timestamp for indexed_at
1318
+ );
1319
+ const rowId = result.lastInsertRowid;
1320
+ this.statements.insertEmbedding.run(
1321
+ BigInt(rowId),
1322
+ BigInt(libraryId),
1323
+ BigInt(versionId),
1324
+ JSON.stringify(paddedEmbeddings[i])
1325
+ );
1326
+ }
1327
+ });
1328
+ transaction(documents);
1329
+ } catch (error) {
1330
+ throw new ConnectionError("Failed to add documents to store", error);
1331
+ }
1332
+ }
1333
+ /**
1334
+ * Removes documents matching specified library and version
1335
+ * @returns Number of documents deleted
1336
+ */
1337
+ async deleteDocuments(library, version) {
1338
+ try {
1339
+ const normalizedVersion = version.toLowerCase();
1340
+ const result = this.statements.deleteDocuments.run(
1341
+ library.toLowerCase(),
1342
+ library.toLowerCase(),
1343
+ // library name appears twice in the query
1344
+ normalizedVersion
1345
+ );
1346
+ return result.changes;
1347
+ } catch (error) {
1348
+ throw new ConnectionError("Failed to delete documents", error);
1349
+ }
1350
+ }
1351
+ /**
1352
+ * Removes documents for a specific URL within a library and version
1353
+ * @returns Number of documents deleted
1354
+ */
1355
+ async deleteDocumentsByUrl(library, version, url) {
1356
+ try {
1357
+ const normalizedVersion = version.toLowerCase();
1358
+ const result = this.statements.deleteDocumentsByUrl.run(
1359
+ url,
1360
+ library.toLowerCase(),
1361
+ library.toLowerCase(),
1362
+ // library name appears twice in the query
1363
+ normalizedVersion
1364
+ );
1365
+ return result.changes;
1366
+ } catch (error) {
1367
+ throw new ConnectionError("Failed to delete documents by URL", error);
1368
+ }
1369
+ }
1370
+ /**
1371
+ * Retrieves a document by its ID.
1372
+ * @param id The ID of the document.
1373
+ * @returns The document, or null if not found.
1374
+ */
1375
+ async getById(id) {
1376
+ try {
1377
+ const row = this.statements.getById.get(BigInt(id));
1378
+ if (!row) {
1379
+ return null;
1380
+ }
1381
+ return mapDbDocumentToDocument(row);
1382
+ } catch (error) {
1383
+ throw new ConnectionError(`Failed to get document by ID ${id}`, error);
1384
+ }
1385
+ }
1386
+ /**
1387
+ * Finds documents matching a text query using hybrid search.
1388
+ * Combines vector similarity search with full-text search using Reciprocal Rank Fusion.
1389
+ */
1390
+ async findByContent(library, version, query, limit) {
1391
+ try {
1392
+ const rawEmbedding = await this.embeddings.embedQuery(query);
1393
+ const embedding = this.padVector(rawEmbedding);
1394
+ const ftsQuery = this.escapeFtsQuery(query);
1395
+ const normalizedVersion = version.toLowerCase();
1396
+ const stmt = this.db.prepare(`
1397
+ WITH vec_distances AS (
1398
+ SELECT
1399
+ dv.rowid as id,
1400
+ dv.distance as vec_distance
1401
+ FROM documents_vec dv
1402
+ JOIN versions v ON dv.version_id = v.id
1403
+ JOIN libraries l ON v.library_id = l.id
1404
+ WHERE l.name = ?
1405
+ AND COALESCE(v.name, '') = COALESCE(?, '')
1406
+ AND dv.embedding MATCH ?
1407
+ AND dv.k = ?
1408
+ ORDER BY dv.distance
1409
+ ),
1410
+ fts_scores AS (
1411
+ SELECT
1412
+ f.rowid as id,
1413
+ bm25(documents_fts, 10.0, 1.0, 5.0, 1.0) as fts_score
1414
+ FROM documents_fts f
1415
+ JOIN documents d ON f.rowid = d.id
1416
+ JOIN versions v ON d.version_id = v.id
1417
+ JOIN libraries l ON v.library_id = l.id
1418
+ WHERE l.name = ?
1419
+ AND COALESCE(v.name, '') = COALESCE(?, '')
1420
+ AND documents_fts MATCH ?
1421
+ ORDER BY fts_score
1422
+ LIMIT ?
1423
+ )
1424
+ SELECT
1425
+ d.id,
1426
+ d.content,
1427
+ d.metadata,
1428
+ COALESCE(1 / (1 + v.vec_distance), 0) as vec_score,
1429
+ COALESCE(-MIN(f.fts_score, 0), 0) as fts_score
1430
+ FROM documents d
1431
+ LEFT JOIN vec_distances v ON d.id = v.id
1432
+ LEFT JOIN fts_scores f ON d.id = f.id
1433
+ WHERE v.id IS NOT NULL OR f.id IS NOT NULL
1434
+ `);
1435
+ const rawResults = stmt.all(
1436
+ library.toLowerCase(),
1437
+ normalizedVersion,
1438
+ JSON.stringify(embedding),
1439
+ limit,
1440
+ library.toLowerCase(),
1441
+ normalizedVersion,
1442
+ ftsQuery,
1443
+ // Use the escaped query
1444
+ limit
1445
+ );
1446
+ const rankedResults = this.assignRanks(rawResults);
1447
+ const topResults = rankedResults.sort((a, b) => b.rrf_score - a.rrf_score).slice(0, limit);
1448
+ return topResults.map((row) => ({
1449
+ ...mapDbDocumentToDocument(row),
1450
+ metadata: {
1451
+ ...JSON.parse(row.metadata),
1452
+ id: row.id,
1453
+ score: row.rrf_score,
1454
+ vec_rank: row.vec_rank,
1455
+ fts_rank: row.fts_rank
1456
+ }
1457
+ }));
1458
+ } catch (error) {
1459
+ throw new ConnectionError(
1460
+ `Failed to find documents by content with query "${query}"`,
1461
+ error
1462
+ );
1463
+ }
1464
+ }
1465
+ /**
1466
+ * Finds child chunks of a given document based on path hierarchy.
1467
+ */
1468
+ async findChildChunks(library, version, id, limit) {
1469
+ try {
1470
+ const parent = await this.getById(id);
1471
+ if (!parent) {
1472
+ return [];
1473
+ }
1474
+ const parentPath = parent.metadata.path ?? [];
1475
+ const parentUrl = parent.metadata.url;
1476
+ const normalizedVersion = version.toLowerCase();
1477
+ const result = this.statements.getChildChunks.all(
1478
+ library.toLowerCase(),
1479
+ normalizedVersion,
1480
+ parentUrl,
1481
+ parentPath.length + 1,
1482
+ JSON.stringify(parentPath),
1483
+ BigInt(id),
1484
+ limit
1485
+ );
1486
+ return result.map((row) => mapDbDocumentToDocument(row));
1487
+ } catch (error) {
1488
+ throw new ConnectionError(`Failed to find child chunks for ID ${id}`, error);
1489
+ }
1490
+ }
1491
+ /**
1492
+ * Finds preceding sibling chunks of a given document.
1493
+ */
1494
+ async findPrecedingSiblingChunks(library, version, id, limit) {
1495
+ try {
1496
+ const reference = await this.getById(id);
1497
+ if (!reference) {
1498
+ return [];
1499
+ }
1500
+ const refMetadata = reference.metadata;
1501
+ const normalizedVersion = version.toLowerCase();
1502
+ const result = this.statements.getPrecedingSiblings.all(
1503
+ library.toLowerCase(),
1504
+ normalizedVersion,
1505
+ refMetadata.url,
1506
+ BigInt(id),
1507
+ JSON.stringify(refMetadata.path),
1508
+ limit
1509
+ );
1510
+ return result.reverse().map((row) => mapDbDocumentToDocument(row));
1511
+ } catch (error) {
1512
+ throw new ConnectionError(
1513
+ `Failed to find preceding sibling chunks for ID ${id}`,
1514
+ error
1515
+ );
1516
+ }
1517
+ }
1518
+ /**
1519
+ * Finds subsequent sibling chunks of a given document.
1520
+ */
1521
+ async findSubsequentSiblingChunks(library, version, id, limit) {
1522
+ try {
1523
+ const reference = await this.getById(id);
1524
+ if (!reference) {
1525
+ return [];
1526
+ }
1527
+ const refMetadata = reference.metadata;
1528
+ const normalizedVersion = version.toLowerCase();
1529
+ const result = this.statements.getSubsequentSiblings.all(
1530
+ library.toLowerCase(),
1531
+ normalizedVersion,
1532
+ refMetadata.url,
1533
+ BigInt(id),
1534
+ JSON.stringify(refMetadata.path),
1535
+ limit
1536
+ );
1537
+ return result.map((row) => mapDbDocumentToDocument(row));
1538
+ } catch (error) {
1539
+ throw new ConnectionError(
1540
+ `Failed to find subsequent sibling chunks for ID ${id}`,
1541
+ error
1542
+ );
1543
+ }
1544
+ }
1545
+ /**
1546
+ * Finds the parent chunk of a given document.
1547
+ */
1548
+ async findParentChunk(library, version, id) {
1549
+ try {
1550
+ const child = await this.getById(id);
1551
+ if (!child) {
1552
+ return null;
1553
+ }
1554
+ const childMetadata = child.metadata;
1555
+ const path2 = childMetadata.path ?? [];
1556
+ const parentPath = path2.slice(0, -1);
1557
+ if (parentPath.length === 0) {
1558
+ return null;
1559
+ }
1560
+ const normalizedVersion = version.toLowerCase();
1561
+ const result = this.statements.getParentChunk.get(
1562
+ library.toLowerCase(),
1563
+ normalizedVersion,
1564
+ childMetadata.url,
1565
+ JSON.stringify(parentPath),
1566
+ BigInt(id)
1567
+ );
1568
+ if (!result) {
1569
+ return null;
1570
+ }
1571
+ return mapDbDocumentToDocument(result);
1572
+ } catch (error) {
1573
+ throw new ConnectionError(`Failed to find parent chunk for ID ${id}`, error);
1574
+ }
1575
+ }
1576
+ /**
1577
+ * Fetches multiple documents by their IDs in a single call.
1578
+ * Returns an array of Document objects, sorted by their sort_order.
1579
+ */
1580
+ async findChunksByIds(library, version, ids) {
1581
+ if (!ids.length) return [];
1582
+ try {
1583
+ const normalizedVersion = version.toLowerCase();
1584
+ const placeholders = ids.map(() => "?").join(",");
1585
+ const stmt = this.db.prepare(
1586
+ `SELECT d.* FROM documents d
1587
+ JOIN libraries l ON d.library_id = l.id
1588
+ JOIN versions v ON d.version_id = v.id
1589
+ WHERE l.name = ?
1590
+ AND COALESCE(v.name, '') = COALESCE(?, '')
1591
+ AND d.id IN (${placeholders})
1592
+ ORDER BY d.sort_order`
1593
+ );
1594
+ const rows = stmt.all(
1595
+ library.toLowerCase(),
1596
+ normalizedVersion,
1597
+ ...ids
1598
+ );
1599
+ return rows.map((row) => mapDbDocumentToDocument(row));
1600
+ } catch (error) {
1601
+ throw new ConnectionError("Failed to fetch documents by IDs", error);
1602
+ }
1603
+ }
1604
+ }
1605
+ class DocumentManagementService {
1606
+ store;
1607
+ documentRetriever;
1608
+ splitter;
1609
+ /**
1610
+ * Normalizes a version string, converting null or undefined to an empty string
1611
+ * and converting to lowercase.
1612
+ */
1613
+ normalizeVersion(version) {
1614
+ return (version ?? "").toLowerCase();
1615
+ }
1616
+ constructor() {
1617
+ let dbPath;
1618
+ let dbDir;
1619
+ const envStorePath = process.env.DOCS_MCP_STORE_PATH;
1620
+ if (envStorePath) {
1621
+ dbDir = envStorePath;
1622
+ dbPath = path.join(dbDir, "documents.db");
1623
+ logger.debug(`💾 Using database directory from DOCS_MCP_STORE_PATH: ${dbDir}`);
1624
+ } else {
1625
+ const projectRoot = getProjectRoot();
1626
+ const oldDbDir = path.join(projectRoot, ".store");
1627
+ const oldDbPath = path.join(oldDbDir, "documents.db");
1628
+ const oldDbExists = fs.existsSync(oldDbPath);
1629
+ if (oldDbExists) {
1630
+ dbPath = oldDbPath;
1631
+ dbDir = oldDbDir;
1632
+ logger.debug(`💾 Using legacy database path: ${dbPath}`);
1633
+ } else {
1634
+ const standardPaths = envPaths("docs-mcp-server", { suffix: "" });
1635
+ dbDir = standardPaths.data;
1636
+ dbPath = path.join(dbDir, "documents.db");
1637
+ logger.debug(`💾 Using standard database directory: ${dbDir}`);
1638
+ }
1639
+ }
1640
+ try {
1641
+ fs.mkdirSync(dbDir, { recursive: true });
1642
+ } catch (error) {
1643
+ logger.error(`âš ī¸ Failed to create database directory ${dbDir}: ${error}`);
1644
+ }
1645
+ this.store = new DocumentStore(dbPath);
1646
+ this.documentRetriever = new DocumentRetrieverService(this.store);
1647
+ const semanticSplitter = new SemanticMarkdownSplitter(
1648
+ SPLITTER_PREFERRED_CHUNK_SIZE,
1649
+ SPLITTER_MAX_CHUNK_SIZE
1650
+ );
1651
+ const greedySplitter = new GreedySplitter(
1652
+ semanticSplitter,
1653
+ SPLITTER_MIN_CHUNK_SIZE,
1654
+ SPLITTER_PREFERRED_CHUNK_SIZE
1655
+ );
1656
+ this.splitter = greedySplitter;
1657
+ }
1658
+ /**
1659
+ * Initializes the underlying document store.
1660
+ */
1661
+ async initialize() {
1662
+ await this.store.initialize();
1663
+ }
1664
+ /**
1665
+ * Shuts down the underlying document store.
1666
+ */
1667
+ async shutdown() {
1668
+ logger.debug("Shutting down store manager");
1669
+ await this.store.shutdown();
1670
+ }
1671
+ // Status tracking methods for pipeline integration
1672
+ /**
1673
+ * Gets versions by their current status.
1674
+ */
1675
+ async getVersionsByStatus(statuses) {
1676
+ return this.store.getVersionsByStatus(statuses);
1677
+ }
1678
+ /**
1679
+ * Updates the status of a version.
1680
+ */
1681
+ async updateVersionStatus(versionId, status, errorMessage) {
1682
+ return this.store.updateVersionStatus(versionId, status, errorMessage);
1683
+ }
1684
+ /**
1685
+ * Updates the progress of a version being indexed.
1686
+ */
1687
+ async updateVersionProgress(versionId, pages, maxPages) {
1688
+ return this.store.updateVersionProgress(versionId, pages, maxPages);
1689
+ }
1690
+ /**
1691
+ * Stores scraper options for a version to enable reproducible indexing.
1692
+ */
1693
+ async storeScraperOptions(versionId, options) {
1694
+ return this.store.storeScraperOptions(versionId, options);
1695
+ }
1696
+ /**
1697
+ * Retrieves stored scraper options for a version.
1698
+ */
1699
+ /**
1700
+ * Retrieves stored scraping configuration for a version.
1701
+ */
1702
+ async getScraperOptions(versionId) {
1703
+ return this.store.getScraperOptions(versionId);
1704
+ }
1705
+ /**
1706
+ * Ensures a library/version exists using a VersionRef and returns version ID.
1707
+ * Delegates to existing ensureLibraryAndVersion for storage.
1708
+ */
1709
+ async ensureVersion(ref) {
1710
+ const normalized = {
1711
+ library: ref.library.trim().toLowerCase(),
1712
+ version: (ref.version ?? "").trim().toLowerCase()
1713
+ };
1714
+ return this.ensureLibraryAndVersion(normalized.library, normalized.version);
1715
+ }
1716
+ /**
1717
+ * Returns enriched library summaries including version status/progress and counts.
1718
+ * Uses existing store APIs; keeps DB details encapsulated.
1719
+ */
1720
+ async listLibraries() {
1721
+ const libMap = await this.store.queryLibraryVersions();
1722
+ const summaries = [];
1723
+ for (const [library, versions] of libMap) {
1724
+ const vs = versions.map(
1725
+ (v) => ({
1726
+ id: v.versionId,
1727
+ ref: { library, version: v.version },
1728
+ status: v.status,
1729
+ // Include progress only while indexing is active; set undefined for COMPLETED
1730
+ progress: v.status === "completed" ? void 0 : { pages: v.progressPages, maxPages: v.progressMaxPages },
1731
+ counts: { documents: v.documentCount, uniqueUrls: v.uniqueUrlCount },
1732
+ indexedAt: v.indexedAt,
1733
+ sourceUrl: v.sourceUrl ?? void 0
1734
+ })
1735
+ );
1736
+ summaries.push({ library, versions: vs });
1737
+ }
1738
+ return summaries;
1739
+ }
1740
+ /**
1741
+ * Finds versions that were indexed from the same source URL.
1742
+ */
1743
+ async findVersionsBySourceUrl(url) {
1744
+ return this.store.findVersionsBySourceUrl(url);
1745
+ }
1746
+ /**
1747
+ * Validates if a library exists in the store (either versioned or unversioned).
1748
+ * Throws LibraryNotFoundError with suggestions if the library is not found.
1749
+ * @param library The name of the library to validate.
1750
+ * @throws {LibraryNotFoundError} If the library does not exist.
1751
+ */
1752
+ async validateLibraryExists(library) {
1753
+ logger.info(`🔎 Validating existence of library: ${library}`);
1754
+ const normalizedLibrary = library.toLowerCase();
1755
+ const versions = await this.listVersions(normalizedLibrary);
1756
+ const hasUnversioned = await this.exists(normalizedLibrary, "");
1757
+ if (versions.length === 0 && !hasUnversioned) {
1758
+ logger.warn(`âš ī¸ Library '${library}' not found.`);
1759
+ const allLibraries = await this.listLibraries();
1760
+ const libraryNames = allLibraries.map((lib) => lib.library);
1761
+ let suggestions = [];
1762
+ if (libraryNames.length > 0) {
1763
+ const fuse = new Fuse(libraryNames, {
1764
+ // Configure fuse.js options if needed (e.g., threshold)
1765
+ // isCaseSensitive: false, // Handled by normalizing library names
1766
+ // includeScore: true,
1767
+ threshold: 0.4
1768
+ // Adjust threshold for desired fuzziness (0=exact, 1=match anything)
1769
+ });
1770
+ const results = fuse.search(normalizedLibrary);
1771
+ suggestions = results.slice(0, 3).map((result) => result.item);
1772
+ logger.info(`🔍 Found suggestions: ${suggestions.join(", ")}`);
1773
+ }
1774
+ throw new LibraryNotFoundError(library, suggestions);
1775
+ }
1776
+ logger.info(`✅ Library '${library}' confirmed to exist.`);
1777
+ }
1778
+ /**
1779
+ * Returns a list of all available semantic versions for a library.
1780
+ */
1781
+ async listVersions(library) {
1782
+ const versions = await this.store.queryUniqueVersions(library);
1783
+ return versions.filter((v) => semver__default.valid(v));
1784
+ }
1785
+ /**
1786
+ * Checks if documents exist for a given library and optional version.
1787
+ * If version is omitted, checks for documents without a specific version.
1788
+ */
1789
+ async exists(library, version) {
1790
+ const normalizedVersion = this.normalizeVersion(version);
1791
+ return this.store.checkDocumentExists(library, normalizedVersion);
1792
+ }
/**
 * Finds the most appropriate version of documentation based on the requested version.
 * When no target version is specified, returns the latest version.
 *
 * Version matching behavior:
 * - Exact versions (e.g., "18.0.0"): Matches that version or any earlier version
 * - X-Range patterns (e.g., "5.x", "5.2.x"): Matches within the specified range
 * - "latest" or no version: Returns the latest available version
 *
 * For documentation, we prefer matching older versions over no match at all,
 * since older docs are often still relevant and useful.
 * Also checks if unversioned documents exist for the library.
 *
 * @param library - Library name to resolve a version for.
 * @param targetVersion - Requested version, range, "latest", or undefined.
 * @returns `{ bestMatch, hasUnversioned }` — `bestMatch` is the resolved
 *   semver string or null; `hasUnversioned` indicates docs stored without
 *   a version exist.
 * @throws {VersionNotFoundError} When neither a matching semver version nor
 *   unversioned documents are available for the library.
 */
async findBestVersion(library, targetVersion) {
  const libraryAndVersion = `${library}${targetVersion ? `@${targetVersion}` : ""}`;
  logger.info(`🔍 Finding best version for ${libraryAndVersion}`);
  // Unversioned documents are stored under the empty-string version.
  const hasUnversioned = await this.store.checkDocumentExists(library, "");
  // listVersions() already filters out non-semver version strings.
  const versionStrings = await this.listVersions(library);
  if (versionStrings.length === 0) {
    if (hasUnversioned) {
      logger.info(`â„šī¸ Unversioned documents exist for ${library}`);
      return { bestMatch: null, hasUnversioned: true };
    }
    logger.warn(`âš ī¸ No valid versions found for ${library}`);
    const allLibraryDetails = await this.store.queryLibraryVersions();
    const libraryDetails = allLibraryDetails.get(library) ?? [];
    throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
  }
  let bestMatch = null;
  if (!targetVersion || targetVersion === "latest") {
    // "*" satisfies every valid version; maxSatisfying picks the highest.
    bestMatch = semver__default.maxSatisfying(versionStrings, "*");
  } else {
    // Accepts "5", "5.2", "5.2.1", "5.x", "5.2.x" (and the empty string).
    const versionRegex = /^(\d+)(?:\.(?:x(?:\.x)?|\d+(?:\.(?:x|\d+))?))?$|^$/;
    if (!versionRegex.test(targetVersion)) {
      // Unparseable format: fall through with bestMatch = null rather than throw,
      // so unversioned docs (checked below) can still satisfy the request.
      logger.warn(`âš ī¸ Invalid target version format: ${targetVersion}`);
    } else {
      let range = targetVersion;
      if (!semver__default.validRange(targetVersion)) {
        // Partial versions like "5.2" are not valid ranges; "~5.2" matches 5.2.x.
        range = `~${targetVersion}`;
      } else if (semver__default.valid(targetVersion)) {
        // Fully-qualified exact version: prefer it, but fall back to any
        // earlier version rather than returning no documentation at all.
        range = `${range} || <=${targetVersion}`;
      }
      bestMatch = semver__default.maxSatisfying(versionStrings, range);
    }
  }
  if (bestMatch) {
    logger.info(`✅ Found best match version ${bestMatch} for ${libraryAndVersion}`);
  } else {
    logger.warn(`âš ī¸ No matching semver version found for ${libraryAndVersion}`);
  }
  // Only fail hard when there is neither a semver match nor unversioned docs.
  if (!bestMatch && !hasUnversioned) {
    const allLibraryDetails = await this.store.queryLibraryVersions();
    const libraryDetails = allLibraryDetails.get(library) ?? [];
    throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
  }
  return { bestMatch, hasUnversioned };
}
1850
+ /**
1851
+ * Removes all documents for a specific library and optional version.
1852
+ * If version is omitted, removes documents without a specific version.
1853
+ */
1854
+ async removeAllDocuments(library, version) {
1855
+ const normalizedVersion = this.normalizeVersion(version);
1856
+ logger.info(
1857
+ `đŸ—‘ī¸ Removing all documents from ${library}@${normalizedVersion || "[no version]"} store`
1858
+ );
1859
+ const count = await this.store.deleteDocuments(library, normalizedVersion);
1860
+ logger.info(`📊 Deleted ${count} documents`);
1861
+ }
1862
+ /**
1863
+ * Adds a document to the store, splitting it into smaller chunks for better search results.
1864
+ * Uses SemanticMarkdownSplitter to maintain markdown structure and content types during splitting.
1865
+ * Preserves hierarchical structure of documents and distinguishes between text and code segments.
1866
+ * If version is omitted, the document is added without a specific version.
1867
+ */
1868
+ async addDocument(library, version, document) {
1869
+ const normalizedVersion = this.normalizeVersion(version);
1870
+ const url = document.metadata.url;
1871
+ if (!url || typeof url !== "string" || !url.trim()) {
1872
+ throw new StoreError("Document metadata must include a valid URL");
1873
+ }
1874
+ logger.info(`📚 Adding document: ${document.metadata.title}`);
1875
+ if (!document.pageContent.trim()) {
1876
+ throw new Error("Document content cannot be empty");
1877
+ }
1878
+ const chunks = await this.splitter.splitText(document.pageContent);
1879
+ const splitDocs = chunks.map((chunk) => ({
1880
+ pageContent: chunk.content,
1881
+ metadata: {
1882
+ ...document.metadata,
1883
+ level: chunk.section.level,
1884
+ path: chunk.section.path
1885
+ }
1886
+ }));
1887
+ logger.info(`âœ‚ī¸ Split document into ${splitDocs.length} chunks`);
1888
+ await this.store.addDocuments(library, normalizedVersion, splitDocs);
1889
+ }
1890
+ /**
1891
+ * Searches for documentation content across versions.
1892
+ * Uses hybrid search (vector + FTS).
1893
+ * If version is omitted, searches documents without a specific version.
1894
+ */
1895
+ async searchStore(library, version, query, limit = 5) {
1896
+ const normalizedVersion = this.normalizeVersion(version);
1897
+ return this.documentRetriever.search(library, normalizedVersion, query, limit);
1898
+ }
1899
+ // Deprecated simple listing removed: enriched listLibraries() is canonical
1900
+ /**
1901
+ * Ensures a library and version exist in the database and returns the version ID.
1902
+ * Creates the library and version records if they don't exist.
1903
+ */
1904
+ async ensureLibraryAndVersion(library, version) {
1905
+ const normalizedLibrary = library.toLowerCase();
1906
+ const normalizedVersion = this.normalizeVersion(version);
1907
+ const { versionId } = await this.store.resolveLibraryAndVersionIds(
1908
+ normalizedLibrary,
1909
+ normalizedVersion
1910
+ );
1911
+ return versionId;
1912
+ }
1913
+ }
1914
+ export {
1915
+ DocumentManagementService
1916
+ };
1917
+ //# sourceMappingURL=DocumentManagementService-BH02TJEe.js.map