@arabold/docs-mcp-server 1.21.1 → 1.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,2026 +0,0 @@
1
- import fs from "node:fs";
2
- import path from "node:path";
3
- import envPaths from "env-paths";
4
- import Fuse from "fuse.js";
5
- import semver__default from "semver";
6
- import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
7
- import remarkGfm from "remark-gfm";
8
- import remarkHtml from "remark-html";
9
- import remarkParse from "remark-parse";
10
- import TurndownService from "turndown";
11
- import { unified } from "unified";
12
- import { l as logger, c as createJSDOM, V as VECTOR_DIMENSION, S as StoreError, D as DimensionError, a as applyMigrations, C as ConnectionError, d as denormalizeVersionName, n as normalizeVersionName, E as EMBEDDING_BATCH_CHARS, b as EMBEDDING_BATCH_SIZE, m as mapDbDocumentToDocument, g as getProjectRoot, e as SPLITTER_PREFERRED_CHUNK_SIZE, f as SPLITTER_MAX_CHUNK_SIZE, L as LibraryNotFoundError, h as VersionNotFoundError, i as analytics, T as TelemetryEvent, j as extractHostname, k as SPLITTER_MIN_CHUNK_SIZE } from "./index.js";
13
- import "node:crypto";
14
- import "cheerio";
15
- import "node:vm";
16
- import "jsdom";
17
- import "playwright";
18
- import "@joplin/turndown-plugin-gfm";
19
- import "iconv-lite";
20
- import Database from "better-sqlite3";
21
- import * as sqliteVec from "sqlite-vec";
22
- class SplitterError extends Error {
23
- }
24
- class MinimumChunkSizeError extends SplitterError {
25
- constructor(size, maxSize) {
26
- super(
27
- `Cannot split content any further. Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`
28
- );
29
- }
30
- }
31
- class ContentSplitterError extends SplitterError {
32
- }
33
- class GreedySplitter {
34
- baseSplitter;
35
- minChunkSize;
36
- preferredChunkSize;
37
- /**
38
- * Combines a base document splitter with size constraints to produce optimally-sized chunks.
39
- * The base splitter handles the initial semantic splitting, while this class handles
40
- * the concatenation strategy.
41
- */
42
- constructor(baseSplitter, minChunkSize, preferredChunkSize) {
43
- this.baseSplitter = baseSplitter;
44
- this.minChunkSize = minChunkSize;
45
- this.preferredChunkSize = preferredChunkSize;
46
- }
47
- /**
48
- * Uses a greedy concatenation strategy to build optimally-sized chunks. Small chunks
49
- * are combined until they reach the minimum size, but splits are preserved at major
50
- * section boundaries to maintain document structure. This balances the need for
51
- * context with semantic coherence.
52
- */
53
- async splitText(markdown) {
54
- const initialChunks = await this.baseSplitter.splitText(markdown);
55
- const concatenatedChunks = [];
56
- let currentChunk = null;
57
- for (const nextChunk of initialChunks) {
58
- if (currentChunk) {
59
- if (this.wouldExceedMaxSize(currentChunk, nextChunk)) {
60
- concatenatedChunks.push(currentChunk);
61
- currentChunk = this.cloneChunk(nextChunk);
62
- continue;
63
- }
64
- if (currentChunk.content.length >= this.minChunkSize && this.startsNewMajorSection(nextChunk)) {
65
- concatenatedChunks.push(currentChunk);
66
- currentChunk = this.cloneChunk(nextChunk);
67
- continue;
68
- }
69
- currentChunk.content += `
70
- ${nextChunk.content}`;
71
- currentChunk.section = this.mergeSectionInfo(currentChunk, nextChunk);
72
- currentChunk.types = this.mergeTypes(currentChunk.types, nextChunk.types);
73
- } else {
74
- currentChunk = this.cloneChunk(nextChunk);
75
- }
76
- }
77
- if (currentChunk) {
78
- concatenatedChunks.push(currentChunk);
79
- }
80
- return concatenatedChunks;
81
- }
82
- cloneChunk(chunk) {
83
- return {
84
- types: [...chunk.types],
85
- content: chunk.content,
86
- section: {
87
- level: chunk.section.level,
88
- path: [...chunk.section.path]
89
- }
90
- };
91
- }
92
- /**
93
- * H1 and H2 headings represent major conceptual breaks in the document.
94
- * Preserving these splits helps maintain the document's logical structure.
95
- */
96
- startsNewMajorSection(chunk) {
97
- return chunk.section.level === 1 || chunk.section.level === 2;
98
- }
99
- /**
100
- * Size limit check to ensure chunks remain within embedding model constraints.
101
- * Essential for maintaining consistent embedding quality and avoiding truncation.
102
- */
103
- wouldExceedMaxSize(currentChunk, nextChunk) {
104
- if (!currentChunk) {
105
- return false;
106
- }
107
- return currentChunk.content.length + nextChunk.content.length > this.preferredChunkSize;
108
- }
109
- /**
110
- * Checks if one path is a prefix of another path, indicating a parent-child relationship
111
- */
112
- isPathIncluded(parentPath, childPath) {
113
- if (parentPath.length >= childPath.length) return false;
114
- return parentPath.every((part, i) => part === childPath[i]);
115
- }
116
- /**
117
- * Merges section metadata when concatenating chunks, following these rules:
118
- * 1. Level: Always uses the lowest (most general) level between chunks
119
- * 2. Path selection:
120
- * - For parent-child relationships (one path includes the other), uses the child's path
121
- * - For siblings/unrelated sections, uses the common parent path
122
- * - If no common path exists, uses the root path ([])
123
- */
124
- mergeSectionInfo(currentChunk, nextChunk) {
125
- const level = Math.min(currentChunk.section.level, nextChunk.section.level);
126
- if (currentChunk.section.level === nextChunk.section.level && currentChunk.section.path.length === nextChunk.section.path.length && currentChunk.section.path.every((p, i) => p === nextChunk.section.path[i])) {
127
- return currentChunk.section;
128
- }
129
- if (this.isPathIncluded(currentChunk.section.path, nextChunk.section.path)) {
130
- return {
131
- path: nextChunk.section.path,
132
- level
133
- };
134
- }
135
- if (this.isPathIncluded(nextChunk.section.path, currentChunk.section.path)) {
136
- return {
137
- path: currentChunk.section.path,
138
- level
139
- };
140
- }
141
- const commonPath = this.findCommonPrefix(
142
- currentChunk.section.path,
143
- nextChunk.section.path
144
- );
145
- return {
146
- path: commonPath,
147
- level
148
- };
149
- }
150
- mergeTypes(currentTypes, nextTypes) {
151
- return [.../* @__PURE__ */ new Set([...currentTypes, ...nextTypes])];
152
- }
153
- /**
154
- * Returns longest common prefix between two paths
155
- */
156
- findCommonPrefix(path1, path2) {
157
- const common = [];
158
- for (let i = 0; i < Math.min(path1.length, path2.length); i++) {
159
- if (path1[i] === path2[i]) {
160
- common.push(path1[i]);
161
- } else {
162
- break;
163
- }
164
- }
165
- return common;
166
- }
167
- }
168
- const fullTrim = (str) => {
169
- return str.replace(/^[\s\r\n\t]+|[\s\r\n\t]+$/g, "");
170
- };
171
- class CodeContentSplitter {
172
- constructor(options) {
173
- this.options = options;
174
- }
175
- async split(content) {
176
- const language = content.match(/^```(\w+)\n/)?.[1];
177
- const strippedContent = content.replace(/^```(\w*)\n/, "").replace(/```\s*$/, "");
178
- const lines = strippedContent.split("\n");
179
- const chunks = [];
180
- let currentChunkLines = [];
181
- for (const line of lines) {
182
- const singleLineSize = this.wrap(line, language).length;
183
- if (singleLineSize > this.options.chunkSize) {
184
- throw new MinimumChunkSizeError(singleLineSize, this.options.chunkSize);
185
- }
186
- currentChunkLines.push(line);
187
- const newChunkContent = this.wrap(currentChunkLines.join("\n"), language);
188
- const newChunkSize = newChunkContent.length;
189
- if (newChunkSize > this.options.chunkSize && currentChunkLines.length > 1) {
190
- const lastLine = currentChunkLines.pop();
191
- chunks.push(this.wrap(currentChunkLines.join("\n"), language));
192
- currentChunkLines = [lastLine];
193
- }
194
- }
195
- if (currentChunkLines.length > 0) {
196
- chunks.push(this.wrap(currentChunkLines.join("\n"), language));
197
- }
198
- return chunks;
199
- }
200
- wrap(content, language) {
201
- return `\`\`\`${language || ""}
202
- ${content.replace(/\n+$/, "")}
203
- \`\`\``;
204
- }
205
- }
206
- class TableContentSplitter {
207
- constructor(options) {
208
- this.options = options;
209
- }
210
- /**
211
- * Splits table content into chunks while preserving table structure
212
- */
213
- async split(content) {
214
- const parsedTable = this.parseTable(content);
215
- if (!parsedTable) {
216
- return [content];
217
- }
218
- const { headers, rows } = parsedTable;
219
- const chunks = [];
220
- let currentRows = [];
221
- for (const row of rows) {
222
- const singleRowSize = this.wrap(row, headers).length;
223
- if (singleRowSize > this.options.chunkSize) {
224
- throw new MinimumChunkSizeError(singleRowSize, this.options.chunkSize);
225
- }
226
- const newChunkContent = this.wrap([...currentRows, row].join("\n"), headers);
227
- const newChunkSize = newChunkContent.length;
228
- if (newChunkSize > this.options.chunkSize && currentRows.length > 0) {
229
- chunks.push(this.wrap(currentRows.join("\n"), headers));
230
- currentRows = [row];
231
- } else {
232
- currentRows.push(row);
233
- }
234
- }
235
- if (currentRows.length > 0) {
236
- chunks.push(this.wrap(currentRows.join("\n"), headers));
237
- }
238
- return chunks;
239
- }
240
- wrap(content, headers) {
241
- const headerRow = `| ${headers.join(" | ")} |`;
242
- const separatorRow = `|${headers.map(() => "---").join("|")}|`;
243
- return [headerRow, separatorRow, content].join("\n");
244
- }
245
- parseTable(content) {
246
- const lines = content.trim().split("\n");
247
- if (lines.length < 3) return null;
248
- const headers = this.parseRow(lines[0]);
249
- if (!headers) return null;
250
- const separator = lines[1];
251
- if (!this.isValidSeparator(separator)) return null;
252
- const rows = lines.slice(2).filter((row) => row.trim() !== "");
253
- return { headers, separator, rows };
254
- }
255
- /**
256
- * Parses a table row into cells
257
- */
258
- parseRow(row) {
259
- if (!row.includes("|")) return null;
260
- return row.split("|").map((cell) => cell.trim()).filter((cell) => cell !== "");
261
- }
262
- /**
263
- * Validates the separator row of the table
264
- */
265
- isValidSeparator(separator) {
266
- return separator.includes("|") && /^\|?[\s-|]+\|?$/.test(separator);
267
- }
268
- }
269
- class TextContentSplitter {
270
- constructor(options) {
271
- this.options = options;
272
- }
273
- /**
274
- * Splits text content into chunks while trying to preserve semantic boundaries.
275
- * Prefers paragraph breaks, then line breaks, finally falling back to word boundaries.
276
- */
277
- async split(content) {
278
- const trimmedContent = fullTrim(content);
279
- if (trimmedContent.length <= this.options.chunkSize) {
280
- return [trimmedContent];
281
- }
282
- const words = trimmedContent.split(/\s+/);
283
- const longestWord = words.reduce(
284
- (max, word) => word.length > max.length ? word : max
285
- );
286
- if (longestWord.length > this.options.chunkSize) {
287
- throw new MinimumChunkSizeError(longestWord.length, this.options.chunkSize);
288
- }
289
- const paragraphChunks = this.splitByParagraphs(trimmedContent);
290
- if (this.areChunksValid(paragraphChunks)) {
291
- return paragraphChunks;
292
- }
293
- const lineChunks = this.splitByLines(trimmedContent);
294
- if (this.areChunksValid(lineChunks)) {
295
- return this.mergeChunks(lineChunks, "\n");
296
- }
297
- const wordChunks = await this.splitByWords(trimmedContent);
298
- return this.mergeChunks(wordChunks, " ");
299
- }
300
- /**
301
- * Checks if all chunks are within the maximum size limit
302
- */
303
- areChunksValid(chunks) {
304
- return chunks.every((chunk) => chunk.length <= this.options.chunkSize);
305
- }
306
- /**
307
- * Splits text into chunks by paragraph boundaries (double newlines)
308
- */
309
- splitByParagraphs(text) {
310
- const paragraphs = text.split(/\n\s*\n/).map((p) => fullTrim(p)).filter(Boolean);
311
- return paragraphs.filter((chunk) => chunk.length > 2);
312
- }
313
- /**
314
- * Splits text into chunks by line boundaries
315
- */
316
- splitByLines(text) {
317
- const lines = text.split(/\n/).map((line) => fullTrim(line)).filter(Boolean);
318
- return lines.filter((chunk) => chunk.length > 1);
319
- }
320
- /**
321
- * Uses LangChain's recursive splitter for word-based splitting as a last resort
322
- */
323
- async splitByWords(text) {
324
- const splitter = new RecursiveCharacterTextSplitter({
325
- chunkSize: this.options.chunkSize,
326
- chunkOverlap: 0
327
- });
328
- const chunks = await splitter.splitText(text);
329
- return chunks;
330
- }
331
- /**
332
- * Attempts to merge small chunks with previous chunks to minimize fragmentation.
333
- * Only merges if combined size is within maxChunkSize.
334
- */
335
- mergeChunks(chunks, separator) {
336
- const mergedChunks = [];
337
- let currentChunk = null;
338
- for (const chunk of chunks) {
339
- if (currentChunk === null) {
340
- currentChunk = chunk;
341
- continue;
342
- }
343
- const currentChunkSize = this.getChunkSize(currentChunk);
344
- const nextChunkSize = this.getChunkSize(chunk);
345
- if (currentChunkSize + nextChunkSize + separator.length <= this.options.chunkSize) {
346
- currentChunk = `${currentChunk}${separator}${chunk}`;
347
- } else {
348
- mergedChunks.push(currentChunk);
349
- currentChunk = chunk;
350
- }
351
- }
352
- if (currentChunk) {
353
- mergedChunks.push(currentChunk);
354
- }
355
- return mergedChunks;
356
- }
357
- getChunkSize(chunk) {
358
- return chunk.length;
359
- }
360
- wrap(content) {
361
- return content;
362
- }
363
- }
364
- class SemanticMarkdownSplitter {
365
- constructor(preferredChunkSize, maxChunkSize) {
366
- this.preferredChunkSize = preferredChunkSize;
367
- this.maxChunkSize = maxChunkSize;
368
- this.turndownService = new TurndownService({
369
- headingStyle: "atx",
370
- hr: "---",
371
- bulletListMarker: "-",
372
- codeBlockStyle: "fenced",
373
- emDelimiter: "_",
374
- strongDelimiter: "**",
375
- linkStyle: "inlined"
376
- });
377
- this.turndownService.addRule("table", {
378
- filter: ["table"],
379
- replacement: (_content, node) => {
380
- const table = node;
381
- const headers = Array.from(table.querySelectorAll("th")).map(
382
- (th) => th.textContent?.trim() || ""
383
- );
384
- const rows = Array.from(table.querySelectorAll("tr")).filter(
385
- (tr) => !tr.querySelector("th")
386
- );
387
- if (headers.length === 0 && rows.length === 0) return "";
388
- let markdown = "\n";
389
- if (headers.length > 0) {
390
- markdown += `| ${headers.join(" | ")} |
391
- `;
392
- markdown += `|${headers.map(() => "---").join("|")}|
393
- `;
394
- }
395
- for (const row of rows) {
396
- const cells = Array.from(row.querySelectorAll("td")).map(
397
- (td) => td.textContent?.trim() || ""
398
- );
399
- markdown += `| ${cells.join(" | ")} |
400
- `;
401
- }
402
- return markdown;
403
- }
404
- });
405
- this.textSplitter = new TextContentSplitter({
406
- chunkSize: this.preferredChunkSize
407
- });
408
- this.codeSplitter = new CodeContentSplitter({
409
- chunkSize: this.maxChunkSize
410
- });
411
- this.tableSplitter = new TableContentSplitter({
412
- chunkSize: this.maxChunkSize
413
- });
414
- }
415
- turndownService;
416
- textSplitter;
417
- codeSplitter;
418
- tableSplitter;
419
- /**
420
- * Main entry point for splitting markdown content
421
- */
422
- async splitText(markdown) {
423
- const html = await this.markdownToHtml(markdown);
424
- const dom = await this.parseHtml(html);
425
- const sections = await this.splitIntoSections(dom);
426
- return this.splitSectionContent(sections);
427
- }
428
- /**
429
- * Step 1: Split document into sections based on H1-H6 headings,
430
- * as well as code blocks and tables.
431
- */
432
- async splitIntoSections(dom) {
433
- const body = dom.querySelector("body");
434
- if (!body) {
435
- throw new Error("Invalid HTML structure: no body element found");
436
- }
437
- let currentSection = this.createRootSection();
438
- const sections = [];
439
- const stack = [currentSection];
440
- for (const element of Array.from(body.children)) {
441
- const headingMatch = element.tagName.match(/H([1-6])/);
442
- if (headingMatch) {
443
- const level = Number.parseInt(headingMatch[1], 10);
444
- const title = fullTrim(element.textContent || "");
445
- while (stack.length > 1 && stack[stack.length - 1].level >= level) {
446
- stack.pop();
447
- }
448
- currentSection = {
449
- level,
450
- path: [
451
- ...stack.slice(1).reduce((acc, s) => {
452
- const lastPath = s.path[s.path.length - 1];
453
- if (lastPath) acc.push(lastPath);
454
- return acc;
455
- }, []),
456
- title
457
- ],
458
- content: [
459
- {
460
- type: "heading",
461
- text: `${"#".repeat(level)} ${title}`
462
- }
463
- ]
464
- };
465
- sections.push(currentSection);
466
- stack.push(currentSection);
467
- } else if (element.tagName === "PRE") {
468
- const code = element.querySelector("code");
469
- const language = code?.className.replace("language-", "") || "";
470
- const content = code?.textContent || element.textContent || "";
471
- const markdown = `${"```"}${language}
472
- ${content}
473
- ${"```"}`;
474
- currentSection = {
475
- level: currentSection.level,
476
- path: currentSection.path,
477
- content: [
478
- {
479
- type: "code",
480
- text: markdown
481
- }
482
- ]
483
- };
484
- sections.push(currentSection);
485
- } else if (element.tagName === "TABLE") {
486
- const markdown = fullTrim(this.turndownService.turndown(element.outerHTML));
487
- currentSection = {
488
- level: currentSection.level,
489
- path: currentSection.path,
490
- content: [
491
- {
492
- type: "table",
493
- text: markdown
494
- }
495
- ]
496
- };
497
- sections.push(currentSection);
498
- } else {
499
- const markdown = fullTrim(this.turndownService.turndown(element.innerHTML));
500
- if (markdown) {
501
- currentSection = {
502
- level: currentSection.level,
503
- path: currentSection.path,
504
- content: [
505
- {
506
- type: "text",
507
- text: markdown
508
- }
509
- ]
510
- };
511
- sections.push(currentSection);
512
- }
513
- }
514
- }
515
- return sections;
516
- }
517
- /**
518
- * Step 2: Split section content into smaller chunks
519
- */
520
- async splitSectionContent(sections) {
521
- const chunks = [];
522
- for (const section of sections) {
523
- for (const content of section.content) {
524
- let splitContent = [];
525
- try {
526
- switch (content.type) {
527
- case "heading":
528
- case "text": {
529
- splitContent = await this.textSplitter.split(content.text);
530
- break;
531
- }
532
- case "code": {
533
- splitContent = await this.codeSplitter.split(content.text);
534
- break;
535
- }
536
- case "table": {
537
- splitContent = await this.tableSplitter.split(content.text);
538
- break;
539
- }
540
- }
541
- } catch (err) {
542
- if (err instanceof MinimumChunkSizeError) {
543
- logger.warn(
544
- `⚠ Cannot split ${content.type} chunk normally, using RecursiveCharacterTextSplitter: ${err.message}`
545
- );
546
- const splitter = new RecursiveCharacterTextSplitter({
547
- chunkSize: this.maxChunkSize,
548
- chunkOverlap: Math.min(20, Math.floor(this.maxChunkSize * 0.1)),
549
- // Use more aggressive separators including empty string as last resort
550
- separators: [
551
- "\n\n",
552
- "\n",
553
- " ",
554
- " ",
555
- ".",
556
- ",",
557
- ";",
558
- ":",
559
- "-",
560
- "(",
561
- ")",
562
- "[",
563
- "]",
564
- "{",
565
- "}",
566
- ""
567
- ]
568
- });
569
- const chunks2 = await splitter.splitText(content.text);
570
- if (chunks2.length === 0) {
571
- splitContent = [content.text.substring(0, this.maxChunkSize)];
572
- } else {
573
- splitContent = chunks2;
574
- }
575
- } else {
576
- const errMessage = err instanceof Error ? err.message : String(err);
577
- throw new ContentSplitterError(
578
- `Failed to split ${content.type} content: ${errMessage}`
579
- );
580
- }
581
- }
582
- chunks.push(
583
- ...splitContent.map(
584
- (text) => ({
585
- types: [content.type],
586
- content: text,
587
- section: {
588
- level: section.level,
589
- path: section.path
590
- }
591
- })
592
- )
593
- );
594
- }
595
- }
596
- return chunks;
597
- }
598
- /**
599
- * Helper to create the root section
600
- */
601
- createRootSection() {
602
- return {
603
- level: 0,
604
- path: [],
605
- content: []
606
- };
607
- }
608
- /**
609
- * Convert markdown to HTML using remark
610
- */
611
- async markdownToHtml(markdown) {
612
- const html = await unified().use(remarkParse).use(remarkGfm).use(remarkHtml).process(markdown);
613
- return `<!DOCTYPE html>
614
- <html>
615
- <body>
616
- ${String(html)}
617
- </body>
618
- </html>`;
619
- }
620
- /**
621
- * Parse HTML
622
- */
623
- async parseHtml(html) {
624
- const { window } = createJSDOM(html);
625
- return window.document;
626
- }
627
- }
628
- const CHILD_LIMIT = 5;
629
- const SIBLING_LIMIT = 2;
630
- class DocumentRetrieverService {
631
- documentStore;
632
- constructor(documentStore) {
633
- this.documentStore = documentStore;
634
- }
635
- /**
636
- * Collects all related chunk IDs for a given initial hit.
637
- * Returns an object with url, hitId, relatedIds (Set), and score.
638
- */
639
- async getRelatedChunkIds(library, version, doc, siblingLimit = SIBLING_LIMIT, childLimit = CHILD_LIMIT) {
640
- const id = doc.id;
641
- const url = doc.metadata.url;
642
- const score = doc.metadata.score;
643
- const relatedIds = /* @__PURE__ */ new Set();
644
- relatedIds.add(id);
645
- const parent = await this.documentStore.findParentChunk(library, version, id);
646
- if (parent) {
647
- relatedIds.add(parent.id);
648
- }
649
- const precedingSiblings = await this.documentStore.findPrecedingSiblingChunks(
650
- library,
651
- version,
652
- id,
653
- siblingLimit
654
- );
655
- for (const sib of precedingSiblings) {
656
- relatedIds.add(sib.id);
657
- }
658
- const childChunks = await this.documentStore.findChildChunks(
659
- library,
660
- version,
661
- id,
662
- childLimit
663
- );
664
- for (const child of childChunks) {
665
- relatedIds.add(child.id);
666
- }
667
- const subsequentSiblings = await this.documentStore.findSubsequentSiblingChunks(
668
- library,
669
- version,
670
- id,
671
- siblingLimit
672
- );
673
- for (const sib of subsequentSiblings) {
674
- relatedIds.add(sib.id);
675
- }
676
- return { url, hitId: id, relatedIds, score };
677
- }
678
- /**
679
- * Groups related chunk info by URL, deduplicates IDs, and finds max score per URL.
680
- */
681
- groupAndPrepareFetch(relatedInfos) {
682
- const urlMap = /* @__PURE__ */ new Map();
683
- for (const info of relatedInfos) {
684
- let entry = urlMap.get(info.url);
685
- if (!entry) {
686
- entry = { uniqueChunkIds: /* @__PURE__ */ new Set(), maxScore: info.score };
687
- urlMap.set(info.url, entry);
688
- }
689
- for (const id of info.relatedIds) {
690
- entry.uniqueChunkIds.add(id);
691
- }
692
- if (info.score > entry.maxScore) {
693
- entry.maxScore = info.score;
694
- }
695
- }
696
- return urlMap;
697
- }
698
- /**
699
- * Finalizes the merged result for a URL group by fetching, sorting, and joining content.
700
- */
701
- async finalizeResult(library, version, url, uniqueChunkIds, maxScore) {
702
- const ids = Array.from(uniqueChunkIds);
703
- const docs = await this.documentStore.findChunksByIds(library, version, ids);
704
- const content = docs.map((d) => d.pageContent).join("\n\n");
705
- return {
706
- url,
707
- content,
708
- score: maxScore
709
- };
710
- }
711
- /**
712
- * Searches for documents and expands the context around the matches.
713
- * @param library The library name.
714
- * @param version The library version.
715
- * @param query The search query.
716
- * @param version The library version (optional, defaults to searching documents without a version).
717
- * @param query The search query.
718
- * @param limit The optional limit for the initial search results.
719
- * @returns An array of strings representing the aggregated content of the retrieved chunks.
720
- */
721
- async search(library, version, query, limit) {
722
- const normalizedVersion = (version ?? "").toLowerCase();
723
- const initialResults = await this.documentStore.findByContent(
724
- library,
725
- normalizedVersion,
726
- query,
727
- limit ?? 10
728
- );
729
- const relatedInfos = await Promise.all(
730
- initialResults.map(
731
- (doc) => this.getRelatedChunkIds(library, normalizedVersion, doc)
732
- )
733
- );
734
- const urlMap = this.groupAndPrepareFetch(relatedInfos);
735
- const results = [];
736
- for (const [url, { uniqueChunkIds, maxScore }] of urlMap.entries()) {
737
- const result = await this.finalizeResult(
738
- library,
739
- normalizedVersion,
740
- url,
741
- uniqueChunkIds,
742
- maxScore
743
- );
744
- results.push(result);
745
- }
746
- return results;
747
- }
748
- }
749
- class DocumentStore {
750
- db;
751
- embeddings;
752
- dbDimension = VECTOR_DIMENSION;
753
- modelDimension;
754
- statements;
755
- /**
756
- * Calculates Reciprocal Rank Fusion score for a result
757
- */
758
- calculateRRF(vecRank, ftsRank, k = 60) {
759
- let rrf = 0;
760
- if (vecRank !== void 0) {
761
- rrf += 1 / (k + vecRank);
762
- }
763
- if (ftsRank !== void 0) {
764
- rrf += 1 / (k + ftsRank);
765
- }
766
- return rrf;
767
- }
768
- /**
769
- * Assigns ranks to search results based on their scores
770
- */
771
- assignRanks(results) {
772
- const vecRanks = /* @__PURE__ */ new Map();
773
- const ftsRanks = /* @__PURE__ */ new Map();
774
- results.filter((r) => r.vec_score !== void 0).sort((a, b) => (b.vec_score ?? 0) - (a.vec_score ?? 0)).forEach((result, index) => {
775
- vecRanks.set(Number(result.id), index + 1);
776
- });
777
- results.filter((r) => r.fts_score !== void 0).sort((a, b) => (b.fts_score ?? 0) - (a.fts_score ?? 0)).forEach((result, index) => {
778
- ftsRanks.set(Number(result.id), index + 1);
779
- });
780
- return results.map((result) => ({
781
- ...result,
782
- vec_rank: vecRanks.get(Number(result.id)),
783
- fts_rank: ftsRanks.get(Number(result.id)),
784
- rrf_score: this.calculateRRF(
785
- vecRanks.get(Number(result.id)),
786
- ftsRanks.get(Number(result.id))
787
- )
788
- }));
789
- }
790
- constructor(dbPath) {
791
- if (!dbPath) {
792
- throw new StoreError("Missing required database path");
793
- }
794
- this.db = new Database(dbPath);
795
- }
796
- /**
797
- * Sets up prepared statements for database queries
798
- */
799
- prepareStatements() {
800
- const statements = {
801
- getById: this.db.prepare("SELECT * FROM documents WHERE id = ?"),
802
- insertDocument: this.db.prepare(
803
- "INSERT INTO documents (library_id, version_id, url, content, metadata, sort_order, indexed_at) VALUES (?, ?, ?, ?, ?, ?, ?)"
804
- ),
805
- insertEmbedding: this.db.prepare(
806
- "INSERT INTO documents_vec (rowid, library_id, version_id, embedding) VALUES (?, ?, ?, ?)"
807
- ),
808
- insertLibrary: this.db.prepare(
809
- "INSERT INTO libraries (name) VALUES (?) ON CONFLICT(name) DO NOTHING"
810
- ),
811
- getLibraryIdByName: this.db.prepare(
812
- "SELECT id FROM libraries WHERE name = ?"
813
- ),
814
- // New version-related statements
815
- insertVersion: this.db.prepare(
816
- "INSERT INTO versions (library_id, name, status) VALUES (?, ?, 'not_indexed') ON CONFLICT(library_id, name) DO NOTHING"
817
- ),
818
- resolveVersionId: this.db.prepare(
819
- "SELECT id FROM versions WHERE library_id = ? AND name IS ?"
820
- ),
821
- getVersionById: this.db.prepare("SELECT * FROM versions WHERE id = ?"),
822
- queryVersionsByLibraryId: this.db.prepare(
823
- "SELECT * FROM versions WHERE library_id = ? ORDER BY name"
824
- ),
825
- deleteLibraryDocuments: this.db.prepare(
826
- `DELETE FROM documents
827
- WHERE library_id = (SELECT id FROM libraries WHERE name = ?)
828
- AND version_id = (
829
- SELECT v.id FROM versions v
830
- WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
831
- AND COALESCE(v.name, '') = COALESCE(?, '')
832
- )`
833
- ),
834
- deleteDocuments: this.db.prepare(
835
- `DELETE FROM documents
836
- WHERE library_id = (SELECT id FROM libraries WHERE name = ?)
837
- AND version_id = (
838
- SELECT v.id FROM versions v
839
- WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
840
- AND COALESCE(v.name, '') = COALESCE(?, '')
841
- )`
842
- ),
843
- deleteDocumentsByUrl: this.db.prepare(
844
- `DELETE FROM documents
845
- WHERE url = ?
846
- AND library_id = (SELECT id FROM libraries WHERE name = ?)
847
- AND version_id = (
848
- SELECT v.id FROM versions v
849
- WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
850
- AND COALESCE(v.name, '') = COALESCE(?, '')
851
- )`
852
- ),
853
- getDocumentBySort: this.db.prepare(
854
- `SELECT d.id
855
- FROM documents d
856
- JOIN versions v ON d.version_id = v.id
857
- JOIN libraries l ON v.library_id = l.id
858
- WHERE l.name = ?
859
- AND COALESCE(v.name, '') = COALESCE(?, '')
860
- LIMIT 1`
861
- ),
862
- queryVersions: this.db.prepare(
863
- `SELECT DISTINCT v.name
864
- FROM versions v
865
- JOIN libraries l ON v.library_id = l.id
866
- WHERE l.name = ?
867
- ORDER BY v.name`
868
- ),
869
- checkExists: this.db.prepare(
870
- `SELECT d.id FROM documents d
871
- JOIN versions v ON d.version_id = v.id
872
- JOIN libraries l ON v.library_id = l.id
873
- WHERE l.name = ?
874
- AND COALESCE(v.name, '') = COALESCE(?, '')
875
- LIMIT 1`
876
- ),
877
- // Library/version aggregation including versions without documents and status/progress fields
878
- queryLibraryVersions: this.db.prepare(
879
- `SELECT
880
- l.name as library,
881
- COALESCE(v.name, '') as version,
882
- v.id as versionId,
883
- v.status as status,
884
- v.progress_pages as progressPages,
885
- v.progress_max_pages as progressMaxPages,
886
- v.source_url as sourceUrl,
887
- MIN(d.indexed_at) as indexedAt,
888
- COUNT(d.id) as documentCount,
889
- COUNT(DISTINCT d.url) as uniqueUrlCount
890
- FROM versions v
891
- JOIN libraries l ON v.library_id = l.id
892
- LEFT JOIN documents d ON d.version_id = v.id
893
- GROUP BY v.id
894
- ORDER BY l.name, version`
895
- ),
896
- getChildChunks: this.db.prepare(`
897
- SELECT d.* FROM documents d
898
- JOIN versions v ON d.version_id = v.id
899
- JOIN libraries l ON v.library_id = l.id
900
- WHERE l.name = ?
901
- AND COALESCE(v.name, '') = COALESCE(?, '')
902
- AND d.url = ?
903
- AND json_array_length(json_extract(d.metadata, '$.path')) = ?
904
- AND json_extract(d.metadata, '$.path') LIKE ? || '%'
905
- AND d.sort_order > (SELECT sort_order FROM documents WHERE id = ?)
906
- ORDER BY d.sort_order
907
- LIMIT ?
908
- `),
909
- getPrecedingSiblings: this.db.prepare(`
910
- SELECT d.* FROM documents d
911
- JOIN versions v ON d.version_id = v.id
912
- JOIN libraries l ON v.library_id = l.id
913
- WHERE l.name = ?
914
- AND COALESCE(v.name, '') = COALESCE(?, '')
915
- AND d.url = ?
916
- AND d.sort_order < (SELECT sort_order FROM documents WHERE id = ?)
917
- AND json_extract(d.metadata, '$.path') = ?
918
- ORDER BY d.sort_order DESC
919
- LIMIT ?
920
- `),
921
- getSubsequentSiblings: this.db.prepare(`
922
- SELECT d.* FROM documents d
923
- JOIN versions v ON d.version_id = v.id
924
- JOIN libraries l ON v.library_id = l.id
925
- WHERE l.name = ?
926
- AND COALESCE(v.name, '') = COALESCE(?, '')
927
- AND d.url = ?
928
- AND d.sort_order > (SELECT sort_order FROM documents WHERE id = ?)
929
- AND json_extract(d.metadata, '$.path') = ?
930
- ORDER BY d.sort_order
931
- LIMIT ?
932
- `),
933
- getParentChunk: this.db.prepare(`
934
- SELECT d.* FROM documents d
935
- JOIN versions v ON d.version_id = v.id
936
- JOIN libraries l ON v.library_id = l.id
937
- WHERE l.name = ?
938
- AND COALESCE(v.name, '') = COALESCE(?, '')
939
- AND d.url = ?
940
- AND json_extract(d.metadata, '$.path') = ?
941
- AND d.sort_order < (SELECT sort_order FROM documents WHERE id = ?)
942
- ORDER BY d.sort_order DESC
943
- LIMIT 1
944
- `),
945
- // Status tracking statements
946
- updateVersionStatus: this.db.prepare(
947
- "UPDATE versions SET status = ?, error_message = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
948
- ),
949
- updateVersionProgress: this.db.prepare(
950
- "UPDATE versions SET progress_pages = ?, progress_max_pages = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
951
- ),
952
- getVersionsByStatus: this.db.prepare(
953
- "SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.status IN (SELECT value FROM json_each(?))"
954
- ),
955
- // Scraper options statements
956
- updateVersionScraperOptions: this.db.prepare(
957
- "UPDATE versions SET source_url = ?, scraper_options = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
958
- ),
959
- getVersionWithOptions: this.db.prepare(
960
- "SELECT * FROM versions WHERE id = ?"
961
- ),
962
- getVersionsBySourceUrl: this.db.prepare(
963
- "SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.source_url = ? ORDER BY v.created_at DESC"
964
- ),
965
- // Version and library deletion statements
966
- deleteVersionById: this.db.prepare("DELETE FROM versions WHERE id = ?"),
967
- deleteLibraryById: this.db.prepare("DELETE FROM libraries WHERE id = ?"),
968
- countVersionsByLibraryId: this.db.prepare(
969
- "SELECT COUNT(*) as count FROM versions WHERE library_id = ?"
970
- ),
971
- getVersionId: this.db.prepare(
972
- `SELECT v.id, v.library_id FROM versions v
973
- JOIN libraries l ON v.library_id = l.id
974
- WHERE l.name = ? AND COALESCE(v.name, '') = COALESCE(?, '')`
975
- )
976
- };
977
- this.statements = statements;
978
- }
979
- /**
980
- * Pads a vector to the fixed database dimension by appending zeros.
981
- * Throws an error if the input vector is longer than the database dimension.
982
- */
983
- padVector(vector) {
984
- if (vector.length > this.dbDimension) {
985
- throw new Error(
986
- `Vector dimension ${vector.length} exceeds database dimension ${this.dbDimension}`
987
- );
988
- }
989
- if (vector.length === this.dbDimension) {
990
- return vector;
991
- }
992
- return [...vector, ...new Array(this.dbDimension - vector.length).fill(0)];
993
- }
994
- /**
995
- * Initializes embeddings client using environment variables for configuration.
996
- *
997
- * The embedding model is configured using DOCS_MCP_EMBEDDING_MODEL environment variable.
998
- * Format: "provider:model_name" (e.g., "google:text-embedding-004") or just "model_name"
999
- * for OpenAI (default).
1000
- *
1001
- * Supported providers and their required environment variables:
1002
- * - openai: OPENAI_API_KEY (and optionally OPENAI_API_BASE, OPENAI_ORG_ID)
1003
- * - google: GOOGLE_APPLICATION_CREDENTIALS (path to service account JSON)
1004
- * - aws: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION (or BEDROCK_AWS_REGION)
1005
- * - microsoft: Azure OpenAI credentials (AZURE_OPENAI_API_*)
1006
- */
1007
- async initializeEmbeddings() {
1008
- const modelSpec = process.env.DOCS_MCP_EMBEDDING_MODEL || "text-embedding-3-small";
1009
- const { createEmbeddingModel } = await import("./EmbeddingFactory-CElwVk3X.js");
1010
- this.embeddings = createEmbeddingModel(modelSpec);
1011
- const testVector = await this.embeddings.embedQuery("test");
1012
- this.modelDimension = testVector.length;
1013
- if (this.modelDimension > this.dbDimension) {
1014
- throw new DimensionError(modelSpec, this.modelDimension, this.dbDimension);
1015
- }
1016
- }
1017
- /**
1018
- * Escapes a query string for use with SQLite FTS5 MATCH operator.
1019
- * Wraps the query in double quotes and escapes internal double quotes.
1020
- */
1021
- escapeFtsQuery(query) {
1022
- const escapedQuotes = query.replace(/"/g, '""');
1023
- return `"${escapedQuotes}"`;
1024
- }
1025
- /**
1026
- * Initializes database connection and ensures readiness
1027
- */
1028
- async initialize() {
1029
- try {
1030
- sqliteVec.load(this.db);
1031
- applyMigrations(this.db);
1032
- this.prepareStatements();
1033
- await this.initializeEmbeddings();
1034
- } catch (error) {
1035
- if (error instanceof StoreError) {
1036
- throw error;
1037
- }
1038
- throw new ConnectionError("Failed to initialize database connection", error);
1039
- }
1040
- }
1041
- /**
1042
- * Gracefully closes database connections
1043
- */
1044
- async shutdown() {
1045
- this.db.close();
1046
- }
1047
- /**
1048
- * Resolves a library name and version string to library_id and version_id.
1049
- * Creates library and version records if they don't exist.
1050
- */
1051
- async resolveLibraryAndVersionIds(library, version) {
1052
- const normalizedLibrary = library.toLowerCase();
1053
- const normalizedVersion = denormalizeVersionName(version.toLowerCase());
1054
- this.statements.insertLibrary.run(normalizedLibrary);
1055
- const libraryIdRow = this.statements.getLibraryIdByName.get(normalizedLibrary);
1056
- if (!libraryIdRow || typeof libraryIdRow.id !== "number") {
1057
- throw new StoreError(`Failed to resolve library_id for library: ${library}`);
1058
- }
1059
- const libraryId = libraryIdRow.id;
1060
- this.statements.insertVersion.run(libraryId, normalizedVersion);
1061
- const versionIdRow = this.statements.resolveVersionId.get(
1062
- libraryId,
1063
- normalizedVersion === null ? "" : normalizedVersion
1064
- );
1065
- if (!versionIdRow || typeof versionIdRow.id !== "number") {
1066
- throw new StoreError(
1067
- `Failed to resolve version_id for library: ${library}, version: ${version}`
1068
- );
1069
- }
1070
- return { libraryId, versionId: versionIdRow.id };
1071
- }
1072
- /**
1073
- * Retrieves all unique versions for a specific library
1074
- */
1075
- async queryUniqueVersions(library) {
1076
- try {
1077
- const rows = this.statements.queryVersions.all(library.toLowerCase());
1078
- return rows.map((row) => normalizeVersionName(row.name));
1079
- } catch (error) {
1080
- throw new ConnectionError("Failed to query versions", error);
1081
- }
1082
- }
1083
- /**
1084
- * Updates the status of a version record in the database.
1085
- * @param versionId The version ID to update
1086
- * @param status The new status to set
1087
- * @param errorMessage Optional error message for failed statuses
1088
- */
1089
- async updateVersionStatus(versionId, status, errorMessage) {
1090
- try {
1091
- this.statements.updateVersionStatus.run(status, errorMessage ?? null, versionId);
1092
- } catch (error) {
1093
- throw new StoreError(`Failed to update version status: ${error}`);
1094
- }
1095
- }
1096
- /**
1097
- * Updates the progress counters for a version being indexed.
1098
- * @param versionId The version ID to update
1099
- * @param pages Current number of pages processed
1100
- * @param maxPages Total number of pages to process
1101
- */
1102
- async updateVersionProgress(versionId, pages, maxPages) {
1103
- try {
1104
- this.statements.updateVersionProgress.run(pages, maxPages, versionId);
1105
- } catch (error) {
1106
- throw new StoreError(`Failed to update version progress: ${error}`);
1107
- }
1108
- }
1109
- /**
1110
- * Retrieves versions by their status.
1111
- * @param statuses Array of statuses to filter by
1112
- * @returns Array of version records matching the statuses
1113
- */
1114
- async getVersionsByStatus(statuses) {
1115
- try {
1116
- const statusJson = JSON.stringify(statuses);
1117
- const rows = this.statements.getVersionsByStatus.all(
1118
- statusJson
1119
- );
1120
- return rows;
1121
- } catch (error) {
1122
- throw new StoreError(`Failed to get versions by status: ${error}`);
1123
- }
1124
- }
1125
- /**
1126
- * Stores scraper options for a version to enable reproducible indexing.
1127
- * @param versionId The version ID to update
1128
- * @param options Complete scraper options used for indexing
1129
- */
1130
- async storeScraperOptions(versionId, options) {
1131
- try {
1132
- const { url: source_url, library, version, signal, ...scraper_options } = options;
1133
- const optionsJson = JSON.stringify(scraper_options);
1134
- this.statements.updateVersionScraperOptions.run(source_url, optionsJson, versionId);
1135
- } catch (error) {
1136
- throw new StoreError(`Failed to store scraper options: ${error}`);
1137
- }
1138
- }
1139
- /**
1140
- * Retrieves stored scraping configuration (source URL and options) for a version.
1141
- * Returns null when no source URL is recorded (not re-indexable).
1142
- */
1143
- async getScraperOptions(versionId) {
1144
- try {
1145
- const row = this.statements.getVersionWithOptions.get(versionId);
1146
- if (!row?.source_url) {
1147
- return null;
1148
- }
1149
- let parsed = {};
1150
- if (row.scraper_options) {
1151
- try {
1152
- parsed = JSON.parse(row.scraper_options);
1153
- } catch (e) {
1154
- logger.warn(`⚠️ Invalid scraper_options JSON for version ${versionId}: ${e}`);
1155
- parsed = {};
1156
- }
1157
- }
1158
- return { sourceUrl: row.source_url, options: parsed };
1159
- } catch (error) {
1160
- throw new StoreError(`Failed to get scraper options: ${error}`);
1161
- }
1162
- }
1163
- /**
1164
- * Finds versions that were indexed from the same source URL.
1165
- * Useful for finding similar configurations or detecting duplicates.
1166
- * @param url Source URL to search for
1167
- * @returns Array of versions with the same source URL
1168
- */
1169
- async findVersionsBySourceUrl(url) {
1170
- try {
1171
- const rows = this.statements.getVersionsBySourceUrl.all(
1172
- url
1173
- );
1174
- return rows;
1175
- } catch (error) {
1176
- throw new StoreError(`Failed to find versions by source URL: ${error}`);
1177
- }
1178
- }
1179
- /**
1180
- * Verifies existence of documents for a specific library version
1181
- */
1182
- async checkDocumentExists(library, version) {
1183
- try {
1184
- const normalizedVersion = version.toLowerCase();
1185
- const result = this.statements.checkExists.get(
1186
- library.toLowerCase(),
1187
- normalizedVersion
1188
- );
1189
- return result !== void 0;
1190
- } catch (error) {
1191
- throw new ConnectionError("Failed to check document existence", error);
1192
- }
1193
- }
1194
- /**
1195
- * Retrieves a mapping of all libraries to their available versions with details.
1196
- */
1197
- async queryLibraryVersions() {
1198
- try {
1199
- const rows = this.statements.queryLibraryVersions.all();
1200
- const libraryMap = /* @__PURE__ */ new Map();
1201
- for (const row of rows) {
1202
- const library = row.library;
1203
- if (!libraryMap.has(library)) {
1204
- libraryMap.set(library, []);
1205
- }
1206
- const indexedAtISO = row.indexedAt ? new Date(row.indexedAt).toISOString() : null;
1207
- libraryMap.get(library)?.push({
1208
- version: row.version,
1209
- versionId: row.versionId,
1210
- // Preserve raw string status here; DocumentManagementService will cast to VersionStatus
1211
- status: row.status,
1212
- progressPages: row.progressPages,
1213
- progressMaxPages: row.progressMaxPages,
1214
- sourceUrl: row.sourceUrl,
1215
- documentCount: row.documentCount,
1216
- uniqueUrlCount: row.uniqueUrlCount,
1217
- indexedAt: indexedAtISO
1218
- });
1219
- }
1220
- for (const versions of libraryMap.values()) {
1221
- versions.sort((a, b) => {
1222
- if (a.version === "" && b.version !== "") {
1223
- return -1;
1224
- }
1225
- if (a.version !== "" && b.version === "") {
1226
- return 1;
1227
- }
1228
- if (a.version === "" && b.version === "") {
1229
- return 0;
1230
- }
1231
- try {
1232
- return semver__default.compare(a.version, b.version);
1233
- } catch (_error) {
1234
- return a.version.localeCompare(b.version);
1235
- }
1236
- });
1237
- }
1238
- return libraryMap;
1239
- } catch (error) {
1240
- throw new ConnectionError("Failed to query library versions", error);
1241
- }
1242
- }
1243
- /**
1244
- * Stores documents with library and version metadata, generating embeddings
1245
- * for vector similarity search. Automatically removes any existing documents
1246
- * for the same URLs before adding new ones to prevent UNIQUE constraint violations.
1247
- */
1248
- async addDocuments(library, version, documents) {
1249
- try {
1250
- if (documents.length === 0) {
1251
- return;
1252
- }
1253
- const urls = /* @__PURE__ */ new Set();
1254
- for (const doc of documents) {
1255
- const url = doc.metadata.url;
1256
- if (!url || typeof url !== "string" || !url.trim()) {
1257
- throw new StoreError("Document metadata must include a valid URL");
1258
- }
1259
- urls.add(url);
1260
- }
1261
- const texts = documents.map((doc) => {
1262
- const header = `<title>${doc.metadata.title}</title>
1263
- <url>${doc.metadata.url}</url>
1264
- <path>${doc.metadata.path.join(" / ")}</path>
1265
- `;
1266
- return `${header}${doc.pageContent}`;
1267
- });
1268
- const maxBatchChars = Number(process.env.DOCS_MCP_EMBEDDING_BATCH_CHARS) || EMBEDDING_BATCH_CHARS;
1269
- const rawEmbeddings = [];
1270
- let currentBatch = [];
1271
- let currentBatchSize = 0;
1272
- let batchCount = 0;
1273
- for (const text of texts) {
1274
- const textSize = text.length;
1275
- if (currentBatchSize + textSize > maxBatchChars && currentBatch.length > 0) {
1276
- batchCount++;
1277
- logger.debug(
1278
- `🔄 Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
1279
- );
1280
- const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
1281
- rawEmbeddings.push(...batchEmbeddings);
1282
- currentBatch = [];
1283
- currentBatchSize = 0;
1284
- }
1285
- currentBatch.push(text);
1286
- currentBatchSize += textSize;
1287
- if (currentBatch.length >= EMBEDDING_BATCH_SIZE) {
1288
- batchCount++;
1289
- logger.debug(
1290
- `🔄 Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
1291
- );
1292
- const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
1293
- rawEmbeddings.push(...batchEmbeddings);
1294
- currentBatch = [];
1295
- currentBatchSize = 0;
1296
- }
1297
- }
1298
- if (currentBatch.length > 0) {
1299
- batchCount++;
1300
- logger.debug(
1301
- `🔄 Processing final embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
1302
- );
1303
- const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
1304
- rawEmbeddings.push(...batchEmbeddings);
1305
- }
1306
- const paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
1307
- const { libraryId, versionId } = await this.resolveLibraryAndVersionIds(
1308
- library,
1309
- version
1310
- );
1311
- for (const url of urls) {
1312
- const deletedCount = await this.deleteDocumentsByUrl(library, version, url);
1313
- if (deletedCount > 0) {
1314
- logger.debug(`Deleted ${deletedCount} existing documents for URL: ${url}`);
1315
- }
1316
- }
1317
- const transaction = this.db.transaction((docs) => {
1318
- for (let i = 0; i < docs.length; i++) {
1319
- const doc = docs[i];
1320
- const url = doc.metadata.url;
1321
- const result = this.statements.insertDocument.run(
1322
- BigInt(libraryId),
1323
- BigInt(versionId),
1324
- url,
1325
- doc.pageContent,
1326
- JSON.stringify(doc.metadata),
1327
- i,
1328
- (/* @__PURE__ */ new Date()).toISOString()
1329
- // Pass current timestamp for indexed_at
1330
- );
1331
- const rowId = result.lastInsertRowid;
1332
- this.statements.insertEmbedding.run(
1333
- BigInt(rowId),
1334
- BigInt(libraryId),
1335
- BigInt(versionId),
1336
- JSON.stringify(paddedEmbeddings[i])
1337
- );
1338
- }
1339
- });
1340
- transaction(documents);
1341
- } catch (error) {
1342
- throw new ConnectionError("Failed to add documents to store", error);
1343
- }
1344
- }
1345
- /**
1346
- * Removes documents matching specified library and version
1347
- * @returns Number of documents deleted
1348
- */
1349
- async deleteDocuments(library, version) {
1350
- try {
1351
- const normalizedVersion = version.toLowerCase();
1352
- const result = this.statements.deleteDocuments.run(
1353
- library.toLowerCase(),
1354
- library.toLowerCase(),
1355
- // library name appears twice in the query
1356
- normalizedVersion
1357
- );
1358
- return result.changes;
1359
- } catch (error) {
1360
- throw new ConnectionError("Failed to delete documents", error);
1361
- }
1362
- }
1363
- /**
1364
- * Removes documents for a specific URL within a library and version
1365
- * @returns Number of documents deleted
1366
- */
1367
- async deleteDocumentsByUrl(library, version, url) {
1368
- try {
1369
- const normalizedVersion = version.toLowerCase();
1370
- const result = this.statements.deleteDocumentsByUrl.run(
1371
- url,
1372
- library.toLowerCase(),
1373
- library.toLowerCase(),
1374
- // library name appears twice in the query
1375
- normalizedVersion
1376
- );
1377
- return result.changes;
1378
- } catch (error) {
1379
- throw new ConnectionError("Failed to delete documents by URL", error);
1380
- }
1381
- }
1382
- /**
1383
- * Completely removes a library version and all associated documents.
1384
- * Optionally removes the library if no other versions remain.
1385
- * @param library Library name
1386
- * @param version Version string (empty string for unversioned)
1387
- * @param removeLibraryIfEmpty Whether to remove the library if no versions remain
1388
- * @returns Object with counts of deleted documents, version deletion status, and library deletion status
1389
- */
1390
- async removeVersion(library, version, removeLibraryIfEmpty = true) {
1391
- try {
1392
- const normalizedLibrary = library.toLowerCase();
1393
- const normalizedVersion = version.toLowerCase();
1394
- const versionResult = this.statements.getVersionId.get(
1395
- normalizedLibrary,
1396
- normalizedVersion
1397
- );
1398
- if (!versionResult) {
1399
- return { documentsDeleted: 0, versionDeleted: false, libraryDeleted: false };
1400
- }
1401
- const { id: versionId, library_id: libraryId } = versionResult;
1402
- const documentsDeleted = await this.deleteDocuments(library, version);
1403
- const versionDeleteResult = this.statements.deleteVersionById.run(versionId);
1404
- const versionDeleted = versionDeleteResult.changes > 0;
1405
- let libraryDeleted = false;
1406
- if (removeLibraryIfEmpty && versionDeleted) {
1407
- const countResult = this.statements.countVersionsByLibraryId.get(libraryId);
1408
- const remainingVersions = countResult?.count ?? 0;
1409
- if (remainingVersions === 0) {
1410
- const libraryDeleteResult = this.statements.deleteLibraryById.run(libraryId);
1411
- libraryDeleted = libraryDeleteResult.changes > 0;
1412
- }
1413
- }
1414
- return { documentsDeleted, versionDeleted, libraryDeleted };
1415
- } catch (error) {
1416
- throw new ConnectionError("Failed to remove version", error);
1417
- }
1418
- }
1419
- /**
1420
- * Retrieves a document by its ID.
1421
- * @param id The ID of the document.
1422
- * @returns The document, or null if not found.
1423
- */
1424
- async getById(id) {
1425
- try {
1426
- const row = this.statements.getById.get(BigInt(id));
1427
- if (!row) {
1428
- return null;
1429
- }
1430
- return mapDbDocumentToDocument(row);
1431
- } catch (error) {
1432
- throw new ConnectionError(`Failed to get document by ID ${id}`, error);
1433
- }
1434
- }
1435
- /**
1436
- * Finds documents matching a text query using hybrid search.
1437
- * Combines vector similarity search with full-text search using Reciprocal Rank Fusion.
1438
- */
1439
- async findByContent(library, version, query, limit) {
1440
- try {
1441
- const rawEmbedding = await this.embeddings.embedQuery(query);
1442
- const embedding = this.padVector(rawEmbedding);
1443
- const ftsQuery = this.escapeFtsQuery(query);
1444
- const normalizedVersion = version.toLowerCase();
1445
- const stmt = this.db.prepare(`
1446
- WITH vec_distances AS (
1447
- SELECT
1448
- dv.rowid as id,
1449
- dv.distance as vec_distance
1450
- FROM documents_vec dv
1451
- JOIN versions v ON dv.version_id = v.id
1452
- JOIN libraries l ON v.library_id = l.id
1453
- WHERE l.name = ?
1454
- AND COALESCE(v.name, '') = COALESCE(?, '')
1455
- AND dv.embedding MATCH ?
1456
- AND dv.k = ?
1457
- ORDER BY dv.distance
1458
- ),
1459
- fts_scores AS (
1460
- SELECT
1461
- f.rowid as id,
1462
- bm25(documents_fts, 10.0, 1.0, 5.0, 1.0) as fts_score
1463
- FROM documents_fts f
1464
- JOIN documents d ON f.rowid = d.id
1465
- JOIN versions v ON d.version_id = v.id
1466
- JOIN libraries l ON v.library_id = l.id
1467
- WHERE l.name = ?
1468
- AND COALESCE(v.name, '') = COALESCE(?, '')
1469
- AND documents_fts MATCH ?
1470
- ORDER BY fts_score
1471
- LIMIT ?
1472
- )
1473
- SELECT
1474
- d.id,
1475
- d.content,
1476
- d.metadata,
1477
- COALESCE(1 / (1 + v.vec_distance), 0) as vec_score,
1478
- COALESCE(-MIN(f.fts_score, 0), 0) as fts_score
1479
- FROM documents d
1480
- LEFT JOIN vec_distances v ON d.id = v.id
1481
- LEFT JOIN fts_scores f ON d.id = f.id
1482
- WHERE v.id IS NOT NULL OR f.id IS NOT NULL
1483
- `);
1484
- const rawResults = stmt.all(
1485
- library.toLowerCase(),
1486
- normalizedVersion,
1487
- JSON.stringify(embedding),
1488
- limit,
1489
- library.toLowerCase(),
1490
- normalizedVersion,
1491
- ftsQuery,
1492
- // Use the escaped query
1493
- limit
1494
- );
1495
- const rankedResults = this.assignRanks(rawResults);
1496
- const topResults = rankedResults.sort((a, b) => b.rrf_score - a.rrf_score).slice(0, limit);
1497
- return topResults.map((row) => ({
1498
- ...mapDbDocumentToDocument(row),
1499
- metadata: {
1500
- ...JSON.parse(row.metadata),
1501
- id: row.id,
1502
- score: row.rrf_score,
1503
- vec_rank: row.vec_rank,
1504
- fts_rank: row.fts_rank
1505
- }
1506
- }));
1507
- } catch (error) {
1508
- throw new ConnectionError(
1509
- `Failed to find documents by content with query "${query}"`,
1510
- error
1511
- );
1512
- }
1513
- }
1514
- /**
1515
- * Finds child chunks of a given document based on path hierarchy.
1516
- */
1517
- async findChildChunks(library, version, id, limit) {
1518
- try {
1519
- const parent = await this.getById(id);
1520
- if (!parent) {
1521
- return [];
1522
- }
1523
- const parentPath = parent.metadata.path ?? [];
1524
- const parentUrl = parent.metadata.url;
1525
- const normalizedVersion = version.toLowerCase();
1526
- const result = this.statements.getChildChunks.all(
1527
- library.toLowerCase(),
1528
- normalizedVersion,
1529
- parentUrl,
1530
- parentPath.length + 1,
1531
- JSON.stringify(parentPath),
1532
- BigInt(id),
1533
- limit
1534
- );
1535
- return result.map((row) => mapDbDocumentToDocument(row));
1536
- } catch (error) {
1537
- throw new ConnectionError(`Failed to find child chunks for ID ${id}`, error);
1538
- }
1539
- }
1540
- /**
1541
- * Finds preceding sibling chunks of a given document.
1542
- */
1543
- async findPrecedingSiblingChunks(library, version, id, limit) {
1544
- try {
1545
- const reference = await this.getById(id);
1546
- if (!reference) {
1547
- return [];
1548
- }
1549
- const refMetadata = reference.metadata;
1550
- const normalizedVersion = version.toLowerCase();
1551
- const result = this.statements.getPrecedingSiblings.all(
1552
- library.toLowerCase(),
1553
- normalizedVersion,
1554
- refMetadata.url,
1555
- BigInt(id),
1556
- JSON.stringify(refMetadata.path),
1557
- limit
1558
- );
1559
- return result.reverse().map((row) => mapDbDocumentToDocument(row));
1560
- } catch (error) {
1561
- throw new ConnectionError(
1562
- `Failed to find preceding sibling chunks for ID ${id}`,
1563
- error
1564
- );
1565
- }
1566
- }
1567
- /**
1568
- * Finds subsequent sibling chunks of a given document.
1569
- */
1570
- async findSubsequentSiblingChunks(library, version, id, limit) {
1571
- try {
1572
- const reference = await this.getById(id);
1573
- if (!reference) {
1574
- return [];
1575
- }
1576
- const refMetadata = reference.metadata;
1577
- const normalizedVersion = version.toLowerCase();
1578
- const result = this.statements.getSubsequentSiblings.all(
1579
- library.toLowerCase(),
1580
- normalizedVersion,
1581
- refMetadata.url,
1582
- BigInt(id),
1583
- JSON.stringify(refMetadata.path),
1584
- limit
1585
- );
1586
- return result.map((row) => mapDbDocumentToDocument(row));
1587
- } catch (error) {
1588
- throw new ConnectionError(
1589
- `Failed to find subsequent sibling chunks for ID ${id}`,
1590
- error
1591
- );
1592
- }
1593
- }
1594
- /**
1595
- * Finds the parent chunk of a given document.
1596
- */
1597
- async findParentChunk(library, version, id) {
1598
- try {
1599
- const child = await this.getById(id);
1600
- if (!child) {
1601
- return null;
1602
- }
1603
- const childMetadata = child.metadata;
1604
- const path2 = childMetadata.path ?? [];
1605
- const parentPath = path2.slice(0, -1);
1606
- if (parentPath.length === 0) {
1607
- return null;
1608
- }
1609
- const normalizedVersion = version.toLowerCase();
1610
- const result = this.statements.getParentChunk.get(
1611
- library.toLowerCase(),
1612
- normalizedVersion,
1613
- childMetadata.url,
1614
- JSON.stringify(parentPath),
1615
- BigInt(id)
1616
- );
1617
- if (!result) {
1618
- return null;
1619
- }
1620
- return mapDbDocumentToDocument(result);
1621
- } catch (error) {
1622
- throw new ConnectionError(`Failed to find parent chunk for ID ${id}`, error);
1623
- }
1624
- }
1625
- /**
1626
- * Fetches multiple documents by their IDs in a single call.
1627
- * Returns an array of Document objects, sorted by their sort_order.
1628
- */
1629
- async findChunksByIds(library, version, ids) {
1630
- if (!ids.length) return [];
1631
- try {
1632
- const normalizedVersion = version.toLowerCase();
1633
- const placeholders = ids.map(() => "?").join(",");
1634
- const stmt = this.db.prepare(
1635
- `SELECT d.* FROM documents d
1636
- JOIN libraries l ON d.library_id = l.id
1637
- JOIN versions v ON d.version_id = v.id
1638
- WHERE l.name = ?
1639
- AND COALESCE(v.name, '') = COALESCE(?, '')
1640
- AND d.id IN (${placeholders})
1641
- ORDER BY d.sort_order`
1642
- );
1643
- const rows = stmt.all(
1644
- library.toLowerCase(),
1645
- normalizedVersion,
1646
- ...ids
1647
- );
1648
- return rows.map((row) => mapDbDocumentToDocument(row));
1649
- } catch (error) {
1650
- throw new ConnectionError("Failed to fetch documents by IDs", error);
1651
- }
1652
- }
1653
- }
1654
- class DocumentManagementService {
1655
- store;
1656
- documentRetriever;
1657
- splitter;
1658
- /**
1659
- * Normalizes a version string, converting null or undefined to an empty string
1660
- * and converting to lowercase.
1661
- */
1662
- normalizeVersion(version) {
1663
- return (version ?? "").toLowerCase();
1664
- }
1665
- constructor() {
1666
- let dbPath;
1667
- let dbDir;
1668
- const envStorePath = process.env.DOCS_MCP_STORE_PATH;
1669
- if (envStorePath) {
1670
- dbDir = envStorePath;
1671
- dbPath = path.join(dbDir, "documents.db");
1672
- logger.debug(`Using database directory from DOCS_MCP_STORE_PATH: ${dbDir}`);
1673
- } else {
1674
- const projectRoot = getProjectRoot();
1675
- const oldDbDir = path.join(projectRoot, ".store");
1676
- const oldDbPath = path.join(oldDbDir, "documents.db");
1677
- const oldDbExists = fs.existsSync(oldDbPath);
1678
- if (oldDbExists) {
1679
- dbPath = oldDbPath;
1680
- dbDir = oldDbDir;
1681
- logger.debug(`Using legacy database path: ${dbPath}`);
1682
- } else {
1683
- const standardPaths = envPaths("docs-mcp-server", { suffix: "" });
1684
- dbDir = standardPaths.data;
1685
- dbPath = path.join(dbDir, "documents.db");
1686
- logger.debug(`Using standard database directory: ${dbDir}`);
1687
- }
1688
- }
1689
- try {
1690
- fs.mkdirSync(dbDir, { recursive: true });
1691
- } catch (error) {
1692
- logger.error(`⚠️ Failed to create database directory ${dbDir}: ${error}`);
1693
- }
1694
- this.store = new DocumentStore(dbPath);
1695
- this.documentRetriever = new DocumentRetrieverService(this.store);
1696
- const semanticSplitter = new SemanticMarkdownSplitter(
1697
- SPLITTER_PREFERRED_CHUNK_SIZE,
1698
- SPLITTER_MAX_CHUNK_SIZE
1699
- );
1700
- const greedySplitter = new GreedySplitter(
1701
- semanticSplitter,
1702
- SPLITTER_MIN_CHUNK_SIZE,
1703
- SPLITTER_PREFERRED_CHUNK_SIZE
1704
- );
1705
- this.splitter = greedySplitter;
1706
- }
1707
- /**
1708
- * Initializes the underlying document store.
1709
- */
1710
- async initialize() {
1711
- await this.store.initialize();
1712
- }
1713
- /**
1714
- * Shuts down the underlying document store.
1715
- */
1716
- async shutdown() {
1717
- logger.debug("Shutting down store manager");
1718
- await this.store.shutdown();
1719
- }
1720
- // Status tracking methods for pipeline integration
1721
- /**
1722
- * Gets versions by their current status.
1723
- */
1724
- async getVersionsByStatus(statuses) {
1725
- return this.store.getVersionsByStatus(statuses);
1726
- }
1727
- /**
1728
- * Updates the status of a version.
1729
- */
1730
- async updateVersionStatus(versionId, status, errorMessage) {
1731
- return this.store.updateVersionStatus(versionId, status, errorMessage);
1732
- }
1733
- /**
1734
- * Updates the progress of a version being indexed.
1735
- */
1736
- async updateVersionProgress(versionId, pages, maxPages) {
1737
- return this.store.updateVersionProgress(versionId, pages, maxPages);
1738
- }
1739
- /**
1740
- * Stores scraper options for a version to enable reproducible indexing.
1741
- */
1742
- async storeScraperOptions(versionId, options) {
1743
- return this.store.storeScraperOptions(versionId, options);
1744
- }
1745
- /**
1746
- * Retrieves stored scraper options for a version.
1747
- */
1748
- /**
1749
- * Retrieves stored scraping configuration for a version.
1750
- */
1751
- async getScraperOptions(versionId) {
1752
- return this.store.getScraperOptions(versionId);
1753
- }
1754
- /**
1755
- * Ensures a library/version exists using a VersionRef and returns version ID.
1756
- * Delegates to existing ensureLibraryAndVersion for storage.
1757
- */
1758
- async ensureVersion(ref) {
1759
- const normalized = {
1760
- library: ref.library.trim().toLowerCase(),
1761
- version: (ref.version ?? "").trim().toLowerCase()
1762
- };
1763
- return this.ensureLibraryAndVersion(normalized.library, normalized.version);
1764
- }
1765
- /**
1766
- * Returns enriched library summaries including version status/progress and counts.
1767
- * Uses existing store APIs; keeps DB details encapsulated.
1768
- */
1769
- async listLibraries() {
1770
- const libMap = await this.store.queryLibraryVersions();
1771
- const summaries = [];
1772
- for (const [library, versions] of libMap) {
1773
- const vs = versions.map(
1774
- (v) => ({
1775
- id: v.versionId,
1776
- ref: { library, version: v.version },
1777
- status: v.status,
1778
- // Include progress only while indexing is active; set undefined for COMPLETED
1779
- progress: v.status === "completed" ? void 0 : { pages: v.progressPages, maxPages: v.progressMaxPages },
1780
- counts: { documents: v.documentCount, uniqueUrls: v.uniqueUrlCount },
1781
- indexedAt: v.indexedAt,
1782
- sourceUrl: v.sourceUrl ?? void 0
1783
- })
1784
- );
1785
- summaries.push({ library, versions: vs });
1786
- }
1787
- return summaries;
1788
- }
1789
- /**
1790
- * Finds versions that were indexed from the same source URL.
1791
- */
1792
- async findVersionsBySourceUrl(url) {
1793
- return this.store.findVersionsBySourceUrl(url);
1794
- }
1795
- /**
1796
- * Validates if a library exists in the store (either versioned or unversioned).
1797
- * Throws LibraryNotFoundError with suggestions if the library is not found.
1798
- * @param library The name of the library to validate.
1799
- * @throws {LibraryNotFoundError} If the library does not exist.
1800
- */
1801
- async validateLibraryExists(library) {
1802
- logger.info(`🔎 Validating existence of library: ${library}`);
1803
- const normalizedLibrary = library.toLowerCase();
1804
- const versions = await this.listVersions(normalizedLibrary);
1805
- const hasUnversioned = await this.exists(normalizedLibrary, "");
1806
- if (versions.length === 0 && !hasUnversioned) {
1807
- logger.warn(`⚠️ Library '${library}' not found.`);
1808
- const allLibraries = await this.listLibraries();
1809
- const libraryNames = allLibraries.map((lib) => lib.library);
1810
- let suggestions = [];
1811
- if (libraryNames.length > 0) {
1812
- const fuse = new Fuse(libraryNames, {
1813
- // Configure fuse.js options if needed (e.g., threshold)
1814
- // isCaseSensitive: false, // Handled by normalizing library names
1815
- // includeScore: true,
1816
- threshold: 0.4
1817
- // Adjust threshold for desired fuzziness (0=exact, 1=match anything)
1818
- });
1819
- const results = fuse.search(normalizedLibrary);
1820
- suggestions = results.slice(0, 3).map((result) => result.item);
1821
- logger.info(`🔍 Found suggestions: ${suggestions.join(", ")}`);
1822
- }
1823
- throw new LibraryNotFoundError(library, suggestions);
1824
- }
1825
- logger.info(`✅ Library '${library}' confirmed to exist.`);
1826
- }
1827
- /**
1828
- * Returns a list of all available semantic versions for a library.
1829
- */
1830
- async listVersions(library) {
1831
- const versions = await this.store.queryUniqueVersions(library);
1832
- return versions.filter((v) => semver__default.valid(v));
1833
- }
1834
- /**
1835
- * Checks if documents exist for a given library and optional version.
1836
- * If version is omitted, checks for documents without a specific version.
1837
- */
1838
- async exists(library, version) {
1839
- const normalizedVersion = this.normalizeVersion(version);
1840
- return this.store.checkDocumentExists(library, normalizedVersion);
1841
- }
1842
/**
 * Finds the most appropriate version of documentation based on the requested version.
 * When no target version is specified, returns the latest version.
 *
 * Version matching behavior:
 * - Exact versions (e.g., "18.0.0"): Matches that version or any earlier version
 * - X-Range patterns (e.g., "5.x", "5.2.x"): Matches within the specified range
 * - "latest" or no version: Returns the latest available version
 *
 * For documentation, we prefer matching older versions over no match at all,
 * since older docs are often still relevant and useful.
 * Also checks if unversioned documents exist for the library.
 *
 * @param {string} library - Library name.
 * @param {string} [targetVersion] - Requested version, X-range, or "latest".
 * @returns {Promise<{bestMatch: string|null, hasUnversioned: boolean}>}
 * @throws {VersionNotFoundError} When neither a semver match nor unversioned
 *   documents are available for the library.
 */
async findBestVersion(library, targetVersion) {
  // Label used only for log messages, e.g. "react@18.0.0" or just "react".
  const libraryAndVersion = `${library}${targetVersion ? `@${targetVersion}` : ""}`;
  logger.info(`🔍 Finding best version for ${libraryAndVersion}`);
  const hasUnversioned = await this.store.checkDocumentExists(library, "");
  const versionStrings = await this.listVersions(library);
  if (versionStrings.length === 0) {
    // No semver versions at all: unversioned docs are the only possible answer.
    if (hasUnversioned) {
      logger.info(`ℹ️ Unversioned documents exist for ${library}`);
      return { bestMatch: null, hasUnversioned: true };
    }
    logger.warn(`⚠️ No valid versions found for ${library}`);
    const allLibraryDetails = await this.store.queryLibraryVersions();
    const libraryDetails = allLibraryDetails.get(library) ?? [];
    throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
  }
  let bestMatch = null;
  if (!targetVersion || targetVersion === "latest") {
    // "*" satisfies every version, so maxSatisfying picks the newest.
    bestMatch = semver__default.maxSatisfying(versionStrings, "*");
  } else {
    // Accept "N", "N.x", "N.x.x", "N.M", "N.M.x", "N.M.P", or empty string.
    const versionRegex = /^(\d+)(?:\.(?:x(?:\.x)?|\d+(?:\.(?:x|\d+))?))?$|^$/;
    if (!versionRegex.test(targetVersion)) {
      // Malformed input: fall through with bestMatch = null rather than throw
      // here; the unversioned fallback below may still apply.
      logger.warn(`⚠️ Invalid target version format: ${targetVersion}`);
    } else {
      let range = targetVersion;
      if (!semver__default.validRange(targetVersion)) {
        // Partial versions like "1.2" are not valid ranges; "~" widens them.
        range = `~${targetVersion}`;
      } else if (semver__default.valid(targetVersion)) {
        // Exact version requested: also accept any earlier version, since
        // older docs are preferred over no match at all.
        range = `${range} || <=${targetVersion}`;
      }
      bestMatch = semver__default.maxSatisfying(versionStrings, range);
    }
  }
  if (bestMatch) {
    logger.info(`✅ Found best match version ${bestMatch} for ${libraryAndVersion}`);
  } else {
    logger.warn(`⚠️ No matching semver version found for ${libraryAndVersion}`);
  }
  // Only a hard failure when there is neither a semver match nor an
  // unversioned fallback.
  if (!bestMatch && !hasUnversioned) {
    const allLibraryDetails = await this.store.queryLibraryVersions();
    const libraryDetails = allLibraryDetails.get(library) ?? [];
    throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
  }
  return { bestMatch, hasUnversioned };
}
1899
- /**
1900
- * Removes all documents for a specific library and optional version.
1901
- * If version is omitted, removes documents without a specific version.
1902
- */
1903
- async removeAllDocuments(library, version) {
1904
- const normalizedVersion = this.normalizeVersion(version);
1905
- logger.info(
1906
- `🗑️ Removing all documents from ${library}@${normalizedVersion || "[no version]"} store`
1907
- );
1908
- const count = await this.store.deleteDocuments(library, normalizedVersion);
1909
- logger.info(`🗑️ Deleted ${count} documents`);
1910
- }
1911
- /**
1912
- * Completely removes a library version and all associated documents.
1913
- * Also removes the library if no other versions remain.
1914
- * @param library Library name
1915
- * @param version Version string (null/undefined for unversioned)
1916
- */
1917
- async removeVersion(library, version) {
1918
- const normalizedVersion = this.normalizeVersion(version);
1919
- logger.info(`🗑️ Removing version: ${library}@${normalizedVersion || "[no version]"}`);
1920
- const result = await this.store.removeVersion(library, normalizedVersion, true);
1921
- logger.info(
1922
- `🗑️ Removed ${result.documentsDeleted} documents, version: ${result.versionDeleted}, library: ${result.libraryDeleted}`
1923
- );
1924
- if (result.versionDeleted && result.libraryDeleted) {
1925
- logger.info(`✅ Completely removed library ${library} (was last version)`);
1926
- } else if (result.versionDeleted) {
1927
- logger.info(`✅ Removed version ${library}@${normalizedVersion || "[no version]"}`);
1928
- } else {
1929
- logger.warn(
1930
- `⚠️ Version ${library}@${normalizedVersion || "[no version]"} not found`
1931
- );
1932
- }
1933
- }
1934
- /**
1935
- * Adds a document to the store, splitting it into smaller chunks for better search results.
1936
- * Uses SemanticMarkdownSplitter to maintain markdown structure and content types during splitting.
1937
- * Preserves hierarchical structure of documents and distinguishes between text and code segments.
1938
- * If version is omitted, the document is added without a specific version.
1939
- */
1940
- async addDocument(library, version, document) {
1941
- const processingStart = performance.now();
1942
- const normalizedVersion = this.normalizeVersion(version);
1943
- const url = document.metadata.url;
1944
- if (!url || typeof url !== "string" || !url.trim()) {
1945
- throw new StoreError("Document metadata must include a valid URL");
1946
- }
1947
- logger.info(`📚 Adding document: ${document.metadata.title}`);
1948
- if (!document.pageContent.trim()) {
1949
- throw new Error("Document content cannot be empty");
1950
- }
1951
- try {
1952
- const chunks = await this.splitter.splitText(document.pageContent);
1953
- const splitDocs = chunks.map((chunk) => ({
1954
- pageContent: chunk.content,
1955
- metadata: {
1956
- ...document.metadata,
1957
- level: chunk.section.level,
1958
- path: chunk.section.path
1959
- }
1960
- }));
1961
- logger.info(`✂️ Split document into ${splitDocs.length} chunks`);
1962
- await this.store.addDocuments(library, normalizedVersion, splitDocs);
1963
- const processingTime = performance.now() - processingStart;
1964
- analytics.track(TelemetryEvent.DOCUMENT_PROCESSED, {
1965
- // Content characteristics (privacy-safe)
1966
- mimeType: document.metadata.mimeType,
1967
- contentSizeBytes: document.pageContent.length,
1968
- // Processing metrics
1969
- processingTimeMs: Math.round(processingTime),
1970
- chunksCreated: splitDocs.length,
1971
- // Document characteristics
1972
- hasTitle: !!document.metadata.title,
1973
- hasDescription: !!document.metadata.description,
1974
- urlDomain: extractHostname(url),
1975
- depth: document.metadata.depth,
1976
- // Library context
1977
- library,
1978
- libraryVersion: normalizedVersion || null,
1979
- // Processing efficiency
1980
- avgChunkSizeBytes: Math.round(document.pageContent.length / splitDocs.length),
1981
- processingSpeedKbPerSec: Math.round(
1982
- document.pageContent.length / 1024 / (processingTime / 1e3)
1983
- )
1984
- });
1985
- } catch (error) {
1986
- const processingTime = performance.now() - processingStart;
1987
- analytics.track(TelemetryEvent.DOCUMENT_PROCESSING_FAILED, {
1988
- mimeType: document.metadata.mimeType,
1989
- contentSizeBytes: document.pageContent.length,
1990
- processingTimeMs: Math.round(processingTime),
1991
- errorType: error instanceof Error ? error.constructor.name : "UnknownError",
1992
- errorMessage: error instanceof Error ? error.message : "Unknown error",
1993
- library,
1994
- libraryVersion: normalizedVersion || null
1995
- });
1996
- throw error;
1997
- }
1998
- }
1999
- /**
2000
- * Searches for documentation content across versions.
2001
- * Uses hybrid search (vector + FTS).
2002
- * If version is omitted, searches documents without a specific version.
2003
- */
2004
- async searchStore(library, version, query, limit = 5) {
2005
- const normalizedVersion = this.normalizeVersion(version);
2006
- return this.documentRetriever.search(library, normalizedVersion, query, limit);
2007
- }
2008
- // Deprecated simple listing removed: enriched listLibraries() is canonical
2009
- /**
2010
- * Ensures a library and version exist in the database and returns the version ID.
2011
- * Creates the library and version records if they don't exist.
2012
- */
2013
- async ensureLibraryAndVersion(library, version) {
2014
- const normalizedLibrary = library.toLowerCase();
2015
- const normalizedVersion = this.normalizeVersion(version);
2016
- const { versionId } = await this.store.resolveLibraryAndVersionIds(
2017
- normalizedLibrary,
2018
- normalizedVersion
2019
- );
2020
- return versionId;
2021
- }
2022
- }
2023
- export {
2024
- DocumentManagementService
2025
- };
2026
- //# sourceMappingURL=DocumentManagementService-C1xAzouZ.js.map