@arabold/docs-mcp-server 1.18.0 → 1.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -6
- package/db/migrations/007-dedupe-unversioned-versions.sql +62 -0
- package/db/migrations/008-case-insensitive-names.sql +10 -0
- package/dist/DocumentManagementClient-CAFdDwTu.js +57 -0
- package/dist/DocumentManagementClient-CAFdDwTu.js.map +1 -0
- package/dist/DocumentManagementService-BH02TJEe.js +1917 -0
- package/dist/DocumentManagementService-BH02TJEe.js.map +1 -0
- package/dist/index.js +908 -2561
- package/dist/index.js.map +1 -1
- package/package.json +3 -1
@@ -0,0 +1,1917 @@

````js
import fs from "node:fs";
import path from "node:path";
import envPaths from "env-paths";
import Fuse from "fuse.js";
import semver__default from "semver";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import remarkGfm from "remark-gfm";
import remarkHtml from "remark-html";
import remarkParse from "remark-parse";
import TurndownService from "turndown";
import { unified } from "unified";
import { l as logger, c as createJSDOM, V as VECTOR_DIMENSION, S as StoreError, D as DimensionError, a as applyMigrations, C as ConnectionError, d as denormalizeVersionName, n as normalizeVersionName, E as EMBEDDING_BATCH_CHARS, b as EMBEDDING_BATCH_SIZE, m as mapDbDocumentToDocument, g as getProjectRoot, e as SPLITTER_PREFERRED_CHUNK_SIZE, f as SPLITTER_MAX_CHUNK_SIZE, L as LibraryNotFoundError, h as VersionNotFoundError, i as SPLITTER_MIN_CHUNK_SIZE } from "./index.js";
import "cheerio";
import "node:vm";
import "jsdom";
import "playwright";
import "@joplin/turndown-plugin-gfm";
import "iconv-lite";
import Database from "better-sqlite3";
import * as sqliteVec from "sqlite-vec";
class SplitterError extends Error {
}
class MinimumChunkSizeError extends SplitterError {
  constructor(size, maxSize) {
    super(
      `Cannot split content any further. Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`
    );
  }
}
class ContentSplitterError extends SplitterError {
}
class GreedySplitter {
  baseSplitter;
  minChunkSize;
  preferredChunkSize;
  /**
   * Combines a base document splitter with size constraints to produce optimally-sized chunks.
   * The base splitter handles the initial semantic splitting, while this class handles
   * the concatenation strategy.
   */
  constructor(baseSplitter, minChunkSize, preferredChunkSize) {
    this.baseSplitter = baseSplitter;
    this.minChunkSize = minChunkSize;
    this.preferredChunkSize = preferredChunkSize;
  }
  /**
   * Uses a greedy concatenation strategy to build optimally-sized chunks. Small chunks
   * are combined until they reach the minimum size, but splits are preserved at major
   * section boundaries to maintain document structure. This balances the need for
   * context with semantic coherence.
   */
  async splitText(markdown) {
    const initialChunks = await this.baseSplitter.splitText(markdown);
    const concatenatedChunks = [];
    let currentChunk = null;
    for (const nextChunk of initialChunks) {
      if (currentChunk) {
        if (this.wouldExceedMaxSize(currentChunk, nextChunk)) {
          concatenatedChunks.push(currentChunk);
          currentChunk = this.cloneChunk(nextChunk);
          continue;
        }
        if (currentChunk.content.length >= this.minChunkSize && this.startsNewMajorSection(nextChunk)) {
          concatenatedChunks.push(currentChunk);
          currentChunk = this.cloneChunk(nextChunk);
          continue;
        }
        currentChunk.content += `
${nextChunk.content}`;
        currentChunk.section = this.mergeSectionInfo(currentChunk, nextChunk);
        currentChunk.types = this.mergeTypes(currentChunk.types, nextChunk.types);
      } else {
        currentChunk = this.cloneChunk(nextChunk);
      }
    }
    if (currentChunk) {
      concatenatedChunks.push(currentChunk);
    }
    return concatenatedChunks;
  }
  cloneChunk(chunk) {
    return {
      types: [...chunk.types],
      content: chunk.content,
      section: {
        level: chunk.section.level,
        path: [...chunk.section.path]
      }
    };
  }
  /**
   * H1 and H2 headings represent major conceptual breaks in the document.
   * Preserving these splits helps maintain the document's logical structure.
   */
  startsNewMajorSection(chunk) {
    return chunk.section.level === 1 || chunk.section.level === 2;
  }
  /**
   * Size limit check to ensure chunks remain within embedding model constraints.
   * Essential for maintaining consistent embedding quality and avoiding truncation.
   */
  wouldExceedMaxSize(currentChunk, nextChunk) {
    if (!currentChunk) {
      return false;
    }
    return currentChunk.content.length + nextChunk.content.length > this.preferredChunkSize;
  }
  /**
   * Checks if one path is a prefix of another path, indicating a parent-child relationship
   */
  isPathIncluded(parentPath, childPath) {
    if (parentPath.length >= childPath.length) return false;
    return parentPath.every((part, i) => part === childPath[i]);
  }
  /**
   * Merges section metadata when concatenating chunks, following these rules:
   * 1. Level: Always uses the lowest (most general) level between chunks
   * 2. Path selection:
   *    - For parent-child relationships (one path includes the other), uses the child's path
   *    - For siblings/unrelated sections, uses the common parent path
   *    - If no common path exists, uses the root path ([])
   */
  mergeSectionInfo(currentChunk, nextChunk) {
    const level = Math.min(currentChunk.section.level, nextChunk.section.level);
    if (currentChunk.section.level === nextChunk.section.level && currentChunk.section.path.length === nextChunk.section.path.length && currentChunk.section.path.every((p, i) => p === nextChunk.section.path[i])) {
      return currentChunk.section;
    }
    if (this.isPathIncluded(currentChunk.section.path, nextChunk.section.path)) {
      return {
        path: nextChunk.section.path,
        level
      };
    }
    if (this.isPathIncluded(nextChunk.section.path, currentChunk.section.path)) {
      return {
        path: currentChunk.section.path,
        level
      };
    }
    const commonPath = this.findCommonPrefix(
      currentChunk.section.path,
      nextChunk.section.path
    );
    return {
      path: commonPath,
      level
    };
  }
  mergeTypes(currentTypes, nextTypes) {
    return [.../* @__PURE__ */ new Set([...currentTypes, ...nextTypes])];
  }
  /**
   * Returns longest common prefix between two paths
   */
  findCommonPrefix(path1, path2) {
    const common = [];
    for (let i = 0; i < Math.min(path1.length, path2.length); i++) {
      if (path1[i] === path2[i]) {
        common.push(path1[i]);
      } else {
        break;
      }
    }
    return common;
  }
}
````
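To see the concatenation strategy in action, here is a minimal sketch (not part of the package) that drives `GreedySplitter` with a stub base splitter; the chunk shape (`types`, `content`, `section`) follows the class above, and the size limits are arbitrary.

````js
// Illustrative only: a stub base splitter returning pre-split chunks.
const stubSplitter = {
  async splitText() {
    return [
      { types: ["heading"], content: "# Intro", section: { level: 1, path: ["Intro"] } },
      { types: ["text"], content: "Short paragraph.", section: { level: 1, path: ["Intro"] } },
      { types: ["heading"], content: "# Usage", section: { level: 1, path: ["Usage"] } },
    ];
  },
};

const greedy = new GreedySplitter(stubSplitter, 10, 500);
const chunks = await greedy.splitText("(input is delegated to the stub)");
// The first two chunks merge ("# Intro" alone is under the 10-char minimum);
// "# Usage" then opens a new H1 section, so it becomes its own chunk.
console.log(chunks.length); // 2
````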
````js
const fullTrim = (str) => {
  return str.replace(/^[\s\r\n\t]+|[\s\r\n\t]+$/g, "");
};
class CodeContentSplitter {
  constructor(options) {
    this.options = options;
  }
  async split(content) {
    const language = content.match(/^```(\w+)\n/)?.[1];
    const strippedContent = content.replace(/^```(\w*)\n/, "").replace(/```\s*$/, "");
    const lines = strippedContent.split("\n");
    const chunks = [];
    let currentChunkLines = [];
    for (const line of lines) {
      const singleLineSize = this.wrap(line, language).length;
      if (singleLineSize > this.options.chunkSize) {
        throw new MinimumChunkSizeError(singleLineSize, this.options.chunkSize);
      }
      currentChunkLines.push(line);
      const newChunkContent = this.wrap(currentChunkLines.join("\n"), language);
      const newChunkSize = newChunkContent.length;
      if (newChunkSize > this.options.chunkSize && currentChunkLines.length > 1) {
        const lastLine = currentChunkLines.pop();
        chunks.push(this.wrap(currentChunkLines.join("\n"), language));
        currentChunkLines = [lastLine];
      }
    }
    if (currentChunkLines.length > 0) {
      chunks.push(this.wrap(currentChunkLines.join("\n"), language));
    }
    return chunks;
  }
  wrap(content, language) {
    return `\`\`\`${language || ""}
${content.replace(/\n+$/, "")}
\`\`\``;
  }
}
````
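A quick sketch (illustrative, with a deliberately tiny `chunkSize`) of how `CodeContentSplitter` behaves: every emitted chunk is re-wrapped in the original language fence, and a single line longer than `chunkSize` raises `MinimumChunkSizeError`.

````js
const codeSplitter = new CodeContentSplitter({ chunkSize: 40 });
const chunks = await codeSplitter.split("```ts\nconst a = 1;\nconst b = 2;\nconst c = 3;\n```");
// Lines accumulate until re-wrapping them would exceed 40 chars, so this yields:
//   "```ts\nconst a = 1;\nconst b = 2;\n```"
//   "```ts\nconst c = 3;\n```"
````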
````js
class TableContentSplitter {
  constructor(options) {
    this.options = options;
  }
  /**
   * Splits table content into chunks while preserving table structure
   */
  async split(content) {
    const parsedTable = this.parseTable(content);
    if (!parsedTable) {
      return [content];
    }
    const { headers, rows } = parsedTable;
    const chunks = [];
    let currentRows = [];
    for (const row of rows) {
      const singleRowSize = this.wrap(row, headers).length;
      if (singleRowSize > this.options.chunkSize) {
        throw new MinimumChunkSizeError(singleRowSize, this.options.chunkSize);
      }
      const newChunkContent = this.wrap([...currentRows, row].join("\n"), headers);
      const newChunkSize = newChunkContent.length;
      if (newChunkSize > this.options.chunkSize && currentRows.length > 0) {
        chunks.push(this.wrap(currentRows.join("\n"), headers));
        currentRows = [row];
      } else {
        currentRows.push(row);
      }
    }
    if (currentRows.length > 0) {
      chunks.push(this.wrap(currentRows.join("\n"), headers));
    }
    return chunks;
  }
  wrap(content, headers) {
    const headerRow = `| ${headers.join(" | ")} |`;
    const separatorRow = `|${headers.map(() => "---").join("|")}|`;
    return [headerRow, separatorRow, content].join("\n");
  }
  parseTable(content) {
    const lines = content.trim().split("\n");
    if (lines.length < 3) return null;
    const headers = this.parseRow(lines[0]);
    if (!headers) return null;
    const separator = lines[1];
    if (!this.isValidSeparator(separator)) return null;
    const rows = lines.slice(2).filter((row) => row.trim() !== "");
    return { headers, separator, rows };
  }
  /**
   * Parses a table row into cells
   */
  parseRow(row) {
    if (!row.includes("|")) return null;
    return row.split("|").map((cell) => cell.trim()).filter((cell) => cell !== "");
  }
  /**
   * Validates the separator row of the table
   */
  isValidSeparator(separator) {
    return separator.includes("|") && /^\|?[\s-|]+\|?$/.test(separator);
  }
}
````
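Illustrative use of `TableContentSplitter` (small `chunkSize` chosen to force a split): the header and separator rows are re-emitted on every chunk, so each piece stays a valid Markdown table.

````js
const tableSplitter = new TableContentSplitter({ chunkSize: 70 });
const table = [
  "| Name | Description |",
  "|---|---|",
  "| alpha | First entry |",
  "| beta | Second entry |",
].join("\n");
const chunks = await tableSplitter.split(table);
// Two chunks, each beginning with "| Name | Description |" and "|---|---|",
// holding as many data rows as fit within 70 characters.
````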
````js
class TextContentSplitter {
  constructor(options) {
    this.options = options;
  }
  /**
   * Splits text content into chunks while trying to preserve semantic boundaries.
   * Prefers paragraph breaks, then line breaks, finally falling back to word boundaries.
   */
  async split(content) {
    const trimmedContent = fullTrim(content);
    if (trimmedContent.length <= this.options.chunkSize) {
      return [trimmedContent];
    }
    const words = trimmedContent.split(/\s+/);
    const longestWord = words.reduce(
      (max, word) => word.length > max.length ? word : max
    );
    if (longestWord.length > this.options.chunkSize) {
      throw new MinimumChunkSizeError(longestWord.length, this.options.chunkSize);
    }
    const paragraphChunks = this.splitByParagraphs(trimmedContent);
    if (this.areChunksValid(paragraphChunks)) {
      return paragraphChunks;
    }
    const lineChunks = this.splitByLines(trimmedContent);
    if (this.areChunksValid(lineChunks)) {
      return this.mergeChunks(lineChunks, "\n");
    }
    const wordChunks = await this.splitByWords(trimmedContent);
    return this.mergeChunks(wordChunks, " ");
  }
  /**
   * Checks if all chunks are within the maximum size limit
   */
  areChunksValid(chunks) {
    return chunks.every((chunk) => chunk.length <= this.options.chunkSize);
  }
  /**
   * Splits text into chunks by paragraph boundaries (double newlines)
   */
  splitByParagraphs(text) {
    const paragraphs = text.split(/\n\s*\n/).map((p) => fullTrim(p)).filter(Boolean);
    return paragraphs.filter((chunk) => chunk.length > 2);
  }
  /**
   * Splits text into chunks by line boundaries
   */
  splitByLines(text) {
    const lines = text.split(/\n/).map((line) => fullTrim(line)).filter(Boolean);
    return lines.filter((chunk) => chunk.length > 1);
  }
  /**
   * Uses LangChain's recursive splitter for word-based splitting as a last resort
   */
  async splitByWords(text) {
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: this.options.chunkSize,
      chunkOverlap: 0
    });
    const chunks = await splitter.splitText(text);
    return chunks;
  }
  /**
   * Attempts to merge small chunks with previous chunks to minimize fragmentation.
   * Only merges if combined size is within maxChunkSize.
   */
  mergeChunks(chunks, separator) {
    const mergedChunks = [];
    let currentChunk = null;
    for (const chunk of chunks) {
      if (currentChunk === null) {
        currentChunk = chunk;
        continue;
      }
      const currentChunkSize = this.getChunkSize(currentChunk);
      const nextChunkSize = this.getChunkSize(chunk);
      if (currentChunkSize + nextChunkSize + separator.length <= this.options.chunkSize) {
        currentChunk = `${currentChunk}${separator}${chunk}`;
      } else {
        mergedChunks.push(currentChunk);
        currentChunk = chunk;
      }
    }
    if (currentChunk) {
      mergedChunks.push(currentChunk);
    }
    return mergedChunks;
  }
  getChunkSize(chunk) {
    return chunk.length;
  }
  wrap(content) {
    return content;
  }
}
````
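The fallback order in `TextContentSplitter.split` is paragraphs, then lines, then LangChain's recursive character splitter; a minimal sketch:

````js
const textSplitter = new TextContentSplitter({ chunkSize: 40 });
// 45 chars total, so the whole text doesn't fit, but each paragraph does:
// the paragraph pass already yields valid chunks and no line or word
// splitting is attempted.
const chunks = await textSplitter.split("First paragraph here.\n\nSecond paragraph here.");
console.log(chunks); // ["First paragraph here.", "Second paragraph here."]
````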
````js
class SemanticMarkdownSplitter {
  constructor(preferredChunkSize, maxChunkSize) {
    this.preferredChunkSize = preferredChunkSize;
    this.maxChunkSize = maxChunkSize;
    this.turndownService = new TurndownService({
      headingStyle: "atx",
      hr: "---",
      bulletListMarker: "-",
      codeBlockStyle: "fenced",
      emDelimiter: "_",
      strongDelimiter: "**",
      linkStyle: "inlined"
    });
    this.turndownService.addRule("table", {
      filter: ["table"],
      replacement: (_content, node) => {
        const table = node;
        const headers = Array.from(table.querySelectorAll("th")).map(
          (th) => th.textContent?.trim() || ""
        );
        const rows = Array.from(table.querySelectorAll("tr")).filter(
          (tr) => !tr.querySelector("th")
        );
        if (headers.length === 0 && rows.length === 0) return "";
        let markdown = "\n";
        if (headers.length > 0) {
          markdown += `| ${headers.join(" | ")} |
`;
          markdown += `|${headers.map(() => "---").join("|")}|
`;
        }
        for (const row of rows) {
          const cells = Array.from(row.querySelectorAll("td")).map(
            (td) => td.textContent?.trim() || ""
          );
          markdown += `| ${cells.join(" | ")} |
`;
        }
        return markdown;
      }
    });
    this.textSplitter = new TextContentSplitter({
      chunkSize: this.preferredChunkSize
    });
    this.codeSplitter = new CodeContentSplitter({
      chunkSize: this.maxChunkSize
    });
    this.tableSplitter = new TableContentSplitter({
      chunkSize: this.maxChunkSize
    });
  }
  turndownService;
  textSplitter;
  codeSplitter;
  tableSplitter;
  /**
   * Main entry point for splitting markdown content
   */
  async splitText(markdown) {
    const html = await this.markdownToHtml(markdown);
    const dom = await this.parseHtml(html);
    const sections = await this.splitIntoSections(dom);
    return this.splitSectionContent(sections);
  }
  /**
   * Step 1: Split document into sections based on H1-H6 headings,
   * as well as code blocks and tables.
   */
  async splitIntoSections(dom) {
    const body = dom.querySelector("body");
    if (!body) {
      throw new Error("Invalid HTML structure: no body element found");
    }
    let currentSection = this.createRootSection();
    const sections = [];
    const stack = [currentSection];
    for (const element of Array.from(body.children)) {
      const headingMatch = element.tagName.match(/H([1-6])/);
      if (headingMatch) {
        const level = Number.parseInt(headingMatch[1], 10);
        const title = fullTrim(element.textContent || "");
        while (stack.length > 1 && stack[stack.length - 1].level >= level) {
          stack.pop();
        }
        currentSection = {
          level,
          path: [
            ...stack.slice(1).reduce((acc, s) => {
              const lastPath = s.path[s.path.length - 1];
              if (lastPath) acc.push(lastPath);
              return acc;
            }, []),
            title
          ],
          content: [
            {
              type: "heading",
              text: `${"#".repeat(level)} ${title}`
            }
          ]
        };
        sections.push(currentSection);
        stack.push(currentSection);
      } else if (element.tagName === "PRE") {
        const code = element.querySelector("code");
        const language = code?.className.replace("language-", "") || "";
        const content = code?.textContent || element.textContent || "";
        const markdown = `${"```"}${language}
${content}
${"```"}`;
        currentSection = {
          level: currentSection.level,
          path: currentSection.path,
          content: [
            {
              type: "code",
              text: markdown
            }
          ]
        };
        sections.push(currentSection);
      } else if (element.tagName === "TABLE") {
        const markdown = fullTrim(this.turndownService.turndown(element.outerHTML));
        currentSection = {
          level: currentSection.level,
          path: currentSection.path,
          content: [
            {
              type: "table",
              text: markdown
            }
          ]
        };
        sections.push(currentSection);
      } else {
        const markdown = fullTrim(this.turndownService.turndown(element.innerHTML));
        if (markdown) {
          currentSection = {
            level: currentSection.level,
            path: currentSection.path,
            content: [
              {
                type: "text",
                text: markdown
              }
            ]
          };
          sections.push(currentSection);
        }
      }
    }
    return sections;
  }
  /**
   * Step 2: Split section content into smaller chunks
   */
  async splitSectionContent(sections) {
    const chunks = [];
    for (const section of sections) {
      for (const content of section.content) {
        let splitContent = [];
        try {
          switch (content.type) {
            case "heading":
            case "text": {
              splitContent = await this.textSplitter.split(content.text);
              break;
            }
            case "code": {
              splitContent = await this.codeSplitter.split(content.text);
              break;
            }
            case "table": {
              splitContent = await this.tableSplitter.split(content.text);
              break;
            }
          }
        } catch (err) {
          if (err instanceof MinimumChunkSizeError) {
            logger.warn(
              `⚠️ Cannot split ${content.type} chunk normally, using RecursiveCharacterTextSplitter: ${err.message}`
            );
            const splitter = new RecursiveCharacterTextSplitter({
              chunkSize: this.maxChunkSize,
              chunkOverlap: Math.min(20, Math.floor(this.maxChunkSize * 0.1)),
              // Use more aggressive separators including empty string as last resort
              separators: [
                "\n\n",
                "\n",
                " ",
                " ",
                ".",
                ",",
                ";",
                ":",
                "-",
                "(",
                ")",
                "[",
                "]",
                "{",
                "}",
                ""
              ]
            });
            const chunks2 = await splitter.splitText(content.text);
            if (chunks2.length === 0) {
              splitContent = [content.text.substring(0, this.maxChunkSize)];
            } else {
              splitContent = chunks2;
            }
          } else {
            const errMessage = err instanceof Error ? err.message : String(err);
            throw new ContentSplitterError(
              `Failed to split ${content.type} content: ${errMessage}`
            );
          }
        }
        chunks.push(
          ...splitContent.map(
            (text) => ({
              types: [content.type],
              content: text,
              section: {
                level: section.level,
                path: section.path
              }
            })
          )
        );
      }
    }
    return chunks;
  }
  /**
   * Helper to create the root section
   */
  createRootSection() {
    return {
      level: 0,
      path: [],
      content: []
    };
  }
  /**
   * Convert markdown to HTML using remark
   */
  async markdownToHtml(markdown) {
    const html = await unified().use(remarkParse).use(remarkGfm).use(remarkHtml).process(markdown);
    return `<!DOCTYPE html>
<html>
<body>
${String(html)}
</body>
</html>`;
  }
  /**
   * Parse HTML
   */
  async parseHtml(html) {
    const { window } = createJSDOM(html);
    return window.document;
  }
}
````
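How these pieces compose (a sketch; the actual wiring and the values behind the `SPLITTER_*` constants imported at the top live elsewhere in the bundle): `SemanticMarkdownSplitter` produces semantically scoped chunks, and `GreedySplitter` concatenates them up to the preferred size.

````js
// Assumed wiring, mirroring the imported SPLITTER_* constants.
const semanticSplitter = new SemanticMarkdownSplitter(
  SPLITTER_PREFERRED_CHUNK_SIZE,
  SPLITTER_MAX_CHUNK_SIZE
);
const pipeline = new GreedySplitter(
  semanticSplitter,
  SPLITTER_MIN_CHUNK_SIZE,
  SPLITTER_PREFERRED_CHUNK_SIZE
);
const chunks = await pipeline.splitText("# Title\n\nSome documentation text.");
// Each chunk carries { types, content, section: { level, path } } metadata.
````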
````js
const CHILD_LIMIT = 5;
const SIBLING_LIMIT = 2;
class DocumentRetrieverService {
  documentStore;
  constructor(documentStore) {
    this.documentStore = documentStore;
  }
  /**
   * Collects all related chunk IDs for a given initial hit.
   * Returns an object with url, hitId, relatedIds (Set), and score.
   */
  async getRelatedChunkIds(library, version, doc, siblingLimit = SIBLING_LIMIT, childLimit = CHILD_LIMIT) {
    const id = doc.id;
    const url = doc.metadata.url;
    const score = doc.metadata.score;
    const relatedIds = /* @__PURE__ */ new Set();
    relatedIds.add(id);
    const parent = await this.documentStore.findParentChunk(library, version, id);
    if (parent) {
      relatedIds.add(parent.id);
    }
    const precedingSiblings = await this.documentStore.findPrecedingSiblingChunks(
      library,
      version,
      id,
      siblingLimit
    );
    for (const sib of precedingSiblings) {
      relatedIds.add(sib.id);
    }
    const childChunks = await this.documentStore.findChildChunks(
      library,
      version,
      id,
      childLimit
    );
    for (const child of childChunks) {
      relatedIds.add(child.id);
    }
    const subsequentSiblings = await this.documentStore.findSubsequentSiblingChunks(
      library,
      version,
      id,
      siblingLimit
    );
    for (const sib of subsequentSiblings) {
      relatedIds.add(sib.id);
    }
    return { url, hitId: id, relatedIds, score };
  }
  /**
   * Groups related chunk info by URL, deduplicates IDs, and finds max score per URL.
   */
  groupAndPrepareFetch(relatedInfos) {
    const urlMap = /* @__PURE__ */ new Map();
    for (const info of relatedInfos) {
      let entry = urlMap.get(info.url);
      if (!entry) {
        entry = { uniqueChunkIds: /* @__PURE__ */ new Set(), maxScore: info.score };
        urlMap.set(info.url, entry);
      }
      for (const id of info.relatedIds) {
        entry.uniqueChunkIds.add(id);
      }
      if (info.score > entry.maxScore) {
        entry.maxScore = info.score;
      }
    }
    return urlMap;
  }
  /**
   * Finalizes the merged result for a URL group by fetching, sorting, and joining content.
   */
  async finalizeResult(library, version, url, uniqueChunkIds, maxScore) {
    const ids = Array.from(uniqueChunkIds);
    const docs = await this.documentStore.findChunksByIds(library, version, ids);
    const content = docs.map((d) => d.pageContent).join("\n\n");
    return {
      url,
      content,
      score: maxScore
    };
  }
  /**
   * Searches for documents and expands the context around the matches.
   * @param library The library name.
   * @param version The library version (optional; defaults to searching documents without a version).
   * @param query The search query.
   * @param limit The optional limit for the initial search results.
   * @returns An array of merged results, each aggregating the content of the retrieved chunks for one URL.
   */
  async search(library, version, query, limit) {
    const normalizedVersion = (version ?? "").toLowerCase();
    const initialResults = await this.documentStore.findByContent(
      library,
      normalizedVersion,
      query,
      limit ?? 10
    );
    const relatedInfos = await Promise.all(
      initialResults.map(
        (doc) => this.getRelatedChunkIds(library, normalizedVersion, doc)
      )
    );
    const urlMap = this.groupAndPrepareFetch(relatedInfos);
    const results = [];
    for (const [url, { uniqueChunkIds, maxScore }] of urlMap.entries()) {
      const result = await this.finalizeResult(
        library,
        normalizedVersion,
        url,
        uniqueChunkIds,
        maxScore
      );
      results.push(result);
    }
    return results;
  }
}
````
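The shape of a `DocumentRetrieverService.search` result follows from `finalizeResult` above; a sketch (here `store` stands for an initialized `DocumentStore`):

````js
const retriever = new DocumentRetrieverService(store);
const results = await retriever.search("react", "18.2.0", "useEffect cleanup", 5);
// => [{ url, content, score }, ...]: one entry per source URL, where content
// joins each hit with its parent, up to SIBLING_LIMIT (2) siblings per side,
// and up to CHILD_LIMIT (5) child chunks.
````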
````js
class DocumentStore {
  db;
  embeddings;
  dbDimension = VECTOR_DIMENSION;
  modelDimension;
  statements;
  /**
   * Calculates Reciprocal Rank Fusion score for a result
   */
  calculateRRF(vecRank, ftsRank, k = 60) {
    let rrf = 0;
    if (vecRank !== void 0) {
      rrf += 1 / (k + vecRank);
    }
    if (ftsRank !== void 0) {
      rrf += 1 / (k + ftsRank);
    }
    return rrf;
  }
  /**
   * Assigns ranks to search results based on their scores
   */
  assignRanks(results) {
    const vecRanks = /* @__PURE__ */ new Map();
    const ftsRanks = /* @__PURE__ */ new Map();
    results.filter((r) => r.vec_score !== void 0).sort((a, b) => (b.vec_score ?? 0) - (a.vec_score ?? 0)).forEach((result, index) => {
      vecRanks.set(Number(result.id), index + 1);
    });
    results.filter((r) => r.fts_score !== void 0).sort((a, b) => (b.fts_score ?? 0) - (a.fts_score ?? 0)).forEach((result, index) => {
      ftsRanks.set(Number(result.id), index + 1);
    });
    return results.map((result) => ({
      ...result,
      vec_rank: vecRanks.get(Number(result.id)),
      fts_rank: ftsRanks.get(Number(result.id)),
      rrf_score: this.calculateRRF(
        vecRanks.get(Number(result.id)),
        ftsRanks.get(Number(result.id))
      )
    }));
  }
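  // Worked example (illustrative): with the default k = 60, a document ranked
  // 1st by vector search and 3rd by full-text search scores
  //   1 / (60 + 1) + 1 / (60 + 3) ≈ 0.0164 + 0.0159 = 0.0323,
  // while a document found only by FTS at rank 1 scores 1 / 61 ≈ 0.0164.
  // Appearing in both result lists therefore beats a single top rank.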
  constructor(dbPath) {
    if (!dbPath) {
      throw new StoreError("Missing required database path");
    }
    this.db = new Database(dbPath);
  }
  /**
   * Sets up prepared statements for database queries
   */
  prepareStatements() {
    const statements = {
      getById: this.db.prepare("SELECT * FROM documents WHERE id = ?"),
      insertDocument: this.db.prepare(
        "INSERT INTO documents (library_id, version_id, url, content, metadata, sort_order, indexed_at) VALUES (?, ?, ?, ?, ?, ?, ?)"
      ),
      insertEmbedding: this.db.prepare(
        "INSERT INTO documents_vec (rowid, library_id, version_id, embedding) VALUES (?, ?, ?, ?)"
      ),
      insertLibrary: this.db.prepare(
        "INSERT INTO libraries (name) VALUES (?) ON CONFLICT(name) DO NOTHING"
      ),
      getLibraryIdByName: this.db.prepare(
        "SELECT id FROM libraries WHERE name = ?"
      ),
      // New version-related statements
      insertVersion: this.db.prepare(
        "INSERT INTO versions (library_id, name, status) VALUES (?, ?, 'not_indexed') ON CONFLICT(library_id, name) DO NOTHING"
      ),
      resolveVersionId: this.db.prepare(
        "SELECT id FROM versions WHERE library_id = ? AND name IS ?"
      ),
      getVersionById: this.db.prepare("SELECT * FROM versions WHERE id = ?"),
      queryVersionsByLibraryId: this.db.prepare(
        "SELECT * FROM versions WHERE library_id = ? ORDER BY name"
      ),
      deleteLibraryDocuments: this.db.prepare(
        `DELETE FROM documents
         WHERE library_id = (SELECT id FROM libraries WHERE name = ?)
           AND version_id = (
             SELECT v.id FROM versions v
             WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
               AND COALESCE(v.name, '') = COALESCE(?, '')
           )`
      ),
      deleteDocuments: this.db.prepare(
        `DELETE FROM documents
         WHERE library_id = (SELECT id FROM libraries WHERE name = ?)
           AND version_id = (
             SELECT v.id FROM versions v
             WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
               AND COALESCE(v.name, '') = COALESCE(?, '')
           )`
      ),
      deleteDocumentsByUrl: this.db.prepare(
        `DELETE FROM documents
         WHERE url = ?
           AND library_id = (SELECT id FROM libraries WHERE name = ?)
           AND version_id = (
             SELECT v.id FROM versions v
             WHERE v.library_id = (SELECT id FROM libraries WHERE name = ?)
               AND COALESCE(v.name, '') = COALESCE(?, '')
           )`
      ),
      getDocumentBySort: this.db.prepare(
        `SELECT d.id
         FROM documents d
         JOIN versions v ON d.version_id = v.id
         JOIN libraries l ON v.library_id = l.id
         WHERE l.name = ?
           AND COALESCE(v.name, '') = COALESCE(?, '')
         LIMIT 1`
      ),
      queryVersions: this.db.prepare(
        `SELECT DISTINCT v.name
         FROM versions v
         JOIN libraries l ON v.library_id = l.id
         WHERE l.name = ?
         ORDER BY v.name`
      ),
      checkExists: this.db.prepare(
        `SELECT d.id FROM documents d
         JOIN versions v ON d.version_id = v.id
         JOIN libraries l ON v.library_id = l.id
         WHERE l.name = ?
           AND COALESCE(v.name, '') = COALESCE(?, '')
         LIMIT 1`
      ),
      // Library/version aggregation including versions without documents and status/progress fields
      queryLibraryVersions: this.db.prepare(
        `SELECT
           l.name as library,
           COALESCE(v.name, '') as version,
           v.id as versionId,
           v.status as status,
           v.progress_pages as progressPages,
           v.progress_max_pages as progressMaxPages,
           v.source_url as sourceUrl,
           MIN(d.indexed_at) as indexedAt,
           COUNT(d.id) as documentCount,
           COUNT(DISTINCT d.url) as uniqueUrlCount
         FROM versions v
         JOIN libraries l ON v.library_id = l.id
         LEFT JOIN documents d ON d.version_id = v.id
         GROUP BY v.id
         ORDER BY l.name, version`
      ),
      getChildChunks: this.db.prepare(`
        SELECT d.* FROM documents d
        JOIN versions v ON d.version_id = v.id
        JOIN libraries l ON v.library_id = l.id
        WHERE l.name = ?
          AND COALESCE(v.name, '') = COALESCE(?, '')
          AND d.url = ?
          AND json_array_length(json_extract(d.metadata, '$.path')) = ?
          AND json_extract(d.metadata, '$.path') LIKE ? || '%'
          AND d.sort_order > (SELECT sort_order FROM documents WHERE id = ?)
        ORDER BY d.sort_order
        LIMIT ?
      `),
      getPrecedingSiblings: this.db.prepare(`
        SELECT d.* FROM documents d
        JOIN versions v ON d.version_id = v.id
        JOIN libraries l ON v.library_id = l.id
        WHERE l.name = ?
          AND COALESCE(v.name, '') = COALESCE(?, '')
          AND d.url = ?
          AND d.sort_order < (SELECT sort_order FROM documents WHERE id = ?)
          AND json_extract(d.metadata, '$.path') = ?
        ORDER BY d.sort_order DESC
        LIMIT ?
      `),
      getSubsequentSiblings: this.db.prepare(`
        SELECT d.* FROM documents d
        JOIN versions v ON d.version_id = v.id
        JOIN libraries l ON v.library_id = l.id
        WHERE l.name = ?
          AND COALESCE(v.name, '') = COALESCE(?, '')
          AND d.url = ?
          AND d.sort_order > (SELECT sort_order FROM documents WHERE id = ?)
          AND json_extract(d.metadata, '$.path') = ?
        ORDER BY d.sort_order
        LIMIT ?
      `),
      getParentChunk: this.db.prepare(`
        SELECT d.* FROM documents d
        JOIN versions v ON d.version_id = v.id
        JOIN libraries l ON v.library_id = l.id
        WHERE l.name = ?
          AND COALESCE(v.name, '') = COALESCE(?, '')
          AND d.url = ?
          AND json_extract(d.metadata, '$.path') = ?
          AND d.sort_order < (SELECT sort_order FROM documents WHERE id = ?)
        ORDER BY d.sort_order DESC
        LIMIT 1
      `),
      // Status tracking statements
      updateVersionStatus: this.db.prepare(
        "UPDATE versions SET status = ?, error_message = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
      ),
      updateVersionProgress: this.db.prepare(
        "UPDATE versions SET progress_pages = ?, progress_max_pages = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
      ),
      getVersionsByStatus: this.db.prepare(
        "SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.status IN (SELECT value FROM json_each(?))"
      ),
      // Scraper options statements
      updateVersionScraperOptions: this.db.prepare(
        "UPDATE versions SET source_url = ?, scraper_options = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?"
      ),
      getVersionWithOptions: this.db.prepare(
        "SELECT * FROM versions WHERE id = ?"
      ),
      getVersionsBySourceUrl: this.db.prepare(
        "SELECT v.*, l.name as library_name FROM versions v JOIN libraries l ON v.library_id = l.id WHERE v.source_url = ? ORDER BY v.created_at DESC"
      )
    };
    this.statements = statements;
  }
  /**
   * Pads a vector to the fixed database dimension by appending zeros.
   * Throws an error if the input vector is longer than the database dimension.
   */
  padVector(vector) {
    if (vector.length > this.dbDimension) {
      throw new Error(
        `Vector dimension ${vector.length} exceeds database dimension ${this.dbDimension}`
      );
    }
    if (vector.length === this.dbDimension) {
      return vector;
    }
    return [...vector, ...new Array(this.dbDimension - vector.length).fill(0)];
  }
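  // Example (illustrative; assumes VECTOR_DIMENSION is 1536): a 768-dimensional
  // model vector gains 768 trailing zeros so every embedding matches the fixed
  // width of the documents_vec table, while a 2048-dimensional vector throws.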
  /**
   * Initializes embeddings client using environment variables for configuration.
   *
   * The embedding model is configured via the DOCS_MCP_EMBEDDING_MODEL environment variable.
   * Format: "provider:model_name" (e.g., "google:text-embedding-004") or just "model_name"
   * for OpenAI (default).
   *
   * Supported providers and their required environment variables:
   * - openai: OPENAI_API_KEY (and optionally OPENAI_API_BASE, OPENAI_ORG_ID)
   * - google: GOOGLE_APPLICATION_CREDENTIALS (path to service account JSON)
   * - aws: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION (or BEDROCK_AWS_REGION)
   * - microsoft: Azure OpenAI credentials (AZURE_OPENAI_API_*)
   */
  async initializeEmbeddings() {
    const modelSpec = process.env.DOCS_MCP_EMBEDDING_MODEL || "text-embedding-3-small";
    const { createEmbeddingModel } = await import("./EmbeddingFactory-CElwVk3X.js");
    this.embeddings = createEmbeddingModel(modelSpec);
    const testVector = await this.embeddings.embedQuery("test");
    this.modelDimension = testVector.length;
    if (this.modelDimension > this.dbDimension) {
      throw new DimensionError(modelSpec, this.modelDimension, this.dbDimension);
    }
  }
  /**
   * Escapes a query string for use with the SQLite FTS5 MATCH operator.
   * Wraps the query in double quotes and escapes internal double quotes.
   */
  escapeFtsQuery(query) {
    const escapedQuotes = query.replace(/"/g, '""');
    return `"${escapedQuotes}"`;
  }
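  // Example: escapeFtsQuery('say "hello"') returns '"say ""hello"""', so the
  // query is matched as a single phrase and embedded quotes cannot break out
  // of the FTS5 MATCH string.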
  /**
   * Initializes database connection and ensures readiness
   */
  async initialize() {
    try {
      sqliteVec.load(this.db);
      applyMigrations(this.db);
      this.prepareStatements();
      await this.initializeEmbeddings();
    } catch (error) {
      if (error instanceof StoreError) {
        throw error;
      }
      throw new ConnectionError("Failed to initialize database connection", error);
    }
  }
  /**
   * Gracefully closes database connections
   */
  async shutdown() {
    this.db.close();
  }
  /**
   * Resolves a library name and version string to library_id and version_id.
   * Creates library and version records if they don't exist.
   */
  async resolveLibraryAndVersionIds(library, version) {
    const normalizedLibrary = library.toLowerCase();
    const normalizedVersion = denormalizeVersionName(version.toLowerCase());
    this.statements.insertLibrary.run(normalizedLibrary);
    const libraryIdRow = this.statements.getLibraryIdByName.get(normalizedLibrary);
    if (!libraryIdRow || typeof libraryIdRow.id !== "number") {
      throw new StoreError(`Failed to resolve library_id for library: ${library}`);
    }
    const libraryId = libraryIdRow.id;
    this.statements.insertVersion.run(libraryId, normalizedVersion);
    const versionIdRow = this.statements.resolveVersionId.get(
      libraryId,
      normalizedVersion === null ? "" : normalizedVersion
    );
    if (!versionIdRow || typeof versionIdRow.id !== "number") {
      throw new StoreError(
        `Failed to resolve version_id for library: ${library}, version: ${version}`
      );
    }
    return { libraryId, versionId: versionIdRow.id };
  }
  /**
   * Retrieves all unique versions for a specific library
   */
  async queryUniqueVersions(library) {
    try {
      const rows = this.statements.queryVersions.all(library.toLowerCase());
      return rows.map((row) => normalizeVersionName(row.name));
    } catch (error) {
      throw new ConnectionError("Failed to query versions", error);
    }
  }
  /**
   * Updates the status of a version record in the database.
   * @param versionId The version ID to update
   * @param status The new status to set
   * @param errorMessage Optional error message for failed statuses
   */
  async updateVersionStatus(versionId, status, errorMessage) {
    try {
      this.statements.updateVersionStatus.run(status, errorMessage ?? null, versionId);
    } catch (error) {
      throw new StoreError(`Failed to update version status: ${error}`);
    }
  }
  /**
   * Updates the progress counters for a version being indexed.
   * @param versionId The version ID to update
   * @param pages Current number of pages processed
   * @param maxPages Total number of pages to process
   */
  async updateVersionProgress(versionId, pages, maxPages) {
    try {
      this.statements.updateVersionProgress.run(pages, maxPages, versionId);
    } catch (error) {
      throw new StoreError(`Failed to update version progress: ${error}`);
    }
  }
  /**
   * Retrieves versions by their status.
   * @param statuses Array of statuses to filter by
   * @returns Array of version records matching the statuses
   */
  async getVersionsByStatus(statuses) {
    try {
      const statusJson = JSON.stringify(statuses);
      const rows = this.statements.getVersionsByStatus.all(
        statusJson
      );
      return rows;
    } catch (error) {
      throw new StoreError(`Failed to get versions by status: ${error}`);
    }
  }
  /**
   * Stores scraper options for a version to enable reproducible indexing.
   * @param versionId The version ID to update
   * @param options Complete scraper options used for indexing
   */
  async storeScraperOptions(versionId, options) {
    try {
      const { url: source_url, library, version, signal, ...scraper_options } = options;
      const optionsJson = JSON.stringify(scraper_options);
      this.statements.updateVersionScraperOptions.run(source_url, optionsJson, versionId);
    } catch (error) {
      throw new StoreError(`Failed to store scraper options: ${error}`);
    }
  }
  /**
   * Retrieves stored scraping configuration (source URL and options) for a version.
   * Returns null when no source URL is recorded (not re-indexable).
   */
  async getScraperOptions(versionId) {
    try {
      const row = this.statements.getVersionWithOptions.get(versionId);
      if (!row?.source_url) {
        return null;
      }
      let parsed = {};
      if (row.scraper_options) {
        try {
          parsed = JSON.parse(row.scraper_options);
        } catch (e) {
          logger.warn(`⚠️ Invalid scraper_options JSON for version ${versionId}: ${e}`);
          parsed = {};
        }
      }
      return { sourceUrl: row.source_url, options: parsed };
    } catch (error) {
      throw new StoreError(`Failed to get scraper options: ${error}`);
    }
  }
  /**
   * Finds versions that were indexed from the same source URL.
   * Useful for finding similar configurations or detecting duplicates.
   * @param url Source URL to search for
   * @returns Array of versions with the same source URL
   */
  async findVersionsBySourceUrl(url) {
    try {
      const rows = this.statements.getVersionsBySourceUrl.all(
        url
      );
      return rows;
    } catch (error) {
      throw new StoreError(`Failed to find versions by source URL: ${error}`);
    }
  }
  /**
   * Verifies existence of documents for a specific library version
   */
  async checkDocumentExists(library, version) {
    try {
      const normalizedVersion = version.toLowerCase();
      const result = this.statements.checkExists.get(
        library.toLowerCase(),
        normalizedVersion
      );
      return result !== void 0;
    } catch (error) {
      throw new ConnectionError("Failed to check document existence", error);
    }
  }
  /**
   * Retrieves a mapping of all libraries to their available versions with details.
   */
  async queryLibraryVersions() {
    try {
      const rows = this.statements.queryLibraryVersions.all();
      const libraryMap = /* @__PURE__ */ new Map();
      for (const row of rows) {
        const library = row.library;
        if (!libraryMap.has(library)) {
          libraryMap.set(library, []);
        }
        const indexedAtISO = row.indexedAt ? new Date(row.indexedAt).toISOString() : null;
        libraryMap.get(library)?.push({
          version: row.version,
          versionId: row.versionId,
          // Preserve raw string status here; DocumentManagementService will cast to VersionStatus
          status: row.status,
          progressPages: row.progressPages,
          progressMaxPages: row.progressMaxPages,
          sourceUrl: row.sourceUrl,
          documentCount: row.documentCount,
          uniqueUrlCount: row.uniqueUrlCount,
          indexedAt: indexedAtISO
        });
      }
      for (const versions of libraryMap.values()) {
        versions.sort((a, b) => {
          if (a.version === "" && b.version !== "") {
            return -1;
          }
          if (a.version !== "" && b.version === "") {
            return 1;
          }
          if (a.version === "" && b.version === "") {
            return 0;
          }
          try {
            return semver__default.compare(a.version, b.version);
          } catch (_error) {
            return a.version.localeCompare(b.version);
          }
        });
      }
      return libraryMap;
    } catch (error) {
      throw new ConnectionError("Failed to query library versions", error);
    }
  }
  /**
   * Stores documents with library and version metadata, generating embeddings
   * for vector similarity search. Automatically removes any existing documents
   * for the same URLs before adding new ones to prevent UNIQUE constraint violations.
   */
  async addDocuments(library, version, documents) {
    try {
      if (documents.length === 0) {
        return;
      }
      const urls = /* @__PURE__ */ new Set();
      for (const doc of documents) {
        const url = doc.metadata.url;
        if (!url || typeof url !== "string" || !url.trim()) {
          throw new StoreError("Document metadata must include a valid URL");
        }
        urls.add(url);
      }
      const texts = documents.map((doc) => {
        const header = `<title>${doc.metadata.title}</title>
<url>${doc.metadata.url}</url>
<path>${doc.metadata.path.join(" / ")}</path>
`;
        return `${header}${doc.pageContent}`;
      });
      const maxBatchChars = Number(process.env.DOCS_MCP_EMBEDDING_BATCH_CHARS) || EMBEDDING_BATCH_CHARS;
      const rawEmbeddings = [];
      let currentBatch = [];
      let currentBatchSize = 0;
      let batchCount = 0;
      for (const text of texts) {
        const textSize = text.length;
        if (currentBatchSize + textSize > maxBatchChars && currentBatch.length > 0) {
          batchCount++;
          logger.debug(
            `Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
          );
          const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
          rawEmbeddings.push(...batchEmbeddings);
          currentBatch = [];
          currentBatchSize = 0;
        }
        currentBatch.push(text);
        currentBatchSize += textSize;
        if (currentBatch.length >= EMBEDDING_BATCH_SIZE) {
          batchCount++;
          logger.debug(
            `Processing embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
          );
          const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
          rawEmbeddings.push(...batchEmbeddings);
          currentBatch = [];
          currentBatchSize = 0;
        }
      }
      if (currentBatch.length > 0) {
        batchCount++;
        logger.debug(
          `Processing final embedding batch ${batchCount}: ${currentBatch.length} texts, ${currentBatchSize} chars`
        );
        const batchEmbeddings = await this.embeddings.embedDocuments(currentBatch);
        rawEmbeddings.push(...batchEmbeddings);
      }
      const paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
      const { libraryId, versionId } = await this.resolveLibraryAndVersionIds(
        library,
        version
      );
      for (const url of urls) {
        const deletedCount = await this.deleteDocumentsByUrl(library, version, url);
        if (deletedCount > 0) {
          logger.debug(`🗑️ Deleted ${deletedCount} existing documents for URL: ${url}`);
        }
      }
      const transaction = this.db.transaction((docs) => {
        for (let i = 0; i < docs.length; i++) {
          const doc = docs[i];
          const url = doc.metadata.url;
          const result = this.statements.insertDocument.run(
            BigInt(libraryId),
            BigInt(versionId),
            url,
            doc.pageContent,
            JSON.stringify(doc.metadata),
            i,
            (/* @__PURE__ */ new Date()).toISOString()
            // Pass current timestamp for indexed_at
          );
          const rowId = result.lastInsertRowid;
          this.statements.insertEmbedding.run(
            BigInt(rowId),
            BigInt(libraryId),
            BigInt(versionId),
            JSON.stringify(paddedEmbeddings[i])
          );
        }
      });
      transaction(documents);
    } catch (error) {
      throw new ConnectionError("Failed to add documents to store", error);
    }
  }
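  // Note on the batching above: a batch is flushed either when adding the next
  // text would exceed DOCS_MCP_EMBEDDING_BATCH_CHARS (default
  // EMBEDDING_BATCH_CHARS) or when it reaches EMBEDDING_BATCH_SIZE texts, so
  // both character volume and request size stay bounded per embedDocuments call.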
1333
|
+
/**
|
|
1334
|
+
* Removes documents matching specified library and version
|
|
1335
|
+
* @returns Number of documents deleted
|
|
1336
|
+
*/
|
|
1337
|
+
async deleteDocuments(library, version) {
|
|
1338
|
+
try {
|
|
1339
|
+
const normalizedVersion = version.toLowerCase();
|
|
1340
|
+
const result = this.statements.deleteDocuments.run(
|
|
1341
|
+
library.toLowerCase(),
|
|
1342
|
+
library.toLowerCase(),
|
|
1343
|
+
// library name appears twice in the query
|
|
1344
|
+
normalizedVersion
|
|
1345
|
+
);
|
|
1346
|
+
return result.changes;
|
|
1347
|
+
} catch (error) {
|
|
1348
|
+
throw new ConnectionError("Failed to delete documents", error);
|
|
1349
|
+
}
|
|
1350
|
+
}
|
|
1351
|
+
/**
|
|
1352
|
+
* Removes documents for a specific URL within a library and version
|
|
1353
|
+
* @returns Number of documents deleted
|
|
1354
|
+
*/
|
|
1355
|
+
async deleteDocumentsByUrl(library, version, url) {
|
|
1356
|
+
try {
|
|
1357
|
+
const normalizedVersion = version.toLowerCase();
|
|
1358
|
+
const result = this.statements.deleteDocumentsByUrl.run(
|
|
1359
|
+
url,
|
|
1360
|
+
library.toLowerCase(),
|
|
1361
|
+
library.toLowerCase(),
|
|
1362
|
+
// library name appears twice in the query
|
|
1363
|
+
normalizedVersion
|
|
1364
|
+
);
|
|
1365
|
+
return result.changes;
|
|
1366
|
+
} catch (error) {
|
|
1367
|
+
throw new ConnectionError("Failed to delete documents by URL", error);
|
|
1368
|
+
}
|
|
1369
|
+
}
|
|
1370
|
+
/**
|
|
1371
|
+
* Retrieves a document by its ID.
|
|
1372
|
+
* @param id The ID of the document.
|
|
1373
|
+
* @returns The document, or null if not found.
|
|
1374
|
+
*/
|
|
1375
|
+
async getById(id) {
|
|
1376
|
+
try {
|
|
1377
|
+
const row = this.statements.getById.get(BigInt(id));
|
|
1378
|
+
if (!row) {
|
|
1379
|
+
return null;
|
|
1380
|
+
}
|
|
1381
|
+
return mapDbDocumentToDocument(row);
|
|
1382
|
+
} catch (error) {
|
|
1383
|
+
throw new ConnectionError(`Failed to get document by ID ${id}`, error);
|
|
1384
|
+
}
|
|
1385
|
+
}
/**
 * Finds documents matching a text query using hybrid search.
 * Combines vector similarity search with full-text search using Reciprocal Rank Fusion.
 */
async findByContent(library, version, query, limit) {
  try {
    const rawEmbedding = await this.embeddings.embedQuery(query);
    const embedding = this.padVector(rawEmbedding);
    const ftsQuery = this.escapeFtsQuery(query);
    const normalizedVersion = version.toLowerCase();
    const stmt = this.db.prepare(`
      WITH vec_distances AS (
        SELECT
          dv.rowid as id,
          dv.distance as vec_distance
        FROM documents_vec dv
        JOIN versions v ON dv.version_id = v.id
        JOIN libraries l ON v.library_id = l.id
        WHERE l.name = ?
        AND COALESCE(v.name, '') = COALESCE(?, '')
        AND dv.embedding MATCH ?
        AND dv.k = ?
        ORDER BY dv.distance
      ),
      fts_scores AS (
        SELECT
          f.rowid as id,
          bm25(documents_fts, 10.0, 1.0, 5.0, 1.0) as fts_score
        FROM documents_fts f
        JOIN documents d ON f.rowid = d.id
        JOIN versions v ON d.version_id = v.id
        JOIN libraries l ON v.library_id = l.id
        WHERE l.name = ?
        AND COALESCE(v.name, '') = COALESCE(?, '')
        AND documents_fts MATCH ?
        ORDER BY fts_score
        LIMIT ?
      )
      SELECT
        d.id,
        d.content,
        d.metadata,
        COALESCE(1 / (1 + v.vec_distance), 0) as vec_score,
        COALESCE(-MIN(f.fts_score, 0), 0) as fts_score
      FROM documents d
      LEFT JOIN vec_distances v ON d.id = v.id
      LEFT JOIN fts_scores f ON d.id = f.id
      WHERE v.id IS NOT NULL OR f.id IS NOT NULL
    `);
    const rawResults = stmt.all(
      library.toLowerCase(),
      normalizedVersion,
      JSON.stringify(embedding),
      limit,
      library.toLowerCase(),
      normalizedVersion,
      ftsQuery,
      // Use the escaped query
      limit
    );
    const rankedResults = this.assignRanks(rawResults);
    const topResults = rankedResults.sort((a, b) => b.rrf_score - a.rrf_score).slice(0, limit);
    return topResults.map((row) => ({
      ...mapDbDocumentToDocument(row),
      metadata: {
        ...JSON.parse(row.metadata),
        id: row.id,
        score: row.rrf_score,
        vec_rank: row.vec_rank,
        fts_rank: row.fts_rank
      }
    }));
  } catch (error) {
    throw new ConnectionError(
      `Failed to find documents by content with query "${query}"`,
      error
    );
  }
}
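// The fused ranking above is Reciprocal Rank Fusion: assignRanks() (defined
// earlier in this bundle, not shown here) assigns each document its rank in
// the vector list and the BM25 list, and rrf_score combines them. A minimal
// sketch of the idea, assuming the conventional RRF constant k = 60:
//
//   const rrf = (vecRank, ftsRank, k = 60) =>
//     (vecRank ? 1 / (k + vecRank) : 0) + (ftsRank ? 1 / (k + ftsRank) : 0);
//
// A document ranked #1 by vector search and #3 by FTS scores
// 1/61 + 1/63 ≈ 0.032, ahead of documents found by only one of the two.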
/**
 * Finds child chunks of a given document based on path hierarchy.
 */
async findChildChunks(library, version, id, limit) {
  try {
    const parent = await this.getById(id);
    if (!parent) {
      return [];
    }
    const parentPath = parent.metadata.path ?? [];
    const parentUrl = parent.metadata.url;
    const normalizedVersion = version.toLowerCase();
    const result = this.statements.getChildChunks.all(
      library.toLowerCase(),
      normalizedVersion,
      parentUrl,
      parentPath.length + 1,
      JSON.stringify(parentPath),
      BigInt(id),
      limit
    );
    return result.map((row) => mapDbDocumentToDocument(row));
  } catch (error) {
    throw new ConnectionError(`Failed to find child chunks for ID ${id}`, error);
  }
}
/**
 * Finds preceding sibling chunks of a given document.
 */
async findPrecedingSiblingChunks(library, version, id, limit) {
  try {
    const reference = await this.getById(id);
    if (!reference) {
      return [];
    }
    const refMetadata = reference.metadata;
    const normalizedVersion = version.toLowerCase();
    const result = this.statements.getPrecedingSiblings.all(
      library.toLowerCase(),
      normalizedVersion,
      refMetadata.url,
      BigInt(id),
      JSON.stringify(refMetadata.path),
      limit
    );
    return result.reverse().map((row) => mapDbDocumentToDocument(row));
  } catch (error) {
    throw new ConnectionError(
      `Failed to find preceding sibling chunks for ID ${id}`,
      error
    );
  }
}
/**
 * Finds subsequent sibling chunks of a given document.
 */
async findSubsequentSiblingChunks(library, version, id, limit) {
  try {
    const reference = await this.getById(id);
    if (!reference) {
      return [];
    }
    const refMetadata = reference.metadata;
    const normalizedVersion = version.toLowerCase();
    const result = this.statements.getSubsequentSiblings.all(
      library.toLowerCase(),
      normalizedVersion,
      refMetadata.url,
      BigInt(id),
      JSON.stringify(refMetadata.path),
      limit
    );
    return result.map((row) => mapDbDocumentToDocument(row));
  } catch (error) {
    throw new ConnectionError(
      `Failed to find subsequent sibling chunks for ID ${id}`,
      error
    );
  }
}
/**
 * Finds the parent chunk of a given document.
 */
async findParentChunk(library, version, id) {
  try {
    const child = await this.getById(id);
    if (!child) {
      return null;
    }
    const childMetadata = child.metadata;
    const path2 = childMetadata.path ?? [];
    const parentPath = path2.slice(0, -1);
    if (parentPath.length === 0) {
      return null;
    }
    const normalizedVersion = version.toLowerCase();
    const result = this.statements.getParentChunk.get(
      library.toLowerCase(),
      normalizedVersion,
      childMetadata.url,
      JSON.stringify(parentPath),
      BigInt(id)
    );
    if (!result) {
      return null;
    }
    return mapDbDocumentToDocument(result);
  } catch (error) {
    throw new ConnectionError(`Failed to find parent chunk for ID ${id}`, error);
  }
}
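// The four traversal methods above all navigate the metadata.path hierarchy
// within a single URL. Illustrative example for a hypothetical chunk:
//
//   path: ["Guide", "Install", "Linux"]
//     parent   -> the chunk whose path is ["Guide", "Install"]
//     siblings -> chunks with the identical path, ordered before/after by id
//     children -> chunks whose path is one element longer and starts with
//                 ["Guide", "Install", "Linux"]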
/**
 * Fetches multiple documents by their IDs in a single call.
 * Returns an array of Document objects, sorted by their sort_order.
 */
async findChunksByIds(library, version, ids) {
  if (!ids.length) return [];
  try {
    const normalizedVersion = version.toLowerCase();
    const placeholders = ids.map(() => "?").join(",");
    const stmt = this.db.prepare(
      `SELECT d.* FROM documents d
       JOIN libraries l ON d.library_id = l.id
       JOIN versions v ON d.version_id = v.id
       WHERE l.name = ?
       AND COALESCE(v.name, '') = COALESCE(?, '')
       AND d.id IN (${placeholders})
       ORDER BY d.sort_order`
    );
    const rows = stmt.all(
      library.toLowerCase(),
      normalizedVersion,
      ...ids
    );
    return rows.map((row) => mapDbDocumentToDocument(row));
  } catch (error) {
    throw new ConnectionError("Failed to fetch documents by IDs", error);
  }
}
}
class DocumentManagementService {
  store;
  documentRetriever;
  splitter;
  /**
   * Normalizes a version string, converting null or undefined to an empty string
   * and converting to lowercase.
   */
  normalizeVersion(version) {
    return (version ?? "").toLowerCase();
  }
  constructor() {
    let dbPath;
    let dbDir;
    const envStorePath = process.env.DOCS_MCP_STORE_PATH;
    if (envStorePath) {
      dbDir = envStorePath;
      dbPath = path.join(dbDir, "documents.db");
      logger.debug(`💾 Using database directory from DOCS_MCP_STORE_PATH: ${dbDir}`);
    } else {
      const projectRoot = getProjectRoot();
      const oldDbDir = path.join(projectRoot, ".store");
      const oldDbPath = path.join(oldDbDir, "documents.db");
      const oldDbExists = fs.existsSync(oldDbPath);
      if (oldDbExists) {
        dbPath = oldDbPath;
        dbDir = oldDbDir;
        logger.debug(`💾 Using legacy database path: ${dbPath}`);
      } else {
        const standardPaths = envPaths("docs-mcp-server", { suffix: "" });
        dbDir = standardPaths.data;
        dbPath = path.join(dbDir, "documents.db");
        logger.debug(`💾 Using standard database directory: ${dbDir}`);
      }
    }
    try {
      fs.mkdirSync(dbDir, { recursive: true });
    } catch (error) {
      logger.error(`⚠️ Failed to create database directory ${dbDir}: ${error}`);
    }
    this.store = new DocumentStore(dbPath);
    this.documentRetriever = new DocumentRetrieverService(this.store);
    const semanticSplitter = new SemanticMarkdownSplitter(
      SPLITTER_PREFERRED_CHUNK_SIZE,
      SPLITTER_MAX_CHUNK_SIZE
    );
    const greedySplitter = new GreedySplitter(
      semanticSplitter,
      SPLITTER_MIN_CHUNK_SIZE,
      SPLITTER_PREFERRED_CHUNK_SIZE
    );
    this.splitter = greedySplitter;
  }
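  // The splitter wiring above composes two stages: the output of
  // SemanticMarkdownSplitter (bounded by the preferred/max sizes passed in)
  // is post-processed by GreedySplitter using the min/preferred sizes, i.e.:
  //
  //   markdown -> SemanticMarkdownSplitter -> GreedySplitter -> chunks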
  /**
   * Initializes the underlying document store.
   */
  async initialize() {
    await this.store.initialize();
  }
  /**
   * Shuts down the underlying document store.
   */
  async shutdown() {
    logger.debug("Shutting down store manager");
    await this.store.shutdown();
  }
  // Status tracking methods for pipeline integration
  /**
   * Gets versions by their current status.
   */
  async getVersionsByStatus(statuses) {
    return this.store.getVersionsByStatus(statuses);
  }
  /**
   * Updates the status of a version.
   */
  async updateVersionStatus(versionId, status, errorMessage) {
    return this.store.updateVersionStatus(versionId, status, errorMessage);
  }
  /**
   * Updates the progress of a version being indexed.
   */
  async updateVersionProgress(versionId, pages, maxPages) {
    return this.store.updateVersionProgress(versionId, pages, maxPages);
  }
  /**
   * Stores scraper options for a version to enable reproducible indexing.
   */
  async storeScraperOptions(versionId, options) {
    return this.store.storeScraperOptions(versionId, options);
  }
  /**
   * Retrieves the stored scraper options (scraping configuration) for a version.
   */
  async getScraperOptions(versionId) {
    return this.store.getScraperOptions(versionId);
  }
  /**
   * Ensures a library/version exists using a VersionRef and returns version ID.
   * Delegates to existing ensureLibraryAndVersion for storage.
   */
  async ensureVersion(ref) {
    const normalized = {
      library: ref.library.trim().toLowerCase(),
      version: (ref.version ?? "").trim().toLowerCase()
    };
    return this.ensureLibraryAndVersion(normalized.library, normalized.version);
  }
  /**
   * Returns enriched library summaries including version status/progress and counts.
   * Uses existing store APIs; keeps DB details encapsulated.
   */
  async listLibraries() {
    const libMap = await this.store.queryLibraryVersions();
    const summaries = [];
    for (const [library, versions] of libMap) {
      const vs = versions.map(
        (v) => ({
          id: v.versionId,
          ref: { library, version: v.version },
          status: v.status,
          // Include progress only while indexing is active; set undefined for COMPLETED
          progress: v.status === "completed" ? void 0 : { pages: v.progressPages, maxPages: v.progressMaxPages },
          counts: { documents: v.documentCount, uniqueUrls: v.uniqueUrlCount },
          indexedAt: v.indexedAt,
          sourceUrl: v.sourceUrl ?? void 0
        })
      );
      summaries.push({ library, versions: vs });
    }
    return summaries;
  }
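  // Shape of the value resolved by listLibraries(), with hypothetical data:
  //
  //   [{
  //     library: "react",
  //     versions: [{
  //       id: 42,
  //       ref: { library: "react", version: "18.2.0" },
  //       status: "completed",
  //       progress: undefined, // populated only while indexing is active
  //       counts: { documents: 1234, uniqueUrls: 87 },
  //       indexedAt: "2024-01-01T00:00:00.000Z",
  //       sourceUrl: "https://react.dev/"
  //     }]
  //   }]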
  /**
   * Finds versions that were indexed from the same source URL.
   */
  async findVersionsBySourceUrl(url) {
    return this.store.findVersionsBySourceUrl(url);
  }
  /**
   * Validates if a library exists in the store (either versioned or unversioned).
   * Throws LibraryNotFoundError with suggestions if the library is not found.
   * @param library The name of the library to validate.
   * @throws {LibraryNotFoundError} If the library does not exist.
   */
  async validateLibraryExists(library) {
    logger.info(`🔍 Validating existence of library: ${library}`);
    const normalizedLibrary = library.toLowerCase();
    const versions = await this.listVersions(normalizedLibrary);
    const hasUnversioned = await this.exists(normalizedLibrary, "");
    if (versions.length === 0 && !hasUnversioned) {
      logger.warn(`⚠️ Library '${library}' not found.`);
      const allLibraries = await this.listLibraries();
      const libraryNames = allLibraries.map((lib) => lib.library);
      let suggestions = [];
      if (libraryNames.length > 0) {
        const fuse = new Fuse(libraryNames, {
          // Configure fuse.js options if needed (e.g., threshold)
          // isCaseSensitive: false, // Handled by normalizing library names
          // includeScore: true,
          threshold: 0.4
          // Adjust threshold for desired fuzziness (0=exact, 1=match anything)
        });
        const results = fuse.search(normalizedLibrary);
        suggestions = results.slice(0, 3).map((result) => result.item);
        logger.info(`💡 Found suggestions: ${suggestions.join(", ")}`);
      }
      throw new LibraryNotFoundError(library, suggestions);
    }
    logger.info(`✅ Library '${library}' confirmed to exist.`);
  }
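  // The suggestions above come from Fuse.js fuzzy matching over the known
  // library names. A standalone sketch of the same call pattern (names are
  // hypothetical):
  //
  //   const fuse = new Fuse(["react", "redux", "remix"], { threshold: 0.4 });
  //   fuse.search("raect").slice(0, 3).map((r) => r.item); // e.g. ["react"]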
  /**
   * Returns a list of all available semantic versions for a library.
   */
  async listVersions(library) {
    const versions = await this.store.queryUniqueVersions(library);
    return versions.filter((v) => semver__default.valid(v));
  }
  /**
   * Checks if documents exist for a given library and optional version.
   * If version is omitted, checks for documents without a specific version.
   */
  async exists(library, version) {
    const normalizedVersion = this.normalizeVersion(version);
    return this.store.checkDocumentExists(library, normalizedVersion);
  }
  /**
   * Finds the most appropriate version of documentation based on the requested version.
   * When no target version is specified, returns the latest version.
   *
   * Version matching behavior:
   * - Exact versions (e.g., "18.0.0"): Matches that version or any earlier version
   * - X-Range patterns (e.g., "5.x", "5.2.x"): Matches within the specified range
   * - "latest" or no version: Returns the latest available version
   *
   * For documentation, we prefer matching older versions over no match at all,
   * since older docs are often still relevant and useful.
   * Also checks if unversioned documents exist for the library.
   */
  async findBestVersion(library, targetVersion) {
    const libraryAndVersion = `${library}${targetVersion ? `@${targetVersion}` : ""}`;
    logger.info(`🔍 Finding best version for ${libraryAndVersion}`);
    const hasUnversioned = await this.store.checkDocumentExists(library, "");
    const versionStrings = await this.listVersions(library);
    if (versionStrings.length === 0) {
      if (hasUnversioned) {
        logger.info(`ℹ️ Unversioned documents exist for ${library}`);
        return { bestMatch: null, hasUnversioned: true };
      }
      logger.warn(`⚠️ No valid versions found for ${library}`);
      const allLibraryDetails = await this.store.queryLibraryVersions();
      const libraryDetails = allLibraryDetails.get(library) ?? [];
      throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
    }
    let bestMatch = null;
    if (!targetVersion || targetVersion === "latest") {
      bestMatch = semver__default.maxSatisfying(versionStrings, "*");
    } else {
      const versionRegex = /^(\d+)(?:\.(?:x(?:\.x)?|\d+(?:\.(?:x|\d+))?))?$|^$/;
      if (!versionRegex.test(targetVersion)) {
        logger.warn(`⚠️ Invalid target version format: ${targetVersion}`);
      } else {
        let range = targetVersion;
        if (!semver__default.validRange(targetVersion)) {
          range = `~${targetVersion}`;
        } else if (semver__default.valid(targetVersion)) {
          range = `${range} || <=${targetVersion}`;
        }
        bestMatch = semver__default.maxSatisfying(versionStrings, range);
      }
    }
    if (bestMatch) {
      logger.info(`✅ Found best match version ${bestMatch} for ${libraryAndVersion}`);
    } else {
      logger.warn(`⚠️ No matching semver version found for ${libraryAndVersion}`);
    }
    if (!bestMatch && !hasUnversioned) {
      const allLibraryDetails = await this.store.queryLibraryVersions();
      const libraryDetails = allLibraryDetails.get(library) ?? [];
      throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
    }
    return { bestMatch, hasUnversioned };
  }
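  // Worked example of the matching rules above, assuming stored versions
  // ["17.0.2", "18.2.0", "18.3.1"]:
  //
  //   findBestVersion(lib)           -> bestMatch "18.3.1" (latest)
  //   findBestVersion(lib, "18.x")   -> bestMatch "18.3.1" (x-range)
  //   findBestVersion(lib, "18.2.0") -> bestMatch "18.2.0" (exact)
  //   findBestVersion(lib, "19.0.0") -> bestMatch "18.3.1" (newest <= target,
  //                                     via the `|| <=` fallback range)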
  /**
   * Removes all documents for a specific library and optional version.
   * If version is omitted, removes documents without a specific version.
   */
  async removeAllDocuments(library, version) {
    const normalizedVersion = this.normalizeVersion(version);
    logger.info(
      `🗑️ Removing all documents from ${library}@${normalizedVersion || "[no version]"} store`
    );
    const count = await this.store.deleteDocuments(library, normalizedVersion);
    logger.info(`🗑️ Deleted ${count} documents`);
  }
  /**
   * Adds a document to the store, splitting it into smaller chunks for better search results.
   * Uses SemanticMarkdownSplitter to maintain markdown structure and content types during splitting.
   * Preserves hierarchical structure of documents and distinguishes between text and code segments.
   * If version is omitted, the document is added without a specific version.
   */
  async addDocument(library, version, document) {
    const normalizedVersion = this.normalizeVersion(version);
    const url = document.metadata.url;
    if (!url || typeof url !== "string" || !url.trim()) {
      throw new StoreError("Document metadata must include a valid URL");
    }
    logger.info(`📄 Adding document: ${document.metadata.title}`);
    if (!document.pageContent.trim()) {
      throw new Error("Document content cannot be empty");
    }
    const chunks = await this.splitter.splitText(document.pageContent);
    const splitDocs = chunks.map((chunk) => ({
      pageContent: chunk.content,
      metadata: {
        ...document.metadata,
        level: chunk.section.level,
        path: chunk.section.path
      }
    }));
    logger.info(`✂️ Split document into ${splitDocs.length} chunks`);
    await this.store.addDocuments(library, normalizedVersion, splitDocs);
  }
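  // Shape of one persisted chunk produced by the mapping above (values are
  // hypothetical):
  //
  //   {
  //     pageContent: "## Install\n\nRun `npm install` ...",
  //     metadata: {
  //       url: "https://example.com/docs/install",
  //       title: "Install",
  //       level: 2,
  //       path: ["Guide", "Install"]
  //     }
  //   }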
  /**
   * Searches for documentation content across versions.
   * Uses hybrid search (vector + FTS).
   * If version is omitted, searches documents without a specific version.
   */
  async searchStore(library, version, query, limit = 5) {
    const normalizedVersion = this.normalizeVersion(version);
    return this.documentRetriever.search(library, normalizedVersion, query, limit);
  }
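  // Typical end-to-end usage of this service (illustrative; the version would
  // normally be resolved via findBestVersion() first):
  //
  //   const service = new DocumentManagementService();
  //   await service.initialize();
  //   const results = await service.searchStore("react", "18.2.0", "useEffect cleanup");
  //   await service.shutdown();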
  // Deprecated simple listing removed: enriched listLibraries() is canonical
  /**
   * Ensures a library and version exist in the database and returns the version ID.
   * Creates the library and version records if they don't exist.
   */
  async ensureLibraryAndVersion(library, version) {
    const normalizedLibrary = library.toLowerCase();
    const normalizedVersion = this.normalizeVersion(version);
    const { versionId } = await this.store.resolveLibraryAndVersionIds(
      normalizedLibrary,
      normalizedVersion
    );
    return versionId;
  }
}
export {
  DocumentManagementService
};
//# sourceMappingURL=DocumentManagementService-BH02TJEe.js.map