@arabold/docs-mcp-server 1.13.0 → 1.15.0
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- package/README.md +84 -28
- package/dist/{EmbeddingFactory-0Z5e_g1J.js → EmbeddingFactory-C6_OpOiy.js} +2 -2
- package/dist/{EmbeddingFactory-0Z5e_g1J.js.map → EmbeddingFactory-C6_OpOiy.js.map} +1 -1
- package/dist/assets/main.css +1 -1
- package/dist/index.js +6047 -0
- package/dist/index.js.map +1 -0
- package/package.json +45 -45
- package/public/assets/main.css +1 -1
- package/dist/DocumentManagementService-BGW9iWNn.js +0 -3481
- package/dist/DocumentManagementService-BGW9iWNn.js.map +0 -1
- package/dist/FindVersionTool-DhhmoGU7.js +0 -140
- package/dist/FindVersionTool-DhhmoGU7.js.map +0 -1
- package/dist/RemoveTool-BZPTXvhj.js +0 -65
- package/dist/RemoveTool-BZPTXvhj.js.map +0 -1
- package/dist/cli.js +0 -236
- package/dist/cli.js.map +0 -1
- package/dist/server.js +0 -769
- package/dist/server.js.map +0 -1
- package/dist/web.js +0 -938
- package/dist/web.js.map +0 -1
|
@@ -1,3481 +0,0 @@
|
|
|
1
|
-
import { v4 } from "uuid";
|
|
2
|
-
import psl from "psl";
|
|
3
|
-
import axios from "axios";
|
|
4
|
-
import { HeaderGenerator } from "header-generator";
|
|
5
|
-
import fs from "node:fs/promises";
|
|
6
|
-
import path from "node:path";
|
|
7
|
-
import * as cheerio from "cheerio";
|
|
8
|
-
import "node:vm";
|
|
9
|
-
import { VirtualConsole, JSDOM } from "jsdom";
|
|
10
|
-
import { chromium } from "playwright";
|
|
11
|
-
import { gfm } from "@joplin/turndown-plugin-gfm";
|
|
12
|
-
import TurndownService from "turndown";
|
|
13
|
-
import { TextDecoder } from "node:util";
|
|
14
|
-
import { URL as URL$1, fileURLToPath } from "node:url";
|
|
15
|
-
import * as semver from "semver";
|
|
16
|
-
import semver__default from "semver";
|
|
17
|
-
import fs$1 from "node:fs";
|
|
18
|
-
import envPaths from "env-paths";
|
|
19
|
-
import Fuse from "fuse.js";
|
|
20
|
-
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
|
|
21
|
-
import remarkGfm from "remark-gfm";
|
|
22
|
-
import remarkHtml from "remark-html";
|
|
23
|
-
import remarkParse from "remark-parse";
|
|
24
|
-
import { unified } from "unified";
|
|
25
|
-
import Database from "better-sqlite3";
|
|
26
|
-
import * as sqliteVec from "sqlite-vec";
|
|
27
|
-
var LogLevel = /* @__PURE__ */ ((LogLevel2) => {
|
|
28
|
-
LogLevel2[LogLevel2["ERROR"] = 0] = "ERROR";
|
|
29
|
-
LogLevel2[LogLevel2["WARN"] = 1] = "WARN";
|
|
30
|
-
LogLevel2[LogLevel2["INFO"] = 2] = "INFO";
|
|
31
|
-
LogLevel2[LogLevel2["DEBUG"] = 3] = "DEBUG";
|
|
32
|
-
return LogLevel2;
|
|
33
|
-
})(LogLevel || {});
|
|
34
|
-
let currentLogLevel = 2;
|
|
35
|
-
function setLogLevel(level) {
|
|
36
|
-
currentLogLevel = level;
|
|
37
|
-
}
|
|
38
|
-
const logger = {
|
|
39
|
-
/**
|
|
40
|
-
* Logs a debug message if the current log level is DEBUG or higher.
|
|
41
|
-
* @param message - The message to log.
|
|
42
|
-
*/
|
|
43
|
-
debug: (message) => {
|
|
44
|
-
if (currentLogLevel >= 3) {
|
|
45
|
-
console.debug(message);
|
|
46
|
-
}
|
|
47
|
-
},
|
|
48
|
-
/**
|
|
49
|
-
* Logs an info message if the current log level is INFO or higher.
|
|
50
|
-
* @param message - The message to log.
|
|
51
|
-
*/
|
|
52
|
-
info: (message) => {
|
|
53
|
-
if (currentLogLevel >= 2) {
|
|
54
|
-
console.log(message);
|
|
55
|
-
}
|
|
56
|
-
},
|
|
57
|
-
/**
|
|
58
|
-
* Logs a warning message if the current log level is WARN or higher.
|
|
59
|
-
* @param message - The message to log.
|
|
60
|
-
*/
|
|
61
|
-
warn: (message) => {
|
|
62
|
-
if (currentLogLevel >= 1) {
|
|
63
|
-
console.warn(message);
|
|
64
|
-
}
|
|
65
|
-
},
|
|
66
|
-
/**
|
|
67
|
-
* Logs an error message if the current log level is ERROR or higher (always logs).
|
|
68
|
-
* @param message - The message to log.
|
|
69
|
-
*/
|
|
70
|
-
error: (message) => {
|
|
71
|
-
if (currentLogLevel >= 0) {
|
|
72
|
-
console.error(message);
|
|
73
|
-
}
|
|
74
|
-
}
|
|
75
|
-
};
|
|
76
|
-
class ScraperError extends Error {
|
|
77
|
-
constructor(message, isRetryable = false, cause) {
|
|
78
|
-
super(message);
|
|
79
|
-
this.isRetryable = isRetryable;
|
|
80
|
-
this.cause = cause;
|
|
81
|
-
this.name = this.constructor.name;
|
|
82
|
-
if (cause?.stack) {
|
|
83
|
-
this.stack = `${this.stack}
|
|
84
|
-
Caused by: ${cause.stack}`;
|
|
85
|
-
}
|
|
86
|
-
}
|
|
87
|
-
}
|
|
88
|
-
class InvalidUrlError extends ScraperError {
|
|
89
|
-
constructor(url, cause) {
|
|
90
|
-
super(`Invalid URL: ${url}`, false, cause);
|
|
91
|
-
}
|
|
92
|
-
}
|
|
93
|
-
class RedirectError extends ScraperError {
|
|
94
|
-
constructor(originalUrl, redirectUrl, statusCode) {
|
|
95
|
-
super(
|
|
96
|
-
`Redirect detected from ${originalUrl} to ${redirectUrl} (status: ${statusCode})`,
|
|
97
|
-
false
|
|
98
|
-
);
|
|
99
|
-
this.originalUrl = originalUrl;
|
|
100
|
-
this.redirectUrl = redirectUrl;
|
|
101
|
-
this.statusCode = statusCode;
|
|
102
|
-
}
|
|
103
|
-
}
|
|
104
|
-
const defaultNormalizerOptions = {
|
|
105
|
-
ignoreCase: true,
|
|
106
|
-
removeHash: true,
|
|
107
|
-
removeTrailingSlash: true,
|
|
108
|
-
removeQuery: false,
|
|
109
|
-
removeIndex: true
|
|
110
|
-
};
|
|
111
|
-
function normalizeUrl(url, options = defaultNormalizerOptions) {
|
|
112
|
-
try {
|
|
113
|
-
const parsedUrl = new URL(url);
|
|
114
|
-
const finalOptions = { ...defaultNormalizerOptions, ...options };
|
|
115
|
-
const normalized = new URL(parsedUrl.origin + parsedUrl.pathname);
|
|
116
|
-
if (finalOptions.removeIndex) {
|
|
117
|
-
normalized.pathname = normalized.pathname.replace(
|
|
118
|
-
/\/index\.(html|htm|asp|php|jsp)$/i,
|
|
119
|
-
"/"
|
|
120
|
-
);
|
|
121
|
-
}
|
|
122
|
-
if (finalOptions.removeTrailingSlash && normalized.pathname.length > 1) {
|
|
123
|
-
normalized.pathname = normalized.pathname.replace(/\/+$/, "");
|
|
124
|
-
}
|
|
125
|
-
const preservedHash = !finalOptions.removeHash ? parsedUrl.hash : "";
|
|
126
|
-
const preservedSearch = !finalOptions.removeQuery ? parsedUrl.search : "";
|
|
127
|
-
let result = normalized.origin + normalized.pathname;
|
|
128
|
-
if (preservedSearch) {
|
|
129
|
-
result += preservedSearch;
|
|
130
|
-
}
|
|
131
|
-
if (preservedHash) {
|
|
132
|
-
result += preservedHash;
|
|
133
|
-
}
|
|
134
|
-
if (finalOptions.ignoreCase) {
|
|
135
|
-
result = result.toLowerCase();
|
|
136
|
-
}
|
|
137
|
-
return result;
|
|
138
|
-
} catch {
|
|
139
|
-
return url;
|
|
140
|
-
}
|
|
141
|
-
}
|
|
142
|
-
function validateUrl(url) {
|
|
143
|
-
try {
|
|
144
|
-
new URL(url);
|
|
145
|
-
} catch (error) {
|
|
146
|
-
throw new InvalidUrlError(url, error instanceof Error ? error : void 0);
|
|
147
|
-
}
|
|
148
|
-
}
|
|
149
|
-
function hasSameHostname(urlA, urlB) {
|
|
150
|
-
return urlA.hostname.toLowerCase() === urlB.hostname.toLowerCase();
|
|
151
|
-
}
|
|
152
|
-
function hasSameDomain(urlA, urlB) {
|
|
153
|
-
const domainA = psl.get(urlA.hostname.toLowerCase());
|
|
154
|
-
const domainB = psl.get(urlB.hostname.toLowerCase());
|
|
155
|
-
return domainA !== null && domainA === domainB;
|
|
156
|
-
}
|
|
157
|
-
function isSubpath(baseUrl, targetUrl) {
|
|
158
|
-
const basePath = baseUrl.pathname.endsWith("/") ? baseUrl.pathname : `${baseUrl.pathname}/`;
|
|
159
|
-
return targetUrl.pathname.startsWith(basePath);
|
|
160
|
-
}
|
|
161
|
-
const DEFAULT_MAX_PAGES$1 = 1e3;
|
|
162
|
-
const DEFAULT_MAX_DEPTH$1 = 3;
|
|
163
|
-
const DEFAULT_MAX_CONCURRENCY = 3;
|
|
164
|
-
const DEFAULT_PROTOCOL = "stdio";
|
|
165
|
-
const DEFAULT_HTTP_PORT = 6280;
|
|
166
|
-
const DEFAULT_WEB_PORT = 6281;
|
|
167
|
-
const FETCHER_MAX_RETRIES = 6;
|
|
168
|
-
const FETCHER_BASE_DELAY = 1e3;
|
|
169
|
-
const SPLITTER_MIN_CHUNK_SIZE = 500;
|
|
170
|
-
const SPLITTER_PREFERRED_CHUNK_SIZE = 1500;
|
|
171
|
-
const SPLITTER_MAX_CHUNK_SIZE = 5e3;
|
|
172
|
-
const EMBEDDING_BATCH_SIZE = 300;
|
|
173
|
-
class MimeTypeUtils {
|
|
174
|
-
/**
|
|
175
|
-
* Parses a Content-Type header string into its MIME type and charset.
|
|
176
|
-
* @param contentTypeHeader The Content-Type header string (e.g., "text/html; charset=utf-8").
|
|
177
|
-
* @returns A ParsedContentType object, or a default if parsing fails.
|
|
178
|
-
*/
|
|
179
|
-
static parseContentType(contentTypeHeader) {
|
|
180
|
-
if (!contentTypeHeader) {
|
|
181
|
-
return { mimeType: "application/octet-stream" };
|
|
182
|
-
}
|
|
183
|
-
const parts = contentTypeHeader.split(";").map((part) => part.trim());
|
|
184
|
-
const mimeType = parts[0].toLowerCase();
|
|
185
|
-
let charset;
|
|
186
|
-
for (let i = 1; i < parts.length; i++) {
|
|
187
|
-
const param = parts[i];
|
|
188
|
-
if (param.toLowerCase().startsWith("charset=")) {
|
|
189
|
-
charset = param.substring("charset=".length).toLowerCase();
|
|
190
|
-
break;
|
|
191
|
-
}
|
|
192
|
-
}
|
|
193
|
-
return { mimeType, charset };
|
|
194
|
-
}
|
|
195
|
-
/**
|
|
196
|
-
* Checks if a MIME type represents HTML content.
|
|
197
|
-
*/
|
|
198
|
-
static isHtml(mimeType) {
|
|
199
|
-
return mimeType === "text/html" || mimeType === "application/xhtml+xml";
|
|
200
|
-
}
|
|
201
|
-
/**
|
|
202
|
-
* Checks if a MIME type represents Markdown content.
|
|
203
|
-
*/
|
|
204
|
-
static isMarkdown(mimeType) {
|
|
205
|
-
return mimeType === "text/markdown" || mimeType === "text/x-markdown";
|
|
206
|
-
}
|
|
207
|
-
/**
|
|
208
|
-
* Checks if a MIME type represents plain text content.
|
|
209
|
-
*/
|
|
210
|
-
static isText(mimeType) {
|
|
211
|
-
return mimeType.startsWith("text/");
|
|
212
|
-
}
|
|
213
|
-
// Extend with more helpers as needed (isJson, isXml, isPdf, etc.)
|
|
214
|
-
}
|
|
215
|
-
class FingerprintGenerator {
|
|
216
|
-
headerGenerator;
|
|
217
|
-
/**
|
|
218
|
-
* Creates an instance of FingerprintGenerator.
|
|
219
|
-
* @param options Optional configuration for the header generator.
|
|
220
|
-
*/
|
|
221
|
-
constructor(options) {
|
|
222
|
-
const defaultOptions = {
|
|
223
|
-
browsers: [{ name: "chrome", minVersion: 100 }, "firefox", "safari"],
|
|
224
|
-
devices: ["desktop", "mobile"],
|
|
225
|
-
operatingSystems: ["windows", "linux", "macos", "android", "ios"],
|
|
226
|
-
locales: ["en-US", "en"],
|
|
227
|
-
httpVersion: "2"
|
|
228
|
-
};
|
|
229
|
-
this.headerGenerator = new HeaderGenerator({
|
|
230
|
-
...defaultOptions,
|
|
231
|
-
...options
|
|
232
|
-
});
|
|
233
|
-
}
|
|
234
|
-
/**
|
|
235
|
-
* Generates a set of realistic HTTP headers.
|
|
236
|
-
* @returns A set of realistic HTTP headers.
|
|
237
|
-
*/
|
|
238
|
-
generateHeaders() {
|
|
239
|
-
return this.headerGenerator.getHeaders();
|
|
240
|
-
}
|
|
241
|
-
}
|
|
242
|
-
class HttpFetcher {
|
|
243
|
-
retryableStatusCodes = [
|
|
244
|
-
408,
|
|
245
|
-
// Request Timeout
|
|
246
|
-
429,
|
|
247
|
-
// Too Many Requests
|
|
248
|
-
500,
|
|
249
|
-
// Internal Server Error
|
|
250
|
-
502,
|
|
251
|
-
// Bad Gateway
|
|
252
|
-
503,
|
|
253
|
-
// Service Unavailable
|
|
254
|
-
504,
|
|
255
|
-
// Gateway Timeout
|
|
256
|
-
525
|
|
257
|
-
// SSL Handshake Failed (Cloudflare specific)
|
|
258
|
-
];
|
|
259
|
-
fingerprintGenerator;
|
|
260
|
-
constructor() {
|
|
261
|
-
this.fingerprintGenerator = new FingerprintGenerator();
|
|
262
|
-
}
|
|
263
|
-
canFetch(source) {
|
|
264
|
-
return source.startsWith("http://") || source.startsWith("https://");
|
|
265
|
-
}
|
|
266
|
-
async delay(ms) {
|
|
267
|
-
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
268
|
-
}
|
|
269
|
-
async fetch(source, options) {
|
|
270
|
-
const maxRetries = options?.maxRetries ?? FETCHER_MAX_RETRIES;
|
|
271
|
-
const baseDelay = options?.retryDelay ?? FETCHER_BASE_DELAY;
|
|
272
|
-
const followRedirects = options?.followRedirects ?? true;
|
|
273
|
-
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
|
274
|
-
try {
|
|
275
|
-
const fingerprint = this.fingerprintGenerator.generateHeaders();
|
|
276
|
-
const headers = {
|
|
277
|
-
...fingerprint,
|
|
278
|
-
...options?.headers
|
|
279
|
-
// User-provided headers override generated ones
|
|
280
|
-
};
|
|
281
|
-
const config = {
|
|
282
|
-
responseType: "arraybuffer",
|
|
283
|
-
// For handling both text and binary
|
|
284
|
-
headers,
|
|
285
|
-
timeout: options?.timeout,
|
|
286
|
-
signal: options?.signal,
|
|
287
|
-
// Pass signal to axios
|
|
288
|
-
// Axios follows redirects by default, we need to explicitly disable it if needed
|
|
289
|
-
maxRedirects: followRedirects ? 5 : 0
|
|
290
|
-
};
|
|
291
|
-
const response = await axios.get(source, config);
|
|
292
|
-
const contentTypeHeader = response.headers["content-type"];
|
|
293
|
-
const { mimeType, charset } = MimeTypeUtils.parseContentType(contentTypeHeader);
|
|
294
|
-
const contentEncoding = response.headers["content-encoding"];
|
|
295
|
-
return {
|
|
296
|
-
content: response.data,
|
|
297
|
-
mimeType,
|
|
298
|
-
charset,
|
|
299
|
-
encoding: contentEncoding,
|
|
300
|
-
source
|
|
301
|
-
};
|
|
302
|
-
} catch (error) {
|
|
303
|
-
const axiosError = error;
|
|
304
|
-
const status = axiosError.response?.status;
|
|
305
|
-
const code = axiosError.code;
|
|
306
|
-
if (!followRedirects && status && status >= 300 && status < 400) {
|
|
307
|
-
const location = axiosError.response?.headers?.location;
|
|
308
|
-
if (location) {
|
|
309
|
-
throw new RedirectError(source, location, status);
|
|
310
|
-
}
|
|
311
|
-
}
|
|
312
|
-
if (attempt < maxRetries && (status === void 0 || this.retryableStatusCodes.includes(status))) {
|
|
313
|
-
const delay = baseDelay * 2 ** attempt;
|
|
314
|
-
logger.warn(
|
|
315
|
-
`Attempt ${attempt + 1}/${maxRetries + 1} failed for ${source} (Status: ${status}, Code: ${code}). Retrying in ${delay}ms...`
|
|
316
|
-
);
|
|
317
|
-
await this.delay(delay);
|
|
318
|
-
continue;
|
|
319
|
-
}
|
|
320
|
-
throw new ScraperError(
|
|
321
|
-
`Failed to fetch ${source} after ${attempt + 1} attempts: ${axiosError.message ?? "Unknown error"}`,
|
|
322
|
-
true,
|
|
323
|
-
error instanceof Error ? error : void 0
|
|
324
|
-
);
|
|
325
|
-
}
|
|
326
|
-
}
|
|
327
|
-
throw new ScraperError(
|
|
328
|
-
`Failed to fetch ${source} after ${maxRetries + 1} attempts`,
|
|
329
|
-
true
|
|
330
|
-
);
|
|
331
|
-
}
|
|
332
|
-
}
|
|
333
|
-
class FileFetcher {
|
|
334
|
-
canFetch(source) {
|
|
335
|
-
return source.startsWith("file://");
|
|
336
|
-
}
|
|
337
|
-
async fetch(source, options) {
|
|
338
|
-
const filePath = source.replace(/^file:\/\//, "");
|
|
339
|
-
logger.info(`Fetching file: ${filePath}`);
|
|
340
|
-
try {
|
|
341
|
-
const content = await fs.readFile(filePath);
|
|
342
|
-
const ext = path.extname(filePath).toLowerCase();
|
|
343
|
-
const mimeType = this.getMimeType(ext);
|
|
344
|
-
return {
|
|
345
|
-
content,
|
|
346
|
-
mimeType,
|
|
347
|
-
source,
|
|
348
|
-
encoding: "utf-8"
|
|
349
|
-
// Assume UTF-8 for text files
|
|
350
|
-
};
|
|
351
|
-
} catch (error) {
|
|
352
|
-
throw new ScraperError(
|
|
353
|
-
`Failed to read file ${filePath}: ${error.message ?? "Unknown error"}`,
|
|
354
|
-
false,
|
|
355
|
-
error instanceof Error ? error : void 0
|
|
356
|
-
);
|
|
357
|
-
}
|
|
358
|
-
}
|
|
359
|
-
getMimeType(ext) {
|
|
360
|
-
switch (ext) {
|
|
361
|
-
case ".html":
|
|
362
|
-
case ".htm":
|
|
363
|
-
return "text/html";
|
|
364
|
-
case ".md":
|
|
365
|
-
return "text/markdown";
|
|
366
|
-
case ".txt":
|
|
367
|
-
return "text/plain";
|
|
368
|
-
default:
|
|
369
|
-
return "application/octet-stream";
|
|
370
|
-
}
|
|
371
|
-
}
|
|
372
|
-
}
|
|
373
|
-
class HtmlCheerioParserMiddleware {
|
|
374
|
-
async process(context, next) {
|
|
375
|
-
try {
|
|
376
|
-
logger.debug(`Parsing HTML content with Cheerio from ${context.source}`);
|
|
377
|
-
const $ = cheerio.load(context.content);
|
|
378
|
-
context.dom = $;
|
|
379
|
-
await next();
|
|
380
|
-
} catch (error) {
|
|
381
|
-
logger.error(`Failed to parse HTML with Cheerio for ${context.source}: ${error}`);
|
|
382
|
-
context.errors.push(
|
|
383
|
-
error instanceof Error ? error : new Error(`Cheerio HTML parsing failed: ${String(error)}`)
|
|
384
|
-
);
|
|
385
|
-
return;
|
|
386
|
-
}
|
|
387
|
-
}
|
|
388
|
-
}
|
|
389
|
-
function createJSDOM(html, options) {
|
|
390
|
-
const virtualConsole = new VirtualConsole();
|
|
391
|
-
virtualConsole.on("error", () => {
|
|
392
|
-
});
|
|
393
|
-
virtualConsole.on("warn", () => {
|
|
394
|
-
});
|
|
395
|
-
virtualConsole.on("info", () => {
|
|
396
|
-
});
|
|
397
|
-
virtualConsole.on("debug", () => {
|
|
398
|
-
});
|
|
399
|
-
virtualConsole.on("log", () => {
|
|
400
|
-
});
|
|
401
|
-
const defaultOptions = {
|
|
402
|
-
virtualConsole
|
|
403
|
-
};
|
|
404
|
-
const finalOptions = { ...defaultOptions, ...options };
|
|
405
|
-
return new JSDOM(html, finalOptions);
|
|
406
|
-
}
|
|
407
|
-
class HtmlLinkExtractorMiddleware {
|
|
408
|
-
/**
|
|
409
|
-
* Processes the context to extract links from the sanitized HTML body.
|
|
410
|
-
* @param context The current middleware context.
|
|
411
|
-
* @param next Function to call the next middleware.
|
|
412
|
-
*/
|
|
413
|
-
async process(context, next) {
|
|
414
|
-
const $ = context.dom;
|
|
415
|
-
if (!$) {
|
|
416
|
-
logger.warn(
|
|
417
|
-
`Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`
|
|
418
|
-
);
|
|
419
|
-
await next();
|
|
420
|
-
return;
|
|
421
|
-
}
|
|
422
|
-
try {
|
|
423
|
-
const linkElements = $("a[href]");
|
|
424
|
-
logger.debug(`Found ${linkElements.length} potential links in ${context.source}`);
|
|
425
|
-
const extractedLinks = [];
|
|
426
|
-
linkElements.each((index, element) => {
|
|
427
|
-
const href = $(element).attr("href");
|
|
428
|
-
if (href && href.trim() !== "") {
|
|
429
|
-
try {
|
|
430
|
-
const urlObj = new URL(href, context.source);
|
|
431
|
-
if (!["http:", "https:", "file:"].includes(urlObj.protocol)) {
|
|
432
|
-
logger.debug(`Ignoring link with invalid protocol: ${href}`);
|
|
433
|
-
return;
|
|
434
|
-
}
|
|
435
|
-
extractedLinks.push(urlObj.href);
|
|
436
|
-
} catch (e) {
|
|
437
|
-
logger.debug(`Ignoring invalid URL syntax: ${href}`);
|
|
438
|
-
}
|
|
439
|
-
}
|
|
440
|
-
});
|
|
441
|
-
context.links = [...new Set(extractedLinks)];
|
|
442
|
-
logger.debug(
|
|
443
|
-
`Extracted ${context.links.length} unique, valid links from ${context.source}`
|
|
444
|
-
);
|
|
445
|
-
} catch (error) {
|
|
446
|
-
logger.error(`Error extracting links from ${context.source}: ${error}`);
|
|
447
|
-
context.errors.push(
|
|
448
|
-
new Error(
|
|
449
|
-
`Failed to extract links from HTML: ${error instanceof Error ? error.message : String(error)}`
|
|
450
|
-
)
|
|
451
|
-
);
|
|
452
|
-
}
|
|
453
|
-
await next();
|
|
454
|
-
}
|
|
455
|
-
}
|
|
456
|
-
class HtmlMetadataExtractorMiddleware {
|
|
457
|
-
/**
|
|
458
|
-
* Processes the context to extract the HTML title.
|
|
459
|
-
* @param context The current processing context.
|
|
460
|
-
* @param next Function to call the next middleware.
|
|
461
|
-
*/
|
|
462
|
-
async process(context, next) {
|
|
463
|
-
const $ = context.dom;
|
|
464
|
-
if (!$) {
|
|
465
|
-
logger.warn(
|
|
466
|
-
`Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`
|
|
467
|
-
);
|
|
468
|
-
await next();
|
|
469
|
-
return;
|
|
470
|
-
}
|
|
471
|
-
try {
|
|
472
|
-
let title = $("title").first().text().trim();
|
|
473
|
-
if (!title) {
|
|
474
|
-
title = $("h1").first().text().trim();
|
|
475
|
-
}
|
|
476
|
-
title = title || "Untitled";
|
|
477
|
-
title = title.replace(/\s+/g, " ").trim();
|
|
478
|
-
context.metadata.title = title;
|
|
479
|
-
logger.debug(`Extracted title: "${title}" from ${context.source}`);
|
|
480
|
-
} catch (error) {
|
|
481
|
-
logger.error(`Error extracting metadata from ${context.source}: ${error}`);
|
|
482
|
-
context.errors.push(
|
|
483
|
-
new Error(
|
|
484
|
-
`Failed to extract metadata from HTML: ${error instanceof Error ? error.message : String(error)}`
|
|
485
|
-
)
|
|
486
|
-
);
|
|
487
|
-
}
|
|
488
|
-
await next();
|
|
489
|
-
}
|
|
490
|
-
}
|
|
491
|
-
var ScrapeMode = /* @__PURE__ */ ((ScrapeMode2) => {
|
|
492
|
-
ScrapeMode2["Fetch"] = "fetch";
|
|
493
|
-
ScrapeMode2["Playwright"] = "playwright";
|
|
494
|
-
ScrapeMode2["Auto"] = "auto";
|
|
495
|
-
return ScrapeMode2;
|
|
496
|
-
})(ScrapeMode || {});
|
|
497
|
-
class HtmlPlaywrightMiddleware {
|
|
498
|
-
browser = null;
|
|
499
|
-
/**
|
|
500
|
-
* Initializes the Playwright browser instance.
|
|
501
|
-
* Consider making this more robust (e.g., lazy initialization, singleton).
|
|
502
|
-
*/
|
|
503
|
-
async ensureBrowser() {
|
|
504
|
-
if (!this.browser || !this.browser.isConnected()) {
|
|
505
|
-
const launchArgs = process.env.PLAYWRIGHT_LAUNCH_ARGS?.split(" ") ?? [];
|
|
506
|
-
logger.debug(
|
|
507
|
-
`Launching new Playwright browser instance (Chromium) with args: ${launchArgs.join(" ") || "none"}...`
|
|
508
|
-
);
|
|
509
|
-
this.browser = await chromium.launch({ channel: "chromium", args: launchArgs });
|
|
510
|
-
this.browser.on("disconnected", () => {
|
|
511
|
-
logger.debug("Playwright browser instance disconnected.");
|
|
512
|
-
this.browser = null;
|
|
513
|
-
});
|
|
514
|
-
}
|
|
515
|
-
return this.browser;
|
|
516
|
-
}
|
|
517
|
-
/**
|
|
518
|
-
* Closes the Playwright browser instance if it exists.
|
|
519
|
-
* Should be called during application shutdown.
|
|
520
|
-
*/
|
|
521
|
-
async closeBrowser() {
|
|
522
|
-
if (this.browser?.isConnected()) {
|
|
523
|
-
logger.debug("Closing Playwright browser instance...");
|
|
524
|
-
await this.browser.close();
|
|
525
|
-
this.browser = null;
|
|
526
|
-
}
|
|
527
|
-
}
|
|
528
|
-
async process(context, next) {
|
|
529
|
-
const scrapeMode = context.options?.scrapeMode ?? ScrapeMode.Auto;
|
|
530
|
-
const shouldRunPlaywright = scrapeMode === ScrapeMode.Playwright || scrapeMode === ScrapeMode.Auto;
|
|
531
|
-
if (!shouldRunPlaywright) {
|
|
532
|
-
logger.debug(
|
|
533
|
-
`Skipping Playwright rendering for ${context.source} as scrapeMode is '${scrapeMode}'.`
|
|
534
|
-
);
|
|
535
|
-
await next();
|
|
536
|
-
return;
|
|
537
|
-
}
|
|
538
|
-
logger.debug(
|
|
539
|
-
`Running Playwright rendering for ${context.source} (scrapeMode: '${scrapeMode}')`
|
|
540
|
-
);
|
|
541
|
-
let page = null;
|
|
542
|
-
let renderedHtml = null;
|
|
543
|
-
try {
|
|
544
|
-
const browser = await this.ensureBrowser();
|
|
545
|
-
page = await browser.newPage();
|
|
546
|
-
logger.debug(`Playwright: Processing ${context.source}`);
|
|
547
|
-
await page.route("**/*", (route) => {
|
|
548
|
-
if (route.request().url() === context.source) {
|
|
549
|
-
return route.fulfill({
|
|
550
|
-
status: 200,
|
|
551
|
-
contentType: "text/html",
|
|
552
|
-
body: context.content
|
|
553
|
-
});
|
|
554
|
-
}
|
|
555
|
-
const resourceType = route.request().resourceType();
|
|
556
|
-
if (["image", "stylesheet", "font", "media"].includes(resourceType)) {
|
|
557
|
-
return route.abort();
|
|
558
|
-
}
|
|
559
|
-
return route.continue();
|
|
560
|
-
});
|
|
561
|
-
await page.goto(context.source, {
|
|
562
|
-
waitUntil: "load"
|
|
563
|
-
});
|
|
564
|
-
renderedHtml = await page.content();
|
|
565
|
-
logger.debug(`Playwright: Successfully rendered content for ${context.source}`);
|
|
566
|
-
} catch (error) {
|
|
567
|
-
logger.error(`Playwright failed to render ${context.source}: ${error}`);
|
|
568
|
-
context.errors.push(
|
|
569
|
-
error instanceof Error ? error : new Error(`Playwright rendering failed: ${String(error)}`)
|
|
570
|
-
);
|
|
571
|
-
} finally {
|
|
572
|
-
if (page) {
|
|
573
|
-
await page.unroute("**/*");
|
|
574
|
-
await page.close();
|
|
575
|
-
}
|
|
576
|
-
}
|
|
577
|
-
if (renderedHtml !== null) {
|
|
578
|
-
context.content = renderedHtml;
|
|
579
|
-
logger.debug(
|
|
580
|
-
`Playwright middleware updated content for ${context.source}. Proceeding.`
|
|
581
|
-
);
|
|
582
|
-
} else {
|
|
583
|
-
logger.warn(
|
|
584
|
-
`Playwright rendering resulted in null content for ${context.source}. Proceeding without content update.`
|
|
585
|
-
);
|
|
586
|
-
}
|
|
587
|
-
await next();
|
|
588
|
-
}
|
|
589
|
-
}
|
|
590
|
-
class HtmlSanitizerMiddleware {
|
|
591
|
-
// Default selectors to remove
|
|
592
|
-
defaultSelectorsToRemove = [
|
|
593
|
-
"nav",
|
|
594
|
-
"footer",
|
|
595
|
-
"script",
|
|
596
|
-
"style",
|
|
597
|
-
"noscript",
|
|
598
|
-
"svg",
|
|
599
|
-
"link",
|
|
600
|
-
"meta",
|
|
601
|
-
"iframe",
|
|
602
|
-
"header",
|
|
603
|
-
"button",
|
|
604
|
-
"input",
|
|
605
|
-
"textarea",
|
|
606
|
-
"select",
|
|
607
|
-
// "form", // Keep commented
|
|
608
|
-
".ads",
|
|
609
|
-
".advertisement",
|
|
610
|
-
".banner",
|
|
611
|
-
".cookie-banner",
|
|
612
|
-
".cookie-consent",
|
|
613
|
-
".hidden",
|
|
614
|
-
".hide",
|
|
615
|
-
".modal",
|
|
616
|
-
".nav-bar",
|
|
617
|
-
".overlay",
|
|
618
|
-
".popup",
|
|
619
|
-
".promo",
|
|
620
|
-
".mw-editsection",
|
|
621
|
-
".side-bar",
|
|
622
|
-
".social-share",
|
|
623
|
-
".sticky",
|
|
624
|
-
"#ads",
|
|
625
|
-
"#banner",
|
|
626
|
-
"#cookieBanner",
|
|
627
|
-
"#modal",
|
|
628
|
-
"#nav",
|
|
629
|
-
"#overlay",
|
|
630
|
-
"#popup",
|
|
631
|
-
"#sidebar",
|
|
632
|
-
"#socialMediaBox",
|
|
633
|
-
"#stickyHeader",
|
|
634
|
-
"#ad-container",
|
|
635
|
-
".ad-container",
|
|
636
|
-
".login-form",
|
|
637
|
-
".signup-form",
|
|
638
|
-
".tooltip",
|
|
639
|
-
".dropdown-menu",
|
|
640
|
-
// ".alert", // Keep commented
|
|
641
|
-
".breadcrumb",
|
|
642
|
-
".pagination",
|
|
643
|
-
// '[role="alert"]', // Keep commented
|
|
644
|
-
'[role="banner"]',
|
|
645
|
-
'[role="dialog"]',
|
|
646
|
-
'[role="alertdialog"]',
|
|
647
|
-
'[role="region"][aria-label*="skip" i]',
|
|
648
|
-
'[aria-modal="true"]',
|
|
649
|
-
".noprint"
|
|
650
|
-
];
|
|
651
|
-
async process(context, next) {
|
|
652
|
-
const $ = context.dom;
|
|
653
|
-
if (!$) {
|
|
654
|
-
logger.warn(
|
|
655
|
-
`Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`
|
|
656
|
-
);
|
|
657
|
-
await next();
|
|
658
|
-
return;
|
|
659
|
-
}
|
|
660
|
-
try {
|
|
661
|
-
const selectorsToRemove = [
|
|
662
|
-
...context.options.excludeSelectors || [],
|
|
663
|
-
// Use options from the context
|
|
664
|
-
...this.defaultSelectorsToRemove
|
|
665
|
-
];
|
|
666
|
-
logger.debug(
|
|
667
|
-
`Removing elements matching ${selectorsToRemove.length} selectors for ${context.source}`
|
|
668
|
-
);
|
|
669
|
-
let removedCount = 0;
|
|
670
|
-
for (const selector of selectorsToRemove) {
|
|
671
|
-
try {
|
|
672
|
-
const elements = $(selector);
|
|
673
|
-
const count = elements.length;
|
|
674
|
-
if (count > 0) {
|
|
675
|
-
elements.remove();
|
|
676
|
-
removedCount += count;
|
|
677
|
-
}
|
|
678
|
-
} catch (selectorError) {
|
|
679
|
-
logger.warn(
|
|
680
|
-
`Potentially invalid selector "${selector}" during element removal: ${selectorError}`
|
|
681
|
-
);
|
|
682
|
-
context.errors.push(
|
|
683
|
-
new Error(`Invalid selector "${selector}": ${selectorError}`)
|
|
684
|
-
);
|
|
685
|
-
}
|
|
686
|
-
}
|
|
687
|
-
logger.debug(`Removed ${removedCount} elements for ${context.source}`);
|
|
688
|
-
} catch (error) {
|
|
689
|
-
logger.error(`Error during HTML element removal for ${context.source}: ${error}`);
|
|
690
|
-
context.errors.push(
|
|
691
|
-
error instanceof Error ? error : new Error(`HTML element removal failed: ${String(error)}`)
|
|
692
|
-
);
|
|
693
|
-
}
|
|
694
|
-
await next();
|
|
695
|
-
}
|
|
696
|
-
}
|
|
697
|
-
class HtmlToMarkdownMiddleware {
|
|
698
|
-
turndownService;
|
|
699
|
-
constructor() {
|
|
700
|
-
this.turndownService = new TurndownService({
|
|
701
|
-
headingStyle: "atx",
|
|
702
|
-
hr: "---",
|
|
703
|
-
bulletListMarker: "-",
|
|
704
|
-
codeBlockStyle: "fenced",
|
|
705
|
-
emDelimiter: "_",
|
|
706
|
-
strongDelimiter: "**",
|
|
707
|
-
linkStyle: "inlined"
|
|
708
|
-
});
|
|
709
|
-
this.turndownService.use(gfm);
|
|
710
|
-
this.addCustomRules();
|
|
711
|
-
}
|
|
712
|
-
addCustomRules() {
|
|
713
|
-
this.turndownService.addRule("pre", {
|
|
714
|
-
filter: ["pre"],
|
|
715
|
-
replacement: (content, node) => {
|
|
716
|
-
const element = node;
|
|
717
|
-
let language = element.getAttribute("data-language") || "";
|
|
718
|
-
if (!language) {
|
|
719
|
-
const highlightElement = element.closest(
|
|
720
|
-
'[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]'
|
|
721
|
-
) || element.querySelector(
|
|
722
|
-
'[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]'
|
|
723
|
-
);
|
|
724
|
-
if (highlightElement) {
|
|
725
|
-
const className = highlightElement.className;
|
|
726
|
-
const match = className.match(
|
|
727
|
-
/(?:highlight-source-|highlight-|language-)(\w+)/
|
|
728
|
-
);
|
|
729
|
-
if (match) language = match[1];
|
|
730
|
-
}
|
|
731
|
-
}
|
|
732
|
-
const brElements = Array.from(element.querySelectorAll("br"));
|
|
733
|
-
for (const br of brElements) {
|
|
734
|
-
br.replaceWith("\n");
|
|
735
|
-
}
|
|
736
|
-
const text = element.textContent || "";
|
|
737
|
-
return `
|
|
738
|
-
\`\`\`${language}
|
|
739
|
-
${text.replace(/^\n+|\n+$/g, "")}
|
|
740
|
-
\`\`\`
|
|
741
|
-
`;
|
|
742
|
-
}
|
|
743
|
-
});
|
|
744
|
-
this.turndownService.addRule("anchor", {
|
|
745
|
-
filter: ["a"],
|
|
746
|
-
replacement: (content, node) => {
|
|
747
|
-
const href = node.getAttribute("href");
|
|
748
|
-
if (!content || content === "#") {
|
|
749
|
-
return "";
|
|
750
|
-
}
|
|
751
|
-
if (!href) {
|
|
752
|
-
return content;
|
|
753
|
-
}
|
|
754
|
-
return `[${content}](${href})`;
|
|
755
|
-
}
|
|
756
|
-
});
|
|
757
|
-
}
|
|
758
|
-
/**
|
|
759
|
-
* Processes the context to convert the sanitized HTML body node to Markdown.
|
|
760
|
-
* @param context The current processing context.
|
|
761
|
-
* @param next Function to call the next middleware.
|
|
762
|
-
*/
|
|
763
|
-
async process(context, next) {
|
|
764
|
-
const $ = context.dom;
|
|
765
|
-
if (!$) {
|
|
766
|
-
logger.warn(
|
|
767
|
-
`Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware ran correctly.`
|
|
768
|
-
);
|
|
769
|
-
await next();
|
|
770
|
-
return;
|
|
771
|
-
}
|
|
772
|
-
try {
|
|
773
|
-
logger.debug(`Converting HTML content to Markdown for ${context.source}`);
|
|
774
|
-
const htmlToConvert = $("body").html() || $.html();
|
|
775
|
-
const markdown = this.turndownService.turndown(htmlToConvert).trim();
|
|
776
|
-
if (!markdown) {
|
|
777
|
-
const warnMsg = `HTML to Markdown conversion resulted in empty content for ${context.source}.`;
|
|
778
|
-
logger.warn(warnMsg);
|
|
779
|
-
context.content = "";
|
|
780
|
-
} else {
|
|
781
|
-
context.content = markdown;
|
|
782
|
-
logger.debug(`Successfully converted HTML to Markdown for ${context.source}`);
|
|
783
|
-
}
|
|
784
|
-
} catch (error) {
|
|
785
|
-
logger.error(`Error converting HTML to Markdown for ${context.source}: ${error}`);
|
|
786
|
-
context.errors.push(
|
|
787
|
-
new Error(
|
|
788
|
-
`Failed to convert HTML to Markdown: ${error instanceof Error ? error.message : String(error)}`
|
|
789
|
-
)
|
|
790
|
-
);
|
|
791
|
-
}
|
|
792
|
-
await next();
|
|
793
|
-
}
|
|
794
|
-
}
|
|
795
|
-
class MarkdownLinkExtractorMiddleware {
|
|
796
|
-
/**
|
|
797
|
-
* Processes the context. Currently a no-op regarding link extraction.
|
|
798
|
-
* @param context The current processing context.
|
|
799
|
-
* @param next Function to call the next middleware.
|
|
800
|
-
*/
|
|
801
|
-
async process(context, next) {
|
|
802
|
-
if (!Array.isArray(context.links)) {
|
|
803
|
-
context.links = [];
|
|
804
|
-
}
|
|
805
|
-
await next();
|
|
806
|
-
}
|
|
807
|
-
}
|
|
808
|
-
class MarkdownMetadataExtractorMiddleware {
|
|
809
|
-
/**
|
|
810
|
-
* Processes the context to extract the title from Markdown.
|
|
811
|
-
* @param context The current processing context.
|
|
812
|
-
* @param next Function to call the next middleware.
|
|
813
|
-
*/
|
|
814
|
-
async process(context, next) {
|
|
815
|
-
try {
|
|
816
|
-
let title = "Untitled";
|
|
817
|
-
const match = context.content.match(/^#\s+(.*)$/m);
|
|
818
|
-
if (match?.[1]) {
|
|
819
|
-
title = match[1].trim();
|
|
820
|
-
}
|
|
821
|
-
context.metadata.title = title;
|
|
822
|
-
} catch (error) {
|
|
823
|
-
context.errors.push(
|
|
824
|
-
new Error(
|
|
825
|
-
`Failed to extract metadata from Markdown: ${error instanceof Error ? error.message : String(error)}`
|
|
826
|
-
)
|
|
827
|
-
);
|
|
828
|
-
}
|
|
829
|
-
await next();
|
|
830
|
-
}
|
|
831
|
-
}
|
|
832
|
-
function convertToString(content, charset) {
|
|
833
|
-
if (Buffer.isBuffer(content)) {
|
|
834
|
-
const decoder = new TextDecoder(charset || "utf-8");
|
|
835
|
-
return decoder.decode(content);
|
|
836
|
-
}
|
|
837
|
-
return content;
|
|
838
|
-
}
|
|
839
|
-
class BasePipeline {
|
|
840
|
-
/**
|
|
841
|
-
* Determines if this pipeline can process the given content.
|
|
842
|
-
* Must be implemented by derived classes.
|
|
843
|
-
*/
|
|
844
|
-
canProcess(_rawContent) {
|
|
845
|
-
throw new Error("Method not implemented.");
|
|
846
|
-
}
|
|
847
|
-
/**
|
|
848
|
-
* Processes the raw content through the pipeline.
|
|
849
|
-
* Must be implemented by derived classes.
|
|
850
|
-
*/
|
|
851
|
-
async process(_rawContent, _options, _fetcher) {
|
|
852
|
-
throw new Error("Method not implemented.");
|
|
853
|
-
}
|
|
854
|
-
/**
|
|
855
|
-
* Executes a middleware stack on the given context.
|
|
856
|
-
* This is a utility method used by derived pipeline classes.
|
|
857
|
-
*
|
|
858
|
-
* @param middleware - The middleware stack to execute
|
|
859
|
-
* @param context - The context to process
|
|
860
|
-
*/
|
|
861
|
-
async executeMiddlewareStack(middleware, context) {
|
|
862
|
-
let index = -1;
|
|
863
|
-
const dispatch = async (i) => {
|
|
864
|
-
if (i <= index) throw new Error("next() called multiple times");
|
|
865
|
-
index = i;
|
|
866
|
-
const mw = middleware[i];
|
|
867
|
-
if (!mw) return;
|
|
868
|
-
await mw.process(context, dispatch.bind(null, i + 1));
|
|
869
|
-
};
|
|
870
|
-
try {
|
|
871
|
-
await dispatch(0);
|
|
872
|
-
} catch (error) {
|
|
873
|
-
context.errors.push(error instanceof Error ? error : new Error(String(error)));
|
|
874
|
-
}
|
|
875
|
-
}
|
|
876
|
-
/**
|
|
877
|
-
* Cleans up resources when the pipeline is no longer needed.
|
|
878
|
-
* Default implementation does nothing.
|
|
879
|
-
*/
|
|
880
|
-
async close() {
|
|
881
|
-
}
|
|
882
|
-
}
|
|
883
|
-
class HtmlPipeline extends BasePipeline {
|
|
884
|
-
playwrightMiddleware;
|
|
885
|
-
standardMiddleware;
|
|
886
|
-
constructor() {
|
|
887
|
-
super();
|
|
888
|
-
this.playwrightMiddleware = new HtmlPlaywrightMiddleware();
|
|
889
|
-
this.standardMiddleware = [
|
|
890
|
-
new HtmlCheerioParserMiddleware(),
|
|
891
|
-
new HtmlMetadataExtractorMiddleware(),
|
|
892
|
-
new HtmlLinkExtractorMiddleware(),
|
|
893
|
-
new HtmlSanitizerMiddleware(),
|
|
894
|
-
new HtmlToMarkdownMiddleware()
|
|
895
|
-
];
|
|
896
|
-
}
|
|
897
|
-
canProcess(rawContent) {
|
|
898
|
-
return MimeTypeUtils.isHtml(rawContent.mimeType);
|
|
899
|
-
}
|
|
900
|
-
async process(rawContent, options, fetcher) {
|
|
901
|
-
const contentString = convertToString(rawContent.content, rawContent.charset);
|
|
902
|
-
const context = {
|
|
903
|
-
content: contentString,
|
|
904
|
-
source: rawContent.source,
|
|
905
|
-
metadata: {},
|
|
906
|
-
links: [],
|
|
907
|
-
errors: [],
|
|
908
|
-
options,
|
|
909
|
-
fetcher
|
|
910
|
-
};
|
|
911
|
-
let middleware = [...this.standardMiddleware];
|
|
912
|
-
if (options.scrapeMode === "playwright" || options.scrapeMode === "auto") {
|
|
913
|
-
middleware = [this.playwrightMiddleware, ...middleware];
|
|
914
|
-
}
|
|
915
|
-
await this.executeMiddlewareStack(middleware, context);
|
|
916
|
-
return {
|
|
917
|
-
textContent: typeof context.content === "string" ? context.content : "",
|
|
918
|
-
metadata: context.metadata,
|
|
919
|
-
links: context.links,
|
|
920
|
-
errors: context.errors
|
|
921
|
-
};
|
|
922
|
-
}
|
|
923
|
-
async close() {
|
|
924
|
-
await this.playwrightMiddleware.closeBrowser();
|
|
925
|
-
}
|
|
926
|
-
}
|
|
927
|
-
class MarkdownPipeline extends BasePipeline {
|
|
928
|
-
middleware;
|
|
929
|
-
constructor() {
|
|
930
|
-
super();
|
|
931
|
-
this.middleware = [
|
|
932
|
-
new MarkdownMetadataExtractorMiddleware(),
|
|
933
|
-
new MarkdownLinkExtractorMiddleware()
|
|
934
|
-
];
|
|
935
|
-
}
|
|
936
|
-
canProcess(rawContent) {
|
|
937
|
-
if (!rawContent.mimeType) return false;
|
|
938
|
-
return MimeTypeUtils.isMarkdown(rawContent.mimeType) || MimeTypeUtils.isText(rawContent.mimeType);
|
|
939
|
-
}
|
|
940
|
-
async process(rawContent, options, fetcher) {
|
|
941
|
-
const contentString = convertToString(rawContent.content, rawContent.charset);
|
|
942
|
-
const context = {
|
|
943
|
-
content: contentString,
|
|
944
|
-
source: rawContent.source,
|
|
945
|
-
metadata: {},
|
|
946
|
-
links: [],
|
|
947
|
-
errors: [],
|
|
948
|
-
options,
|
|
949
|
-
fetcher
|
|
950
|
-
};
|
|
951
|
-
await this.executeMiddlewareStack(this.middleware, context);
|
|
952
|
-
return {
|
|
953
|
-
textContent: typeof context.content === "string" ? context.content : "",
|
|
954
|
-
metadata: context.metadata,
|
|
955
|
-
links: context.links,
|
|
956
|
-
errors: context.errors
|
|
957
|
-
};
|
|
958
|
-
}
|
|
959
|
-
async close() {
|
|
960
|
-
}
|
|
961
|
-
}
|
|
962
|
-
class PipelineError extends Error {
|
|
963
|
-
constructor(message, cause) {
|
|
964
|
-
super(message);
|
|
965
|
-
this.cause = cause;
|
|
966
|
-
this.name = this.constructor.name;
|
|
967
|
-
if (cause?.stack) {
|
|
968
|
-
this.stack = `${this.stack}
|
|
969
|
-
Caused by: ${cause.stack}`;
|
|
970
|
-
}
|
|
971
|
-
}
|
|
972
|
-
}
|
|
973
|
-
class PipelineStateError extends PipelineError {
|
|
974
|
-
}
|
|
975
|
-
class CancellationError extends PipelineError {
|
|
976
|
-
constructor(message = "Operation cancelled") {
|
|
977
|
-
super(message);
|
|
978
|
-
}
|
|
979
|
-
}
|
|
980
|
-
const DEFAULT_MAX_PAGES = 100;
|
|
981
|
-
const DEFAULT_MAX_DEPTH = 3;
|
|
982
|
-
const DEFAULT_CONCURRENCY$1 = 3;
|
|
983
|
-
class BaseScraperStrategy {
|
|
984
|
-
visited = /* @__PURE__ */ new Set();
|
|
985
|
-
pageCount = 0;
|
|
986
|
-
options;
|
|
987
|
-
constructor(options = {}) {
|
|
988
|
-
this.options = options;
|
|
989
|
-
}
|
|
990
|
-
// Removed getProcessor method as processing is now handled by strategies using middleware pipelines
|
|
991
|
-
async processBatch(batch, baseUrl, options, progressCallback, signal) {
|
|
992
|
-
const results = await Promise.all(
|
|
993
|
-
batch.map(async (item) => {
|
|
994
|
-
if (signal?.aborted) {
|
|
995
|
-
throw new CancellationError("Scraping cancelled during batch processing");
|
|
996
|
-
}
|
|
997
|
-
const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH;
|
|
998
|
-
if (item.depth > maxDepth) {
|
|
999
|
-
return [];
|
|
1000
|
-
}
|
|
1001
|
-
try {
|
|
1002
|
-
const result = await this.processItem(item, options, void 0, signal);
|
|
1003
|
-
if (result.document) {
|
|
1004
|
-
this.pageCount++;
|
|
1005
|
-
const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
|
|
1006
|
-
logger.info(
|
|
1007
|
-
`🌐 Scraping page ${this.pageCount}/${maxPages} (depth ${item.depth}/${maxDepth}): ${item.url}`
|
|
1008
|
-
);
|
|
1009
|
-
await progressCallback({
|
|
1010
|
-
pagesScraped: this.pageCount,
|
|
1011
|
-
maxPages,
|
|
1012
|
-
currentUrl: item.url,
|
|
1013
|
-
depth: item.depth,
|
|
1014
|
-
maxDepth,
|
|
1015
|
-
document: result.document
|
|
1016
|
-
});
|
|
1017
|
-
}
|
|
1018
|
-
const nextItems = result.links || [];
|
|
1019
|
-
return nextItems.map((value) => {
|
|
1020
|
-
try {
|
|
1021
|
-
const targetUrl = new URL$1(value, baseUrl);
|
|
1022
|
-
return {
|
|
1023
|
-
url: targetUrl.href,
|
|
1024
|
-
depth: item.depth + 1
|
|
1025
|
-
};
|
|
1026
|
-
} catch (error) {
|
|
1027
|
-
logger.warn(`❌ Invalid URL: ${value}`);
|
|
1028
|
-
}
|
|
1029
|
-
return null;
|
|
1030
|
-
}).filter((item2) => item2 !== null);
|
|
1031
|
-
} catch (error) {
|
|
1032
|
-
if (options.ignoreErrors) {
|
|
1033
|
-
logger.error(`❌ Failed to process ${item.url}: ${error}`);
|
|
1034
|
-
return [];
|
|
1035
|
-
}
|
|
1036
|
-
throw error;
|
|
1037
|
-
}
|
|
1038
|
-
})
|
|
1039
|
-
);
|
|
1040
|
-
const allLinks = results.flat();
|
|
1041
|
-
const uniqueLinks = [];
|
|
1042
|
-
for (const item of allLinks) {
|
|
1043
|
-
const normalizedUrl = normalizeUrl(item.url, this.options.urlNormalizerOptions);
|
|
1044
|
-
if (!this.visited.has(normalizedUrl)) {
|
|
1045
|
-
this.visited.add(normalizedUrl);
|
|
1046
|
-
uniqueLinks.push(item);
|
|
1047
|
-
}
|
|
1048
|
-
}
|
|
1049
|
-
return uniqueLinks;
|
|
1050
|
-
}
|
|
1051
|
-
async scrape(options, progressCallback, signal) {
|
|
1052
|
-
this.visited.clear();
|
|
1053
|
-
this.pageCount = 0;
|
|
1054
|
-
const baseUrl = new URL$1(options.url);
|
|
1055
|
-
const queue = [{ url: options.url, depth: 0 }];
|
|
1056
|
-
this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions));
|
|
1057
|
-
const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
|
|
1058
|
-
const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY$1;
|
|
1059
|
-
while (queue.length > 0 && this.pageCount < maxPages) {
|
|
1060
|
-
if (signal?.aborted) {
|
|
1061
|
-
logger.info("Scraping cancelled by signal.");
|
|
1062
|
-
throw new CancellationError("Scraping cancelled by signal");
|
|
1063
|
-
}
|
|
1064
|
-
const remainingPages = maxPages - this.pageCount;
|
|
1065
|
-
if (remainingPages <= 0) {
|
|
1066
|
-
break;
|
|
1067
|
-
}
|
|
1068
|
-
const batchSize = Math.min(
|
|
1069
|
-
maxConcurrency,
|
|
1070
|
-
// Use variable
|
|
1071
|
-
remainingPages,
|
|
1072
|
-
queue.length
|
|
1073
|
-
);
|
|
1074
|
-
const batch = queue.splice(0, batchSize);
|
|
1075
|
-
const newUrls = await this.processBatch(
|
|
1076
|
-
batch,
|
|
1077
|
-
baseUrl,
|
|
1078
|
-
options,
|
|
1079
|
-
progressCallback,
|
|
1080
|
-
signal
|
|
1081
|
-
);
|
|
1082
|
-
queue.push(...newUrls);
|
|
1083
|
-
}
|
|
1084
|
-
}
|
|
1085
|
-
}
|
|
1086
|
-
class WebScraperStrategy extends BaseScraperStrategy {
|
|
1087
|
-
httpFetcher = new HttpFetcher();
|
|
1088
|
-
shouldFollowLinkFn;
|
|
1089
|
-
htmlPipeline;
|
|
1090
|
-
markdownPipeline;
|
|
1091
|
-
pipelines;
|
|
1092
|
-
constructor(options = {}) {
|
|
1093
|
-
super({ urlNormalizerOptions: options.urlNormalizerOptions });
|
|
1094
|
-
this.shouldFollowLinkFn = options.shouldFollowLink;
|
|
1095
|
-
this.htmlPipeline = new HtmlPipeline();
|
|
1096
|
-
this.markdownPipeline = new MarkdownPipeline();
|
|
1097
|
-
this.pipelines = [this.htmlPipeline, this.markdownPipeline];
|
|
1098
|
-
}
|
|
1099
|
-
canHandle(url) {
|
|
1100
|
-
try {
|
|
1101
|
-
const parsedUrl = new URL(url);
|
|
1102
|
-
return parsedUrl.protocol === "http:" || parsedUrl.protocol === "https:";
|
|
1103
|
-
} catch {
|
|
1104
|
-
return false;
|
|
1105
|
-
}
|
|
1106
|
-
}
|
|
1107
|
-
/**
|
|
1108
|
-
* Determines if a target URL should be followed based on the scope setting.
|
|
1109
|
-
*/
|
|
1110
|
-
isInScope(baseUrl, targetUrl, scope) {
|
|
1111
|
-
try {
|
|
1112
|
-
if (scope === "domain") {
|
|
1113
|
-
return hasSameDomain(baseUrl, targetUrl);
|
|
1114
|
-
}
|
|
1115
|
-
if (scope === "hostname") {
|
|
1116
|
-
return hasSameHostname(baseUrl, targetUrl);
|
|
1117
|
-
}
|
|
1118
|
-
return hasSameHostname(baseUrl, targetUrl) && isSubpath(baseUrl, targetUrl);
|
|
1119
|
-
} catch {
|
|
1120
|
-
return false;
|
|
1121
|
-
}
|
|
1122
|
-
}
|
|
1123
|
-
async processItem(item, options, _progressCallback, signal) {
|
|
1124
|
-
const { url } = item;
|
|
1125
|
-
try {
|
|
1126
|
-
const fetchOptions = {
|
|
1127
|
-
signal,
|
|
1128
|
-
followRedirects: options.followRedirects
|
|
1129
|
-
};
|
|
1130
|
-
const rawContent = await this.httpFetcher.fetch(url, fetchOptions);
|
|
1131
|
-
let processed;
|
|
1132
|
-
for (const pipeline of this.pipelines) {
|
|
1133
|
-
if (pipeline.canProcess(rawContent)) {
|
|
1134
|
-
processed = await pipeline.process(rawContent, options, this.httpFetcher);
|
|
1135
|
-
break;
|
|
1136
|
-
}
|
|
1137
|
-
}
|
|
1138
|
-
if (!processed) {
|
|
1139
|
-
logger.warn(
|
|
1140
|
-
`Unsupported content type "${rawContent.mimeType}" for URL ${url}. Skipping processing.`
|
|
1141
|
-
);
|
|
1142
|
-
return { document: void 0, links: [] };
|
|
1143
|
-
}
|
|
1144
|
-
for (const err of processed.errors) {
|
|
1145
|
-
logger.warn(`Processing error for ${url}: ${err.message}`);
|
|
1146
|
-
}
|
|
1147
|
-
if (!processed.textContent || !processed.textContent.trim()) {
|
|
1148
|
-
logger.warn(`No processable content found for ${url} after pipeline execution.`);
|
|
1149
|
-
return { document: void 0, links: processed.links };
|
|
1150
|
-
}
|
|
1151
|
-
const baseUrl = new URL(options.url);
|
|
1152
|
-
const filteredLinks = processed.links.filter((link) => {
|
|
1153
|
-
try {
|
|
1154
|
-
const targetUrl = new URL(link);
|
|
1155
|
-
const scope = options.scope || "subpages";
|
|
1156
|
-
return this.isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
|
|
1157
|
-
} catch {
|
|
1158
|
-
return false;
|
|
1159
|
-
}
|
|
1160
|
-
});
|
|
1161
|
-
return {
|
|
1162
|
-
document: {
|
|
1163
|
-
content: processed.textContent,
|
|
1164
|
-
metadata: {
|
|
1165
|
-
url,
|
|
1166
|
-
title: typeof processed.metadata.title === "string" ? processed.metadata.title : "Untitled",
|
|
1167
|
-
library: options.library,
|
|
1168
|
-
version: options.version,
|
|
1169
|
-
...processed.metadata
|
|
1170
|
-
}
|
|
1171
|
-
},
|
|
1172
|
-
links: filteredLinks
|
|
1173
|
-
};
|
|
1174
|
-
} catch (error) {
|
|
1175
|
-
logger.error(`Failed processing page ${url}: ${error}`);
|
|
1176
|
-
throw error;
|
|
1177
|
-
}
|
|
1178
|
-
}
|
|
1179
|
-
/**
|
|
1180
|
-
* Overrides the base scrape method to ensure the Playwright browser is closed
|
|
1181
|
-
* after the scraping process completes or errors out.
|
|
1182
|
-
*/
|
|
1183
|
-
async scrape(options, progressCallback, signal) {
|
|
1184
|
-
try {
|
|
1185
|
-
await super.scrape(options, progressCallback, signal);
|
|
1186
|
-
} finally {
|
|
1187
|
-
await this.htmlPipeline.close();
|
|
1188
|
-
await this.markdownPipeline.close();
|
|
1189
|
-
}
|
|
1190
|
-
}
|
|
1191
|
-
}
|
|
1192
|
-
class GitHubScraperStrategy {
|
|
1193
|
-
defaultStrategy;
|
|
1194
|
-
canHandle(url) {
|
|
1195
|
-
const { hostname } = new URL(url);
|
|
1196
|
-
return ["github.com", "www.github.com"].includes(hostname);
|
|
1197
|
-
}
|
|
1198
|
-
constructor() {
|
|
1199
|
-
const shouldFollowLink = (baseUrl, targetUrl) => {
|
|
1200
|
-
if (this.getRepoPath(baseUrl) !== this.getRepoPath(targetUrl)) {
|
|
1201
|
-
return false;
|
|
1202
|
-
}
|
|
1203
|
-
const path2 = targetUrl.pathname;
|
|
1204
|
-
if (path2 === this.getRepoPath(targetUrl)) {
|
|
1205
|
-
return true;
|
|
1206
|
-
}
|
|
1207
|
-
if (path2.startsWith(`${this.getRepoPath(targetUrl)}/wiki`)) {
|
|
1208
|
-
return true;
|
|
1209
|
-
}
|
|
1210
|
-
if (path2.startsWith(`${this.getRepoPath(targetUrl)}/blob/`) && path2.endsWith(".md")) {
|
|
1211
|
-
return true;
|
|
1212
|
-
}
|
|
1213
|
-
return false;
|
|
1214
|
-
};
|
|
1215
|
-
this.defaultStrategy = new WebScraperStrategy({
|
|
1216
|
-
urlNormalizerOptions: {
|
|
1217
|
-
ignoreCase: true,
|
|
1218
|
-
removeHash: true,
|
|
1219
|
-
removeTrailingSlash: true,
|
|
1220
|
-
removeQuery: true
|
|
1221
|
-
// Remove query parameters like ?tab=readme-ov-file
|
|
1222
|
-
},
|
|
1223
|
-
shouldFollowLink
|
|
1224
|
-
});
|
|
1225
|
-
}
|
|
1226
|
-
getRepoPath(url) {
|
|
1227
|
-
const match = url.pathname.match(/^\/[^/]+\/[^/]+/);
|
|
1228
|
-
return match?.[0] || "";
|
|
1229
|
-
}
|
|
1230
|
-
async scrape(options, progressCallback, signal) {
|
|
1231
|
-
const url = new URL(options.url);
|
|
1232
|
-
if (!url.hostname.includes("github.com")) {
|
|
1233
|
-
throw new Error("URL must be a GitHub URL");
|
|
1234
|
-
}
|
|
1235
|
-
await this.defaultStrategy.scrape(options, progressCallback, signal);
|
|
1236
|
-
}
|
|
1237
|
-
}
|
|
1238
|
-
class LocalFileStrategy extends BaseScraperStrategy {
|
|
1239
|
-
fileFetcher = new FileFetcher();
|
|
1240
|
-
htmlPipeline;
|
|
1241
|
-
markdownPipeline;
|
|
1242
|
-
pipelines;
|
|
1243
|
-
constructor() {
|
|
1244
|
-
super();
|
|
1245
|
-
this.htmlPipeline = new HtmlPipeline();
|
|
1246
|
-
this.markdownPipeline = new MarkdownPipeline();
|
|
1247
|
-
this.pipelines = [this.htmlPipeline, this.markdownPipeline];
|
|
1248
|
-
}
|
|
1249
|
-
canHandle(url) {
|
|
1250
|
-
return url.startsWith("file://");
|
|
1251
|
-
}
|
|
1252
|
-
async processItem(item, options, _progressCallback, _signal) {
|
|
1253
|
-
const filePath = item.url.replace(/^file:\/\//, "");
|
|
1254
|
-
const stats = await fs.stat(filePath);
|
|
1255
|
-
if (stats.isDirectory()) {
|
|
1256
|
-
const contents = await fs.readdir(filePath);
|
|
1257
|
-
return {
|
|
1258
|
-
links: contents.map((name) => `file://${path.join(filePath, name)}`)
|
|
1259
|
-
};
|
|
1260
|
-
}
|
|
1261
|
-
logger.info(`📄 Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`);
|
|
1262
|
-
const rawContent = await this.fileFetcher.fetch(item.url);
|
|
1263
|
-
let processed;
|
|
1264
|
-
for (const pipeline of this.pipelines) {
|
|
1265
|
-
if (pipeline.canProcess(rawContent)) {
|
|
1266
|
-
processed = await pipeline.process(rawContent, options, this.fileFetcher);
|
|
1267
|
-
break;
|
|
1268
|
-
}
|
|
1269
|
-
}
|
|
1270
|
-
if (!processed) {
|
|
1271
|
-
logger.warn(
|
|
1272
|
-
`Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
|
|
1273
|
-
);
|
|
1274
|
-
return { document: void 0, links: [] };
|
|
1275
|
-
}
|
|
1276
|
-
for (const err of processed.errors) {
|
|
1277
|
-
logger.warn(`Processing error for ${filePath}: ${err.message}`);
|
|
1278
|
-
}
|
|
1279
|
-
return {
|
|
1280
|
-
document: {
|
|
1281
|
-
content: typeof processed.textContent === "string" ? processed.textContent : "",
|
|
1282
|
-
metadata: {
|
|
1283
|
-
url: rawContent.source,
|
|
1284
|
-
title: typeof processed.metadata.title === "string" ? processed.metadata.title : "Untitled",
|
|
1285
|
-
library: options.library,
|
|
1286
|
-
version: options.version
|
|
1287
|
-
}
|
|
1288
|
-
}
|
|
1289
|
-
};
|
|
1290
|
-
}
|
|
1291
|
-
async scrape(options, progressCallback, signal) {
|
|
1292
|
-
try {
|
|
1293
|
-
await super.scrape(options, progressCallback, signal);
|
|
1294
|
-
} finally {
|
|
1295
|
-
await this.htmlPipeline.close();
|
|
1296
|
-
await this.markdownPipeline.close();
|
|
1297
|
-
}
|
|
1298
|
-
}
|
|
1299
|
-
}
|
|
1300
|
-
class NpmScraperStrategy {
|
|
1301
|
-
defaultStrategy;
|
|
1302
|
-
canHandle(url) {
|
|
1303
|
-
const { hostname } = new URL(url);
|
|
1304
|
-
return ["npmjs.org", "npmjs.com", "www.npmjs.com"].includes(hostname);
|
|
1305
|
-
}
|
|
1306
|
-
constructor() {
|
|
1307
|
-
this.defaultStrategy = new WebScraperStrategy({
|
|
1308
|
-
urlNormalizerOptions: {
|
|
1309
|
-
ignoreCase: true,
|
|
1310
|
-
removeHash: true,
|
|
1311
|
-
removeTrailingSlash: true,
|
|
1312
|
-
removeQuery: true
|
|
1313
|
-
// Enable removeQuery for NPM packages
|
|
1314
|
-
}
|
|
1315
|
-
});
|
|
1316
|
-
}
|
|
1317
|
-
async scrape(options, progressCallback, signal) {
|
|
1318
|
-
await this.defaultStrategy.scrape(options, progressCallback, signal);
|
|
1319
|
-
}
|
|
1320
|
-
}
|
|
1321
|
-
class PyPiScraperStrategy {
|
|
1322
|
-
defaultStrategy;
|
|
1323
|
-
canHandle(url) {
|
|
1324
|
-
const { hostname } = new URL(url);
|
|
1325
|
-
return ["pypi.org", "www.pypi.org"].includes(hostname);
|
|
1326
|
-
}
|
|
1327
|
-
constructor() {
|
|
1328
|
-
this.defaultStrategy = new WebScraperStrategy({
|
|
1329
|
-
urlNormalizerOptions: {
|
|
1330
|
-
ignoreCase: true,
|
|
1331
|
-
removeHash: true,
|
|
1332
|
-
removeTrailingSlash: true,
|
|
1333
|
-
removeQuery: true
|
|
1334
|
-
// Enable removeQuery for PyPI packages
|
|
1335
|
-
}
|
|
1336
|
-
});
|
|
1337
|
-
}
|
|
1338
|
-
async scrape(options, progressCallback, signal) {
|
|
1339
|
-
await this.defaultStrategy.scrape(options, progressCallback, signal);
|
|
1340
|
-
}
|
|
1341
|
-
}
|
|
1342
|
-
class ScraperRegistry {
|
|
1343
|
-
strategies;
|
|
1344
|
-
constructor() {
|
|
1345
|
-
this.strategies = [
|
|
1346
|
-
new NpmScraperStrategy(),
|
|
1347
|
-
new PyPiScraperStrategy(),
|
|
1348
|
-
new GitHubScraperStrategy(),
|
|
1349
|
-
new WebScraperStrategy(),
|
|
1350
|
-
new LocalFileStrategy()
|
|
1351
|
-
];
|
|
1352
|
-
}
|
|
1353
|
-
getStrategy(url) {
|
|
1354
|
-
validateUrl(url);
|
|
1355
|
-
const strategy = this.strategies.find((s) => s.canHandle(url));
|
|
1356
|
-
if (!strategy) {
|
|
1357
|
-
throw new ScraperError(`No strategy found for URL: ${url}`);
|
|
1358
|
-
}
|
|
1359
|
-
return strategy;
|
|
1360
|
-
}
|
|
1361
|
-
}
|
|
1362
|
-
class ScraperService {
|
|
1363
|
-
registry;
|
|
1364
|
-
constructor(registry) {
|
|
1365
|
-
this.registry = registry;
|
|
1366
|
-
}
|
|
1367
|
-
/**
|
|
1368
|
-
* Scrapes content from the provided URL using the appropriate strategy.
|
|
1369
|
-
* Reports progress via callback and handles errors.
|
|
1370
|
-
*/
|
|
1371
|
-
async scrape(options, progressCallback, signal) {
|
|
1372
|
-
const strategy = this.registry.getStrategy(options.url);
|
|
1373
|
-
if (!strategy) {
|
|
1374
|
-
throw new ScraperError(`No scraper strategy found for URL: ${options.url}`, false);
|
|
1375
|
-
}
|
|
1376
|
-
await strategy.scrape(options, progressCallback, signal);
|
|
1377
|
-
}
|
|
1378
|
-
}
|
|
1379
|
-
class PipelineWorker {
|
|
1380
|
-
// Dependencies are passed in, making the worker stateless regarding specific jobs
|
|
1381
|
-
store;
|
|
1382
|
-
scraperService;
|
|
1383
|
-
// Constructor accepts dependencies needed for execution
|
|
1384
|
-
constructor(store, scraperService) {
|
|
1385
|
-
this.store = store;
|
|
1386
|
-
this.scraperService = scraperService;
|
|
1387
|
-
}
|
|
1388
|
-
/**
|
|
1389
|
-
* Executes the given pipeline job.
|
|
1390
|
-
* @param job - The job to execute.
|
|
1391
|
-
* @param callbacks - Callbacks provided by the manager for reporting.
|
|
1392
|
-
*/
|
|
1393
|
-
async executeJob(job, callbacks) {
|
|
1394
|
-
const { id: jobId, library, version, options, abortController } = job;
|
|
1395
|
-
const signal = abortController.signal;
|
|
1396
|
-
logger.debug(`[${jobId}] Worker starting job for ${library}@${version}`);
|
|
1397
|
-
try {
|
|
1398
|
-
await this.scraperService.scrape(
|
|
1399
|
-
options,
|
|
1400
|
-
async (progress) => {
|
|
1401
|
-
if (signal.aborted) {
|
|
1402
|
-
throw new CancellationError("Job cancelled during scraping progress");
|
|
1403
|
-
}
|
|
1404
|
-
job.progress = progress;
|
|
1405
|
-
await callbacks.onJobProgress?.(job, progress);
|
|
1406
|
-
if (progress.document) {
|
|
1407
|
-
try {
|
|
1408
|
-
await this.store.addDocument(library, version, {
|
|
1409
|
-
pageContent: progress.document.content,
|
|
1410
|
-
metadata: progress.document.metadata
|
|
1411
|
-
});
|
|
1412
|
-
logger.debug(
|
|
1413
|
-
`[${jobId}] Stored document: ${progress.document.metadata.url}`
|
|
1414
|
-
);
|
|
1415
|
-
} catch (docError) {
|
|
1416
|
-
logger.error(
|
|
1417
|
-
`[${jobId}] Failed to store document ${progress.document.metadata.url}: ${docError}`
|
|
1418
|
-
);
|
|
1419
|
-
await callbacks.onJobError?.(
|
|
1420
|
-
job,
|
|
1421
|
-
docError instanceof Error ? docError : new Error(String(docError)),
|
|
1422
|
-
progress.document
|
|
1423
|
-
);
|
|
1424
|
-
}
|
|
1425
|
-
}
|
|
1426
|
-
},
|
|
1427
|
-
signal
|
|
1428
|
-
// Pass signal to scraper service
|
|
1429
|
-
);
|
|
1430
|
-
if (signal.aborted) {
|
|
1431
|
-
throw new CancellationError("Job cancelled shortly after scraping finished");
|
|
1432
|
-
}
|
|
1433
|
-
logger.debug(`[${jobId}] Worker finished job successfully.`);
|
|
1434
|
-
} catch (error) {
|
|
1435
|
-
logger.warn(`[${jobId}] Worker encountered error: ${error}`);
|
|
1436
|
-
throw error;
|
|
1437
|
-
}
|
|
1438
|
-
}
|
|
1439
|
-
// --- Old methods removed ---
|
|
1440
|
-
// process()
|
|
1441
|
-
// stop()
|
|
1442
|
-
// setCallbacks()
|
|
1443
|
-
// handleScrapingProgress()
|
|
1444
|
-
}
|
|
-var PipelineJobStatus = /* @__PURE__ */ ((PipelineJobStatus2) => {
-  PipelineJobStatus2["QUEUED"] = "queued";
-  PipelineJobStatus2["RUNNING"] = "running";
-  PipelineJobStatus2["COMPLETED"] = "completed";
-  PipelineJobStatus2["FAILED"] = "failed";
-  PipelineJobStatus2["CANCELLING"] = "cancelling";
-  PipelineJobStatus2["CANCELLED"] = "cancelled";
-  return PipelineJobStatus2;
-})(PipelineJobStatus || {});
-const DEFAULT_CONCURRENCY = 3;
-class PipelineManager {
-  jobMap = /* @__PURE__ */ new Map();
-  jobQueue = [];
-  activeWorkers = /* @__PURE__ */ new Set();
-  isRunning = false;
-  concurrency;
-  callbacks = {};
-  store;
-  scraperService;
-  constructor(store, concurrency = DEFAULT_CONCURRENCY) {
-    this.store = store;
-    this.concurrency = concurrency;
-    const registry = new ScraperRegistry();
-    this.scraperService = new ScraperService(registry);
-  }
-  /**
-   * Registers callback handlers for pipeline manager events.
-   */
-  setCallbacks(callbacks) {
-    this.callbacks = callbacks;
-  }
-  /**
-   * Starts the pipeline manager's worker processing.
-   */
-  async start() {
-    if (this.isRunning) {
-      logger.warn("PipelineManager is already running.");
-      return;
-    }
-    this.isRunning = true;
-    logger.debug(`PipelineManager started with concurrency ${this.concurrency}.`);
-    this._processQueue();
-  }
-  /**
-   * Stops the pipeline manager and attempts to gracefully shut down workers.
-   * Currently, it just stops processing new jobs. Cancellation of active jobs
-   * needs explicit `cancelJob` calls.
-   */
-  async stop() {
-    if (!this.isRunning) {
-      logger.warn("PipelineManager is not running.");
-      return;
-    }
-    this.isRunning = false;
-    logger.debug("PipelineManager stopping. No new jobs will be started.");
-  }
-  /**
-   * Enqueues a new document processing job.
-   */
-  async enqueueJob(library, version, options) {
-    const jobId = v4();
-    const abortController = new AbortController();
-    let resolveCompletion;
-    let rejectCompletion;
-    const completionPromise = new Promise((resolve, reject) => {
-      resolveCompletion = resolve;
-      rejectCompletion = reject;
-    });
-    const job = {
-      id: jobId,
-      library,
-      version,
-      options,
-      status: PipelineJobStatus.QUEUED,
-      progress: null,
-      error: null,
-      createdAt: /* @__PURE__ */ new Date(),
-      startedAt: null,
-      finishedAt: null,
-      abortController,
-      completionPromise,
-      resolveCompletion,
-      rejectCompletion
-    };
-    this.jobMap.set(jobId, job);
-    this.jobQueue.push(jobId);
-    logger.info(`📝 Job enqueued: ${jobId} for ${library}@${version}`);
-    await this.callbacks.onJobStatusChange?.(job);
-    if (this.isRunning) {
-      this._processQueue();
-    }
-    return jobId;
-  }
-  /**
-   * Retrieves the current state of a specific job.
-   */
-  async getJob(jobId) {
-    return this.jobMap.get(jobId);
-  }
-  /**
-   * Retrieves the current state of all jobs (or a subset based on status).
-   */
-  async getJobs(status) {
-    const allJobs = Array.from(this.jobMap.values());
-    if (status) {
-      return allJobs.filter((job) => job.status === status);
-    }
-    return allJobs;
-  }
-  /**
-   * Returns a promise that resolves when the specified job completes, fails, or is cancelled.
-   */
-  async waitForJobCompletion(jobId) {
-    const job = this.jobMap.get(jobId);
-    if (!job) {
-      throw new PipelineStateError(`Job not found: ${jobId}`);
-    }
-    await job.completionPromise;
-  }
-  /**
-   * Attempts to cancel a queued or running job.
-   */
-  async cancelJob(jobId) {
-    const job = this.jobMap.get(jobId);
-    if (!job) {
-      logger.warn(`Attempted to cancel non-existent job: ${jobId}`);
-      return;
-    }
-    switch (job.status) {
-      case PipelineJobStatus.QUEUED:
-        this.jobQueue = this.jobQueue.filter((id) => id !== jobId);
-        job.status = PipelineJobStatus.CANCELLED;
-        job.finishedAt = /* @__PURE__ */ new Date();
-        logger.info(`🚫 Job cancelled (was queued): ${jobId}`);
-        await this.callbacks.onJobStatusChange?.(job);
-        job.rejectCompletion(new PipelineStateError("Job cancelled before starting"));
-        break;
-      case PipelineJobStatus.RUNNING:
-        job.status = PipelineJobStatus.CANCELLING;
-        job.abortController.abort();
-        logger.info(`🚫 Signalling cancellation for running job: ${jobId}`);
-        await this.callbacks.onJobStatusChange?.(job);
-        break;
-      case PipelineJobStatus.COMPLETED:
-      case PipelineJobStatus.FAILED:
-      case PipelineJobStatus.CANCELLED:
-      case PipelineJobStatus.CANCELLING:
-        logger.warn(
-          `Job ${jobId} cannot be cancelled in its current state: ${job.status}`
-        );
-        break;
-      default:
-        logger.error(`Unhandled job status for cancellation: ${job.status}`);
-        break;
-    }
-  }
-  // --- Private Methods ---
-  /**
-   * Processes the job queue, starting new workers if capacity allows.
-   */
-  _processQueue() {
-    if (!this.isRunning) return;
-    while (this.activeWorkers.size < this.concurrency && this.jobQueue.length > 0) {
-      const jobId = this.jobQueue.shift();
-      if (!jobId) continue;
-      const job = this.jobMap.get(jobId);
-      if (!job || job.status !== PipelineJobStatus.QUEUED) {
-        logger.warn(`Skipping job ${jobId} in queue (not found or not queued).`);
-        continue;
-      }
-      this.activeWorkers.add(jobId);
-      job.status = PipelineJobStatus.RUNNING;
-      job.startedAt = /* @__PURE__ */ new Date();
-      this.callbacks.onJobStatusChange?.(job);
-      this._runJob(job).catch((error) => {
-        logger.error(`Unhandled error during job ${jobId} execution: ${error}`);
-        if (job.status !== PipelineJobStatus.FAILED && job.status !== PipelineJobStatus.CANCELLED) {
-          job.status = PipelineJobStatus.FAILED;
-          job.error = error instanceof Error ? error : new Error(String(error));
-          job.finishedAt = /* @__PURE__ */ new Date();
-          this.callbacks.onJobStatusChange?.(job);
-          job.rejectCompletion(job.error);
-        }
-        this.activeWorkers.delete(jobId);
-        this._processQueue();
-      });
-    }
-  }
-  /**
-   * Executes a single pipeline job by delegating to a PipelineWorker.
-   * Handles final status updates and promise resolution/rejection.
-   */
-  async _runJob(job) {
-    const { id: jobId, abortController } = job;
-    const signal = abortController.signal;
-    const worker = new PipelineWorker(this.store, this.scraperService);
-    try {
-      await worker.executeJob(job, this.callbacks);
-      if (signal.aborted) {
-        throw new CancellationError("Job cancelled just before completion");
-      }
-      job.status = PipelineJobStatus.COMPLETED;
-      job.finishedAt = /* @__PURE__ */ new Date();
-      await this.callbacks.onJobStatusChange?.(job);
-      job.resolveCompletion();
-    } catch (error) {
-      if (error instanceof CancellationError || signal.aborted) {
-        job.status = PipelineJobStatus.CANCELLED;
-        job.finishedAt = /* @__PURE__ */ new Date();
-        job.error = error instanceof CancellationError ? error : new CancellationError("Job cancelled by signal");
-        logger.info(`🚫 Job execution cancelled: ${jobId}: ${job.error.message}`);
-        await this.callbacks.onJobStatusChange?.(job);
-        job.rejectCompletion(job.error);
-      } else {
-        job.status = PipelineJobStatus.FAILED;
-        job.error = error instanceof Error ? error : new Error(String(error));
-        job.finishedAt = /* @__PURE__ */ new Date();
-        logger.error(`❌ Job failed: ${jobId}: ${job.error}`);
-        await this.callbacks.onJobStatusChange?.(job);
-        job.rejectCompletion(job.error);
-      }
-    } finally {
-      this.activeWorkers.delete(jobId);
-      this._processQueue();
-    }
-  }
-}
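The manager drives the whole lifecycle: jobs move through `queued → running → completed/failed/cancelled` (with `cancelling` as a transient state), and at most `concurrency` workers run at once. A minimal end-to-end sketch using only the public methods shown above; the options object here is abbreviated (real callers pass the full scraper options, see `ScrapeTool` later in this file):

```js
const manager = new PipelineManager(store, 3);
await manager.start();

// Enqueue returns immediately with a job ID...
const jobId = await manager.enqueueJob("react", "18.2.0", {
  url: "https://react.dev/reference",
});

// ...and completion is awaited separately via the job's completion promise.
await manager.waitForJobCompletion(jobId);
const job = await manager.getJob(jobId);
console.log(job.status); // "completed"
// cancelJob(id) removes a queued job outright; a running job is
// signalled through its AbortController and winds down as "cancelled".
```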
-const fullTrim = (str) => {
-  return str.replace(/^[\s\r\n\t]+|[\s\r\n\t]+$/g, "");
-};
-class SplitterError extends Error {
-}
-class MinimumChunkSizeError extends SplitterError {
-  constructor(size, maxSize) {
-    super(
-      `Cannot split content any further. Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`
-    );
-  }
-}
-class ContentSplitterError extends SplitterError {
-}
-class CodeContentSplitter {
-  constructor(options) {
-    this.options = options;
-  }
-  async split(content) {
-    const language = content.match(/^```(\w+)\n/)?.[1];
-    const strippedContent = content.replace(/^```(\w*)\n/, "").replace(/```\s*$/, "");
-    const lines = strippedContent.split("\n");
-    const chunks = [];
-    let currentChunkLines = [];
-    for (const line of lines) {
-      const singleLineSize = this.wrap(line, language).length;
-      if (singleLineSize > this.options.chunkSize) {
-        throw new MinimumChunkSizeError(singleLineSize, this.options.chunkSize);
-      }
-      currentChunkLines.push(line);
-      const newChunkContent = this.wrap(currentChunkLines.join("\n"), language);
-      const newChunkSize = newChunkContent.length;
-      if (newChunkSize > this.options.chunkSize && currentChunkLines.length > 1) {
-        const lastLine = currentChunkLines.pop();
-        chunks.push(this.wrap(currentChunkLines.join("\n"), language));
-        currentChunkLines = [lastLine];
-      }
-    }
-    if (currentChunkLines.length > 0) {
-      chunks.push(this.wrap(currentChunkLines.join("\n"), language));
-    }
-    return chunks;
-  }
-  wrap(content, language) {
-    return `\`\`\`${language || ""}
-${content.replace(/\n+$/, "")}
-\`\`\``;
-  }
-}
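The splitter measures chunks in their fenced form (`wrap` re-adds the fence and language tag), so every emitted chunk remains a complete fenced code block. A small usage sketch with an assumed `chunkSize`:

```js
const splitter = new CodeContentSplitter({ chunkSize: 120 });
const chunks = await splitter.split("```ts\nconst a = 1;\nconst b = 2;\n```");
// Every chunk is re-fenced: chunks[0] starts with "```ts" and ends with "```".
// A single line that exceeds chunkSize even on its own throws
// MinimumChunkSizeError, which the markdown splitter below catches.
```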
-class TableContentSplitter {
-  constructor(options) {
-    this.options = options;
-  }
-  /**
-   * Splits table content into chunks while preserving table structure
-   */
-  async split(content) {
-    const parsedTable = this.parseTable(content);
-    if (!parsedTable) {
-      return [content];
-    }
-    const { headers, rows } = parsedTable;
-    const chunks = [];
-    let currentRows = [];
-    for (const row of rows) {
-      const singleRowSize = this.wrap(row, headers).length;
-      if (singleRowSize > this.options.chunkSize) {
-        throw new MinimumChunkSizeError(singleRowSize, this.options.chunkSize);
-      }
-      const newChunkContent = this.wrap([...currentRows, row].join("\n"), headers);
-      const newChunkSize = newChunkContent.length;
-      if (newChunkSize > this.options.chunkSize && currentRows.length > 0) {
-        chunks.push(this.wrap(currentRows.join("\n"), headers));
-        currentRows = [row];
-      } else {
-        currentRows.push(row);
-      }
-    }
-    if (currentRows.length > 0) {
-      chunks.push(this.wrap(currentRows.join("\n"), headers));
-    }
-    return chunks;
-  }
-  wrap(content, headers) {
-    const headerRow = `| ${headers.join(" | ")} |`;
-    const separatorRow = `|${headers.map(() => "---").join("|")}|`;
-    return [headerRow, separatorRow, content].join("\n");
-  }
-  parseTable(content) {
-    const lines = content.trim().split("\n");
-    if (lines.length < 3) return null;
-    const headers = this.parseRow(lines[0]);
-    if (!headers) return null;
-    const separator = lines[1];
-    if (!this.isValidSeparator(separator)) return null;
-    const rows = lines.slice(2).filter((row) => row.trim() !== "");
-    return { headers, separator, rows };
-  }
-  /**
-   * Parses a table row into cells
-   */
-  parseRow(row) {
-    if (!row.includes("|")) return null;
-    return row.split("|").map((cell) => cell.trim()).filter((cell) => cell !== "");
-  }
-  /**
-   * Validates the separator row of the table
-   */
-  isValidSeparator(separator) {
-    return separator.includes("|") && /^\|?[\s-|]+\|?$/.test(separator);
-  }
-}
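Because `wrap` prepends the header and separator rows to every chunk, each emitted piece is itself a well-formed Markdown table. A sketch with a `chunkSize` chosen so that only one data row fits per chunk:

```js
const splitter = new TableContentSplitter({ chunkSize: 50 });
const table = "| Name | Type |\n|---|---|\n| a | string |\n| b | number |";
const chunks = await splitter.split(table);
// Two chunks, each repeating "| Name | Type |" and "|---|---|" above its
// row, so every chunk renders as a complete table on its own.
```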
-class TextContentSplitter {
-  constructor(options) {
-    this.options = options;
-  }
-  /**
-   * Splits text content into chunks while trying to preserve semantic boundaries.
-   * Prefers paragraph breaks, then line breaks, finally falling back to word boundaries.
-   */
-  async split(content) {
-    const trimmedContent = fullTrim(content);
-    if (trimmedContent.length <= this.options.chunkSize) {
-      return [trimmedContent];
-    }
-    const words = trimmedContent.split(/\s+/);
-    const longestWord = words.reduce(
-      (max, word) => word.length > max.length ? word : max
-    );
-    if (longestWord.length > this.options.chunkSize) {
-      throw new MinimumChunkSizeError(longestWord.length, this.options.chunkSize);
-    }
-    const paragraphChunks = this.splitByParagraphs(trimmedContent);
-    if (this.areChunksValid(paragraphChunks)) {
-      return paragraphChunks;
-    }
-    const lineChunks = this.splitByLines(trimmedContent);
-    if (this.areChunksValid(lineChunks)) {
-      return this.mergeChunks(lineChunks, "\n");
-    }
-    const wordChunks = await this.splitByWords(trimmedContent);
-    return this.mergeChunks(wordChunks, " ");
-  }
-  /**
-   * Checks if all chunks are within the maximum size limit
-   */
-  areChunksValid(chunks) {
-    return chunks.every((chunk) => chunk.length <= this.options.chunkSize);
-  }
-  /**
-   * Splits text into chunks by paragraph boundaries (double newlines)
-   */
-  splitByParagraphs(text) {
-    const paragraphs = text.split(/\n\s*\n/).map((p) => fullTrim(p)).filter(Boolean);
-    return paragraphs.filter((chunk) => chunk.length > 2);
-  }
-  /**
-   * Splits text into chunks by line boundaries
-   */
-  splitByLines(text) {
-    const lines = text.split(/\n/).map((line) => fullTrim(line)).filter(Boolean);
-    return lines.filter((chunk) => chunk.length > 1);
-  }
-  /**
-   * Uses LangChain's recursive splitter for word-based splitting as a last resort
-   */
-  async splitByWords(text) {
-    const splitter = new RecursiveCharacterTextSplitter({
-      chunkSize: this.options.chunkSize,
-      chunkOverlap: 0
-    });
-    const chunks = await splitter.splitText(text);
-    return chunks;
-  }
-  /**
-   * Attempts to merge small chunks with previous chunks to minimize fragmentation.
-   * Only merges if combined size is within maxChunkSize.
-   */
-  mergeChunks(chunks, separator) {
-    const mergedChunks = [];
-    let currentChunk = null;
-    for (const chunk of chunks) {
-      if (currentChunk === null) {
-        currentChunk = chunk;
-        continue;
-      }
-      const currentChunkSize = this.getChunkSize(currentChunk);
-      const nextChunkSize = this.getChunkSize(chunk);
-      if (currentChunkSize + nextChunkSize + separator.length <= this.options.chunkSize) {
-        currentChunk = `${currentChunk}${separator}${chunk}`;
-      } else {
-        mergedChunks.push(currentChunk);
-        currentChunk = chunk;
-      }
-    }
-    if (currentChunk) {
-      mergedChunks.push(currentChunk);
-    }
-    return mergedChunks;
-  }
-  getChunkSize(chunk) {
-    return chunk.length;
-  }
-  wrap(content) {
-    return content;
-  }
-}
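The fallback cascade is: return the whole text if it fits; else split on paragraphs; else on lines (merging small pieces back together); else delegate to LangChain's `RecursiveCharacterTextSplitter` on word boundaries. A sketch of the common paragraph-level case, with an assumed `chunkSize`:

```js
const splitter = new TextContentSplitter({ chunkSize: 80 });
const a = "This opening paragraph stays comfortably under the size limit.";
const b = "This second paragraph also fits, but together they do not.";
const chunks = await splitter.split(`${a}\n\n${b}`);
// Combined length exceeds 80, but each paragraph fits, so the
// paragraph-level split is returned: [a, b].
```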
-class SemanticMarkdownSplitter {
-  constructor(preferredChunkSize, maxChunkSize) {
-    this.preferredChunkSize = preferredChunkSize;
-    this.maxChunkSize = maxChunkSize;
-    this.turndownService = new TurndownService({
-      headingStyle: "atx",
-      hr: "---",
-      bulletListMarker: "-",
-      codeBlockStyle: "fenced",
-      emDelimiter: "_",
-      strongDelimiter: "**",
-      linkStyle: "inlined"
-    });
-    this.turndownService.addRule("table", {
-      filter: ["table"],
-      replacement: (content, node) => {
-        const table = node;
-        const headers = Array.from(table.querySelectorAll("th")).map(
-          (th) => th.textContent?.trim() || ""
-        );
-        const rows = Array.from(table.querySelectorAll("tr")).filter(
-          (tr) => !tr.querySelector("th")
-        );
-        if (headers.length === 0 && rows.length === 0) return "";
-        let markdown = "\n";
-        if (headers.length > 0) {
-          markdown += `| ${headers.join(" | ")} |
-`;
-          markdown += `|${headers.map(() => "---").join("|")}|
-`;
-        }
-        for (const row of rows) {
-          const cells = Array.from(row.querySelectorAll("td")).map(
-            (td) => td.textContent?.trim() || ""
-          );
-          markdown += `| ${cells.join(" | ")} |
-`;
-        }
-        return markdown;
-      }
-    });
-    this.textSplitter = new TextContentSplitter({
-      chunkSize: this.preferredChunkSize
-    });
-    this.codeSplitter = new CodeContentSplitter({
-      chunkSize: this.maxChunkSize
-    });
-    this.tableSplitter = new TableContentSplitter({
-      chunkSize: this.maxChunkSize
-    });
-  }
-  turndownService;
-  textSplitter;
-  codeSplitter;
-  tableSplitter;
-  /**
-   * Main entry point for splitting markdown content
-   */
-  async splitText(markdown) {
-    const html = await this.markdownToHtml(markdown);
-    const dom = await this.parseHtml(html);
-    const sections = await this.splitIntoSections(dom);
-    return this.splitSectionContent(sections);
-  }
-  /**
-   * Step 1: Split document into sections based on H1-H6 headings,
-   * as well as code blocks and tables.
-   */
-  async splitIntoSections(dom) {
-    const body = dom.querySelector("body");
-    if (!body) {
-      throw new Error("Invalid HTML structure: no body element found");
-    }
-    let currentSection = this.createRootSection();
-    const sections = [];
-    const stack = [currentSection];
-    for (const element of Array.from(body.children)) {
-      const headingMatch = element.tagName.match(/H([1-6])/);
-      if (headingMatch) {
-        const level = Number.parseInt(headingMatch[1], 10);
-        const title = fullTrim(element.textContent || "");
-        while (stack.length > 1 && stack[stack.length - 1].level >= level) {
-          stack.pop();
-        }
-        currentSection = {
-          level,
-          path: [
-            ...stack.slice(1).reduce((acc, s) => {
-              const lastPath = s.path[s.path.length - 1];
-              if (lastPath) acc.push(lastPath);
-              return acc;
-            }, []),
-            title
-          ],
-          content: [
-            {
-              type: "heading",
-              text: `${"#".repeat(level)} ${title}`
-            }
-          ]
-        };
-        sections.push(currentSection);
-        stack.push(currentSection);
-      } else if (element.tagName === "PRE") {
-        const code = element.querySelector("code");
-        const language = code?.className.replace("language-", "") || "";
-        const content = code?.textContent || element.textContent || "";
-        const markdown = `${"```"}${language}
-${content}
-${"```"}`;
-        currentSection = {
-          level: currentSection.level,
-          path: currentSection.path,
-          content: [
-            {
-              type: "code",
-              text: markdown
-            }
-          ]
-        };
-        sections.push(currentSection);
-      } else if (element.tagName === "TABLE") {
-        const markdown = fullTrim(this.turndownService.turndown(element.outerHTML));
-        currentSection = {
-          level: currentSection.level,
-          path: currentSection.path,
-          content: [
-            {
-              type: "table",
-              text: markdown
-            }
-          ]
-        };
-        sections.push(currentSection);
-      } else {
-        const markdown = fullTrim(this.turndownService.turndown(element.innerHTML));
-        if (markdown) {
-          currentSection = {
-            level: currentSection.level,
-            path: currentSection.path,
-            content: [
-              {
-                type: "text",
-                text: markdown
-              }
-            ]
-          };
-          sections.push(currentSection);
-        }
-      }
-    }
-    return sections;
-  }
-  /**
-   * Step 2: Split section content into smaller chunks
-   */
-  async splitSectionContent(sections) {
-    const chunks = [];
-    for (const section of sections) {
-      for (const content of section.content) {
-        let splitContent = [];
-        try {
-          switch (content.type) {
-            case "heading":
-            case "text": {
-              splitContent = await this.textSplitter.split(content.text);
-              break;
-            }
-            case "code": {
-              splitContent = await this.codeSplitter.split(content.text);
-              break;
-            }
-            case "table": {
-              splitContent = await this.tableSplitter.split(content.text);
-              break;
-            }
-          }
-        } catch (err) {
-          if (err instanceof MinimumChunkSizeError) {
-            logger.warn(
-              `⚠ Cannot split ${content.type} chunk normally, using RecursiveCharacterTextSplitter: ${err.message}`
-            );
-            const splitter = new RecursiveCharacterTextSplitter({
-              chunkSize: this.maxChunkSize,
-              chunkOverlap: Math.min(20, Math.floor(this.maxChunkSize * 0.1)),
-              // Use more aggressive separators including empty string as last resort
-              separators: [
-                "\n\n",
-                "\n",
-                " ",
-                " ",
-                ".",
-                ",",
-                ";",
-                ":",
-                "-",
-                "(",
-                ")",
-                "[",
-                "]",
-                "{",
-                "}",
-                ""
-              ]
-            });
-            const chunks2 = await splitter.splitText(content.text);
-            if (chunks2.length === 0) {
-              splitContent = [content.text.substring(0, this.maxChunkSize)];
-            } else {
-              splitContent = chunks2;
-            }
-          } else {
-            const errMessage = err instanceof Error ? err.message : String(err);
-            throw new ContentSplitterError(
-              `Failed to split ${content.type} content: ${errMessage}`
-            );
-          }
-        }
-        chunks.push(
-          ...splitContent.map(
-            (text) => ({
-              types: [content.type],
-              content: text,
-              section: {
-                level: section.level,
-                path: section.path
-              }
-            })
-          )
-        );
-      }
-    }
-    return chunks;
-  }
-  /**
-   * Helper to create the root section
-   */
-  createRootSection() {
-    return {
-      level: 0,
-      path: [],
-      content: []
-    };
-  }
-  /**
-   * Convert markdown to HTML using remark
-   */
-  async markdownToHtml(markdown) {
-    const html = await unified().use(remarkParse).use(remarkGfm).use(remarkHtml).process(markdown);
-    return `<!DOCTYPE html>
-<html>
-<body>
-${String(html)}
-</body>
-</html>`;
-  }
-  /**
-   * Parse HTML
-   */
-  async parseHtml(html) {
-    const { window } = createJSDOM(html);
-    return window.document;
-  }
-}
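End to end, `splitText` converts Markdown to HTML, walks the body element by element, and emits chunks that remember where they came from: a `types` tag, the `content`, and a `section` with heading `level` and `path`. A sketch of the output shape (assumed chunk sizes; `≈` because exact whitespace depends on the remark/turndown round-trip):

```js
const splitter = new SemanticMarkdownSplitter(500, 1000);
const chunks = await splitter.splitText("# Guide\n\n## Install\n\nRun the installer.");
// chunks[2] ≈ {
//   types: ["text"],
//   content: "Run the installer.",
//   section: { level: 2, path: ["Guide", "Install"] }
// }
```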
-class GreedySplitter {
-  baseSplitter;
-  minChunkSize;
-  preferredChunkSize;
-  /**
-   * Combines a base document splitter with size constraints to produce optimally-sized chunks.
-   * The base splitter handles the initial semantic splitting, while this class handles
-   * the concatenation strategy.
-   */
-  constructor(baseSplitter, minChunkSize, preferredChunkSize) {
-    this.baseSplitter = baseSplitter;
-    this.minChunkSize = minChunkSize;
-    this.preferredChunkSize = preferredChunkSize;
-  }
-  /**
-   * Uses a greedy concatenation strategy to build optimally-sized chunks. Small chunks
-   * are combined until they reach the minimum size, but splits are preserved at major
-   * section boundaries to maintain document structure. This balances the need for
-   * context with semantic coherence.
-   */
-  async splitText(markdown) {
-    const initialChunks = await this.baseSplitter.splitText(markdown);
-    const concatenatedChunks = [];
-    let currentChunk = null;
-    for (const nextChunk of initialChunks) {
-      if (currentChunk) {
-        if (this.wouldExceedMaxSize(currentChunk, nextChunk)) {
-          concatenatedChunks.push(currentChunk);
-          currentChunk = this.cloneChunk(nextChunk);
-          continue;
-        }
-        if (currentChunk.content.length >= this.minChunkSize && this.startsNewMajorSection(nextChunk)) {
-          concatenatedChunks.push(currentChunk);
-          currentChunk = this.cloneChunk(nextChunk);
-          continue;
-        }
-        currentChunk.content += `
-${nextChunk.content}`;
-        currentChunk.section = this.mergeSectionInfo(currentChunk, nextChunk);
-        currentChunk.types = this.mergeTypes(currentChunk.types, nextChunk.types);
-      } else {
-        currentChunk = this.cloneChunk(nextChunk);
-      }
-    }
-    if (currentChunk) {
-      concatenatedChunks.push(currentChunk);
-    }
-    return concatenatedChunks;
-  }
-  cloneChunk(chunk) {
-    return {
-      types: [...chunk.types],
-      content: chunk.content,
-      section: {
-        level: chunk.section.level,
-        path: [...chunk.section.path]
-      }
-    };
-  }
-  /**
-   * H1 and H2 headings represent major conceptual breaks in the document.
-   * Preserving these splits helps maintain the document's logical structure.
-   */
-  startsNewMajorSection(chunk) {
-    return chunk.section.level === 1 || chunk.section.level === 2;
-  }
-  /**
-   * Size limit check to ensure chunks remain within embedding model constraints.
-   * Essential for maintaining consistent embedding quality and avoiding truncation.
-   */
-  wouldExceedMaxSize(currentChunk, nextChunk) {
-    if (!currentChunk) {
-      return false;
-    }
-    return currentChunk.content.length + nextChunk.content.length > this.preferredChunkSize;
-  }
-  /**
-   * Checks if one path is a prefix of another path, indicating a parent-child relationship
-   */
-  isPathIncluded(parentPath, childPath) {
-    if (parentPath.length >= childPath.length) return false;
-    return parentPath.every((part, i) => part === childPath[i]);
-  }
-  /**
-   * Merges section metadata when concatenating chunks, following these rules:
-   * 1. Level: Always uses the lowest (most general) level between chunks
-   * 2. Path selection:
-   *    - For parent-child relationships (one path includes the other), uses the child's path
-   *    - For siblings/unrelated sections, uses the common parent path
-   *    - If no common path exists, uses the root path ([])
-   */
-  mergeSectionInfo(currentChunk, nextChunk) {
-    const level = Math.min(currentChunk.section.level, nextChunk.section.level);
-    if (currentChunk.section.level === nextChunk.section.level && currentChunk.section.path.length === nextChunk.section.path.length && currentChunk.section.path.every((p, i) => p === nextChunk.section.path[i])) {
-      return currentChunk.section;
-    }
-    if (this.isPathIncluded(currentChunk.section.path, nextChunk.section.path)) {
-      return {
-        path: nextChunk.section.path,
-        level
-      };
-    }
-    if (this.isPathIncluded(nextChunk.section.path, currentChunk.section.path)) {
-      return {
-        path: currentChunk.section.path,
-        level
-      };
-    }
-    const commonPath = this.findCommonPrefix(
-      currentChunk.section.path,
-      nextChunk.section.path
-    );
-    return {
-      path: commonPath,
-      level
-    };
-  }
-  mergeTypes(currentTypes, nextTypes) {
-    return [.../* @__PURE__ */ new Set([...currentTypes, ...nextTypes])];
-  }
-  /**
-   * Returns longest common prefix between two paths
-   */
-  findCommonPrefix(path1, path2) {
-    const common = [];
-    for (let i = 0; i < Math.min(path1.length, path2.length); i++) {
-      if (path1[i] === path2[i]) {
-        common.push(path1[i]);
-      } else {
-        break;
-      }
-    }
-    return common;
-  }
-}
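In practice the two splitters compose: the semantic splitter produces fine-grained, structure-aware chunks, and the greedy pass packs them toward the preferred size without crossing an H1/H2 boundary once `minChunkSize` is reached. A composition sketch suggested by the constructors above (sizes and `markdownDocument` are assumed; the bundle wires its own defaults elsewhere):

```js
const base = new SemanticMarkdownSplitter(500, 1000);
const splitter = new GreedySplitter(base, 200, 500);
const chunks = await splitter.splitText(markdownDocument); // markdownDocument assumed
// Small neighbouring chunks are merged (section paths reduced to their
// common prefix) until each chunk reaches roughly 200 characters.
```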
-class ToolError extends Error {
-  constructor(message, toolName) {
-    super(message);
-    this.toolName = toolName;
-    this.name = this.constructor.name;
-  }
-}
-class VersionNotFoundError extends ToolError {
-  constructor(library, requestedVersion, availableVersions) {
-    super(
-      `Version ${requestedVersion} not found for ${library}. Available versions: ${availableVersions.map((v) => v.version).join(", ")}`,
-      "SearchTool"
-    );
-    this.library = library;
-    this.requestedVersion = requestedVersion;
-    this.availableVersions = availableVersions;
-  }
-  getLatestVersion() {
-    return this.availableVersions.sort((a, b) => semver__default.compare(b.version, a.version))[0];
-  }
-}
-class LibraryNotFoundError extends ToolError {
-  constructor(requestedLibrary, suggestions = []) {
-    let message = `Library '${requestedLibrary}' not found.`;
-    if (suggestions.length > 0) {
-      message += ` Did you mean one of these: ${suggestions.join(", ")}?`;
-    }
-    super(message, "SearchTool");
-    this.requestedLibrary = requestedLibrary;
-    this.suggestions = suggestions;
-  }
-}
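Callers can recover from a version miss because the error carries the full version list. A hedged sketch, using the exact-match path of `SearchTool` below (which throws `VersionNotFoundError` when no pinned version exists):

```js
try {
  // searchTool assumed constructed elsewhere; exactMatch with no version throws.
  await searchTool.execute({ library: "react", query: "hooks", exactMatch: true });
} catch (err) {
  if (err instanceof VersionNotFoundError) {
    // Fall back to the newest indexed version (semver-sorted descending).
    const latest = err.getLatestVersion();
    console.log(`Retrying with ${latest?.version}`);
  }
}
```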
-class ListLibrariesTool {
-  docService;
-  constructor(docService) {
-    this.docService = docService;
-  }
-  async execute(options) {
-    const rawLibraries = await this.docService.listLibraries();
-    const libraries = rawLibraries.map(({ library, versions }) => ({
-      name: library,
-      versions
-      // Directly assign the detailed versions array
-    }));
-    return { libraries };
-  }
-}
-class ScrapeTool {
-  docService;
-  manager;
-  // Add manager property
-  constructor(docService, manager) {
-    this.docService = docService;
-    this.manager = manager;
-  }
-  async execute(options) {
-    const {
-      library,
-      version,
-      url,
-      options: scraperOptions,
-      waitForCompletion = true
-    } = options;
-    let internalVersion;
-    const partialVersionRegex = /^\d+(\.\d+)?$/;
-    if (version === null || version === void 0) {
-      internalVersion = "";
-    } else {
-      const validFullVersion = semver.valid(version);
-      if (validFullVersion) {
-        internalVersion = validFullVersion;
-      } else if (partialVersionRegex.test(version)) {
-        const coercedVersion = semver.coerce(version);
-        if (coercedVersion) {
-          internalVersion = coercedVersion.version;
-        } else {
-          throw new Error(
-            `Invalid version format for scraping: '${version}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`
-          );
-        }
-      } else {
-        throw new Error(
-          `Invalid version format for scraping: '${version}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`
-        );
-      }
-    }
-    internalVersion = internalVersion.toLowerCase();
-    await this.docService.removeAllDocuments(library, internalVersion);
-    logger.info(
-      `💾 Cleared store for ${library}@${internalVersion || "[no version]"} before scraping.`
-    );
-    const manager = this.manager;
-    const jobId = await manager.enqueueJob(library, internalVersion, {
-      url,
-      library,
-      version: internalVersion,
-      scope: scraperOptions?.scope ?? "subpages",
-      followRedirects: scraperOptions?.followRedirects ?? true,
-      maxPages: scraperOptions?.maxPages ?? DEFAULT_MAX_PAGES$1,
-      maxDepth: scraperOptions?.maxDepth ?? DEFAULT_MAX_DEPTH$1,
-      maxConcurrency: scraperOptions?.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY,
-      ignoreErrors: scraperOptions?.ignoreErrors ?? true,
-      scrapeMode: scraperOptions?.scrapeMode ?? ScrapeMode.Auto
-      // Pass scrapeMode enum
-    });
-    if (waitForCompletion) {
-      try {
-        await manager.waitForJobCompletion(jobId);
-        const finalJob = await manager.getJob(jobId);
-        const finalPagesScraped = finalJob?.progress?.pagesScraped ?? 0;
-        logger.debug(
-          `Job ${jobId} finished with status ${finalJob?.status}. Pages scraped: ${finalPagesScraped}`
-        );
-        return {
-          pagesScraped: finalPagesScraped
-        };
-      } catch (error) {
-        logger.error(`Job ${jobId} failed or was cancelled: ${error}`);
-        throw error;
-      }
-    }
-    return { jobId };
-  }
-}
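Version normalization accepts full semver, prerelease, or partial `X`/`X.Y` forms, coercing partials with `semver.coerce` and lower-casing the result; anything else is rejected before the job is enqueued. For instance:

```js
semver.valid("18.2.0");        // "18.2.0"  -> used as-is
semver.coerce("18").version;   // "18.0.0"  -> partial "18" is coerced
semver.coerce("18.2").version; // "18.2.0"  -> partial "18.2" likewise
// "not-a-version" fails both checks and triggers the descriptive Error above.
```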
-class SearchTool {
-  docService;
-  constructor(docService) {
-    this.docService = docService;
-  }
-  async execute(options) {
-    const { library, version, query, limit = 5, exactMatch = false } = options;
-    if (exactMatch && (!version || version === "latest")) {
-      await this.docService.validateLibraryExists(library);
-      const allLibraries = await this.docService.listLibraries();
-      const libraryInfo = allLibraries.find((lib) => lib.library === library);
-      const detailedVersions = libraryInfo ? libraryInfo.versions : [];
-      throw new VersionNotFoundError(
-        library,
-        "latest",
-        // Or perhaps the original 'version' if it wasn't 'latest'? Check logic.
-        detailedVersions
-      );
-    }
-    const resolvedVersion = version || "latest";
-    logger.info(
-      `🔍 Searching ${library}@${resolvedVersion} for: ${query}${exactMatch ? " (exact match)" : ""}`
-    );
-    try {
-      await this.docService.validateLibraryExists(library);
-      let versionToSearch = resolvedVersion;
-      if (!exactMatch) {
-        const versionResult = await this.docService.findBestVersion(library, version);
-        versionToSearch = versionResult.bestMatch;
-      }
-      const results = await this.docService.searchStore(
-        library,
-        versionToSearch,
-        query,
-        limit
-      );
-      logger.info(`✅ Found ${results.length} matching results`);
-      return { results };
-    } catch (error) {
-      logger.error(
-        `❌ Search failed: ${error instanceof Error ? error.message : "Unknown error"}`
-      );
-      throw error;
-    }
-  }
-}
-let projectRoot = null;
-function getProjectRoot() {
-  if (projectRoot) {
-    return projectRoot;
-  }
-  const currentFilePath = fileURLToPath(import.meta.url);
-  let currentDir = path.dirname(currentFilePath);
-  while (true) {
-    const packageJsonPath = path.join(currentDir, "package.json");
-    if (fs$1.existsSync(packageJsonPath)) {
-      projectRoot = currentDir;
-      return projectRoot;
-    }
-    const parentDir = path.dirname(currentDir);
-    if (parentDir === currentDir) {
-      throw new Error("Could not find project root containing package.json.");
-    }
-    currentDir = parentDir;
-  }
-}
-const CHILD_LIMIT = 5;
-const SIBLING_LIMIT = 2;
-class DocumentRetrieverService {
-  documentStore;
-  constructor(documentStore) {
-    this.documentStore = documentStore;
-  }
-  /**
-   * Collects all related chunk IDs for a given initial hit.
-   * Returns an object with url, hitId, relatedIds (Set), and score.
-   */
-  async getRelatedChunkIds(library, version, doc, siblingLimit = SIBLING_LIMIT, childLimit = CHILD_LIMIT) {
-    const id = doc.id;
-    const url = doc.metadata.url;
-    const score = doc.metadata.score;
-    const relatedIds = /* @__PURE__ */ new Set();
-    relatedIds.add(id);
-    const parent = await this.documentStore.findParentChunk(library, version, id);
-    if (parent) {
-      relatedIds.add(parent.id);
-    }
-    const precedingSiblings = await this.documentStore.findPrecedingSiblingChunks(
-      library,
-      version,
-      id,
-      siblingLimit
-    );
-    for (const sib of precedingSiblings) {
-      relatedIds.add(sib.id);
-    }
-    const childChunks = await this.documentStore.findChildChunks(
-      library,
-      version,
-      id,
-      childLimit
-    );
-    for (const child of childChunks) {
-      relatedIds.add(child.id);
-    }
-    const subsequentSiblings = await this.documentStore.findSubsequentSiblingChunks(
-      library,
-      version,
-      id,
-      siblingLimit
-    );
-    for (const sib of subsequentSiblings) {
-      relatedIds.add(sib.id);
-    }
-    return { url, hitId: id, relatedIds, score };
-  }
-  /**
-   * Groups related chunk info by URL, deduplicates IDs, and finds max score per URL.
-   */
-  groupAndPrepareFetch(relatedInfos) {
-    const urlMap = /* @__PURE__ */ new Map();
-    for (const info of relatedInfos) {
-      let entry = urlMap.get(info.url);
-      if (!entry) {
-        entry = { uniqueChunkIds: /* @__PURE__ */ new Set(), maxScore: info.score };
-        urlMap.set(info.url, entry);
-      }
-      for (const id of info.relatedIds) {
-        entry.uniqueChunkIds.add(id);
-      }
-      if (info.score > entry.maxScore) {
-        entry.maxScore = info.score;
-      }
-    }
-    return urlMap;
-  }
-  /**
-   * Finalizes the merged result for a URL group by fetching, sorting, and joining content.
-   */
-  async finalizeResult(library, version, url, uniqueChunkIds, maxScore) {
-    const ids = Array.from(uniqueChunkIds);
-    const docs = await this.documentStore.findChunksByIds(library, version, ids);
-    const content = docs.map((d) => d.pageContent).join("\n\n");
-    return {
-      url,
-      content,
-      score: maxScore
-    };
-  }
-  /**
-   * Searches for documents and expands the context around the matches.
-   * @param library The library name.
-   * @param version The library version (optional, defaults to searching documents without a version).
-   * @param query The search query.
-   * @param limit The optional limit for the initial search results.
-   * @returns An array of aggregated results ({ url, content, score }), one per matched page.
-   */
-  async search(library, version, query, limit) {
-    const normalizedVersion = (version ?? "").toLowerCase();
-    const initialResults = await this.documentStore.findByContent(
-      library,
-      normalizedVersion,
-      query,
-      limit ?? 10
-    );
-    const relatedInfos = await Promise.all(
-      initialResults.map(
-        (doc) => this.getRelatedChunkIds(library, normalizedVersion, doc)
-      )
-    );
-    const urlMap = this.groupAndPrepareFetch(relatedInfos);
-    const results = [];
-    for (const [url, { uniqueChunkIds, maxScore }] of urlMap.entries()) {
-      const result = await this.finalizeResult(
-        library,
-        normalizedVersion,
-        url,
-        uniqueChunkIds,
-        maxScore
-      );
-      results.push(result);
-    }
-    return results;
-  }
-}
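So a single vector/FTS hit fans out into its parent chunk, up to two preceding and two subsequent siblings, and up to five children; everything is then regrouped per URL and stitched back together. A sketch of a call and its merged output shape (the URL and score are illustrative only):

```js
const retriever = new DocumentRetrieverService(documentStore); // store from this bundle
const results = await retriever.search("react", "18.2.0", "useEffect cleanup", 5);
// => [{ url: "https://react.dev/...",            // one entry per source page
//       content: "chunk\n\nneighbouring chunk",  // related chunks joined by blank lines
//       score: 0.87 }]                           // best hit score on that page
```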
-class StoreError extends Error {
-  constructor(message, cause) {
-    super(cause ? `${message} caused by ${cause}` : message);
-    this.cause = cause;
-    this.name = this.constructor.name;
-    const causeError = cause instanceof Error ? cause : cause ? new Error(String(cause)) : void 0;
-    if (causeError?.stack) {
-      this.stack = causeError.stack;
-    }
-  }
-}
-class DimensionError extends StoreError {
-  constructor(modelName, modelDimension, dbDimension) {
-    super(
-      `Model "${modelName}" produces ${modelDimension}-dimensional vectors, which exceeds the database's fixed dimension of ${dbDimension}. Please use a model with dimension ≤ ${dbDimension}.`
-    );
-    this.modelName = modelName;
-    this.modelDimension = modelDimension;
-    this.dbDimension = dbDimension;
-  }
-}
-class ConnectionError extends StoreError {
-}
-const MIGRATIONS_DIR = path.join(getProjectRoot(), "db", "migrations");
-const MIGRATIONS_TABLE = "_schema_migrations";
-function ensureMigrationsTable(db) {
-  db.exec(`
-    CREATE TABLE IF NOT EXISTS ${MIGRATIONS_TABLE} (
-      id TEXT PRIMARY KEY,
-      applied_at DATETIME DEFAULT CURRENT_TIMESTAMP
-    );
-  `);
-}
-function getAppliedMigrations(db) {
-  const stmt = db.prepare(`SELECT id FROM ${MIGRATIONS_TABLE}`);
-  const rows = stmt.all();
-  return new Set(rows.map((row) => row.id));
-}
-function applyMigrations(db) {
-  try {
-    logger.debug("Applying database migrations...");
-    ensureMigrationsTable(db);
-    const appliedMigrations = getAppliedMigrations(db);
-    if (!fs$1.existsSync(MIGRATIONS_DIR)) {
-      throw new StoreError("Migrations directory not found");
-    }
-    const migrationFiles = fs$1.readdirSync(MIGRATIONS_DIR).filter((file) => file.endsWith(".sql")).sort();
-    let appliedCount = 0;
-    for (const filename of migrationFiles) {
-      if (!appliedMigrations.has(filename)) {
-        logger.debug(`Applying migration: ${filename}`);
-        const filePath = path.join(MIGRATIONS_DIR, filename);
-        const sql = fs$1.readFileSync(filePath, "utf8");
-        const transaction = db.transaction(() => {
-          db.exec(sql);
-          const insertStmt = db.prepare(
-            `INSERT INTO ${MIGRATIONS_TABLE} (id) VALUES (?)`
-          );
-          insertStmt.run(filename);
-        });
-        try {
-          transaction();
-          logger.debug(`Successfully applied migration: ${filename}`);
-          appliedCount++;
-        } catch (error) {
-          logger.error(`Failed to apply migration: ${filename} - ${error}`);
-          throw new StoreError(`Migration failed: ${filename} - ${error}`);
-        }
-      }
-    }
-    if (appliedCount > 0) {
-      logger.debug(`Applied ${appliedCount} new migration(s).`);
-    } else {
-      logger.debug("Database schema is up to date.");
-    }
-  } catch (error) {
-    if (error instanceof StoreError) {
-      throw error;
-    }
-    throw new StoreError("Failed during migration process", error);
-  }
-}
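Migrations are plain `.sql` files applied in filename order, each wrapped in a transaction together with its bookkeeping row, so a crash mid-migration rolls back both. A sketch of the resulting behaviour (the file names are assumptions for illustration):

```js
// db/migrations/001-init.sql, 002-add-fts.sql, ...
applyMigrations(db); // first run: applies both, records each filename in _schema_migrations
applyMigrations(db); // second run: no-op, logs "Database schema is up to date."
```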
-const VECTOR_DIMENSION = 1536;
-function mapDbDocumentToDocument(doc) {
-  return {
-    id: doc.id,
-    pageContent: doc.content,
-    metadata: JSON.parse(doc.metadata)
-  };
-}
-class DocumentStore {
-  db;
-  embeddings;
-  dbDimension = VECTOR_DIMENSION;
-  modelDimension;
-  statements;
-  /**
-   * Calculates Reciprocal Rank Fusion score for a result
-   */
-  calculateRRF(vecRank, ftsRank, k = 60) {
-    let rrf = 0;
-    if (vecRank !== void 0) {
-      rrf += 1 / (k + vecRank);
-    }
-    if (ftsRank !== void 0) {
-      rrf += 1 / (k + ftsRank);
-    }
-    return rrf;
-  }
-  /**
-   * Assigns ranks to search results based on their scores
-   */
-  assignRanks(results) {
-    const vecRanks = /* @__PURE__ */ new Map();
-    const ftsRanks = /* @__PURE__ */ new Map();
-    results.filter((r) => r.vec_score !== void 0).sort((a, b) => (a.vec_score ?? 0) - (b.vec_score ?? 0)).forEach((result, index) => {
-      vecRanks.set(Number(result.id), index + 1);
-    });
-    results.filter((r) => r.fts_score !== void 0).sort((a, b) => (a.fts_score ?? 0) - (b.fts_score ?? 0)).forEach((result, index) => {
-      ftsRanks.set(Number(result.id), index + 1);
-    });
-    return results.map((result) => ({
-      ...result,
-      vec_rank: vecRanks.get(Number(result.id)),
-      fts_rank: ftsRanks.get(Number(result.id)),
-      rrf_score: this.calculateRRF(
-        vecRanks.get(Number(result.id)),
-        ftsRanks.get(Number(result.id))
-      )
-    }));
-  }
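Reciprocal Rank Fusion merges the vector and full-text rankings without having to normalize their incomparable raw scores: each list contributes `1/(k + rank)`, and a result missing from one list simply contributes nothing for it. A worked check of the formula with the default `k = 60`:

```js
const k = 60;
const rrf = 1 / (k + 1) + 1 / (k + 3); // vec_rank = 1, fts_rank = 3
console.log(rrf.toFixed(5));           // "0.03227"
// A result found only by vector search at rank 1 scores 1/61 ≈ 0.01639.
```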
2720
|
-
constructor(dbPath) {
|
|
2721
|
-
if (!dbPath) {
|
|
2722
|
-
throw new StoreError("Missing required database path");
|
|
2723
|
-
}
|
|
2724
|
-
this.db = new Database(dbPath);
|
|
2725
|
-
}
|
|
2726
|
-
/**
|
|
2727
|
-
* Sets up prepared statements for database queries
|
|
2728
|
-
*/
|
|
2729
|
-
prepareStatements() {
|
|
2730
|
-
const statements = {
|
|
2731
|
-
getById: this.db.prepare("SELECT * FROM documents WHERE id = ?"),
|
|
2732
|
-
insertDocument: this.db.prepare(
|
|
2733
|
-
"INSERT INTO documents (library, version, url, content, metadata, sort_order, indexed_at) VALUES (?, ?, ?, ?, ?, ?, ?)"
|
|
2734
|
-
// Added indexed_at
|
|
2735
|
-
),
|
|
2736
|
-
insertEmbedding: this.db.prepare(
|
|
2737
|
-
"INSERT INTO documents_vec (rowid, library, version, embedding) VALUES (?, ?, ?, ?)"
|
|
2738
|
-
),
|
|
2739
|
-
deleteDocuments: this.db.prepare(
|
|
2740
|
-
"DELETE FROM documents WHERE library = ? AND version = ?"
|
|
2741
|
-
),
|
|
2742
|
-
queryVersions: this.db.prepare(
|
|
2743
|
-
"SELECT DISTINCT version FROM documents WHERE library = ? ORDER BY version"
|
|
2744
|
-
),
|
|
2745
|
-
checkExists: this.db.prepare(
|
|
2746
|
-
"SELECT id FROM documents WHERE library = ? AND version = ? LIMIT 1"
|
|
2747
|
-
),
|
|
2748
|
-
queryLibraryVersions: this.db.prepare(
|
|
2749
|
-
`SELECT
|
|
2750
|
-
library,
|
|
2751
|
-
version,
|
|
2752
|
-
COUNT(*) as documentCount,
|
|
2753
|
-
COUNT(DISTINCT url) as uniqueUrlCount,
|
|
2754
|
-
MIN(indexed_at) as indexedAt
|
|
2755
|
-
FROM documents
|
|
2756
|
-
GROUP BY library, version
|
|
2757
|
-
ORDER BY library, version`
|
|
2758
|
-
),
|
|
2759
|
-
getChildChunks: this.db.prepare(`
|
|
2760
|
-
SELECT * FROM documents
|
|
2761
|
-
WHERE library = ?
|
|
2762
|
-
AND version = ?
|
|
2763
|
-
AND url = ?
|
|
2764
|
-
AND json_array_length(json_extract(metadata, '$.path')) = ?
|
|
2765
|
-
AND json_extract(metadata, '$.path') LIKE ? || '%'
|
|
2766
|
-
AND sort_order > (SELECT sort_order FROM documents WHERE id = ?)
|
|
2767
|
-
ORDER BY sort_order
|
|
2768
|
-
LIMIT ?
|
|
2769
|
-
`),
|
|
2770
|
-
getPrecedingSiblings: this.db.prepare(`
|
|
2771
|
-
SELECT * FROM documents
|
|
2772
|
-
WHERE library = ?
|
|
2773
|
-
AND version = ?
|
|
2774
|
-
AND url = ?
|
|
2775
|
-
AND sort_order < (SELECT sort_order FROM documents WHERE id = ?)
|
|
2776
|
-
AND json_extract(metadata, '$.path') = ?
|
|
2777
|
-
ORDER BY sort_order DESC
|
|
2778
|
-
LIMIT ?
|
|
2779
|
-
`),
|
|
2780
|
-
getSubsequentSiblings: this.db.prepare(`
|
|
2781
|
-
SELECT * FROM documents
|
|
2782
|
-
WHERE library = ?
|
|
2783
|
-
AND version = ?
|
|
2784
|
-
AND url = ?
|
|
2785
|
-
AND sort_order > (SELECT sort_order FROM documents WHERE id = ?)
|
|
2786
|
-
AND json_extract(metadata, '$.path') = ?
|
|
2787
|
-
ORDER BY sort_order
|
|
2788
|
-
LIMIT ?
|
|
2789
|
-
`),
|
|
2790
|
-
getParentChunk: this.db.prepare(`
|
|
2791
|
-
SELECT * FROM documents
|
|
2792
|
-
WHERE library = ?
|
|
2793
|
-
AND version = ?
|
|
2794
|
-
AND url = ?
|
|
2795
|
-
AND json_extract(metadata, '$.path') = ?
|
|
2796
|
-
AND sort_order < (SELECT sort_order FROM documents WHERE id = ?)
|
|
2797
|
-
ORDER BY sort_order DESC
|
|
2798
|
-
LIMIT 1
|
|
2799
|
-
`)
|
|
2800
|
-
};
|
|
2801
|
-
this.statements = statements;
|
|
2802
|
-
}
|
|
2803
|
-
/**
|
|
2804
|
-
* Pads a vector to the fixed database dimension by appending zeros.
|
|
2805
|
-
* Throws an error if the input vector is longer than the database dimension.
|
|
2806
|
-
*/
|
|
2807
|
-
padVector(vector) {
|
|
2808
|
-
if (vector.length > this.dbDimension) {
|
|
2809
|
-
throw new Error(
|
|
2810
|
-
`Vector dimension ${vector.length} exceeds database dimension ${this.dbDimension}`
|
|
2811
|
-
);
|
|
2812
|
-
}
|
|
2813
|
-
if (vector.length === this.dbDimension) {
|
|
2814
|
-
return vector;
|
|
2815
|
-
}
|
|
2816
|
-
return [...vector, ...new Array(this.dbDimension - vector.length).fill(0)];
|
|
2817
|
-
}
|
|
2818
|
-
-  /**
-   * Initializes embeddings client using environment variables for configuration.
-   *
-   * The embedding model is configured using DOCS_MCP_EMBEDDING_MODEL environment variable.
-   * Format: "provider:model_name" (e.g., "google:text-embedding-004") or just "model_name"
-   * for OpenAI (default).
-   *
-   * Supported providers and their required environment variables:
-   * - openai: OPENAI_API_KEY (and optionally OPENAI_API_BASE, OPENAI_ORG_ID)
-   * - google: GOOGLE_APPLICATION_CREDENTIALS (path to service account JSON)
-   * - aws: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION (or BEDROCK_AWS_REGION)
-   * - microsoft: Azure OpenAI credentials (AZURE_OPENAI_API_*)
-   */
-  async initializeEmbeddings() {
-    const modelSpec = process.env.DOCS_MCP_EMBEDDING_MODEL || "text-embedding-3-small";
-    const { createEmbeddingModel } = await import("./EmbeddingFactory-0Z5e_g1J.js");
-    this.embeddings = createEmbeddingModel(modelSpec);
-    const testVector = await this.embeddings.embedQuery("test");
-    this.modelDimension = testVector.length;
-    if (this.modelDimension > this.dbDimension) {
-      throw new DimensionError(modelSpec, this.modelDimension, this.dbDimension);
-    }
-  }
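Per the docblock above, provider and model are selected through a single environment variable. A sketch of that convention; the split-on-colon parsing lives inside the EmbeddingFactory chunk and is assumed here rather than quoted:

    process.env.DOCS_MCP_EMBEDDING_MODEL = "google:text-embedding-004"; // example from the docblock
    const spec = process.env.DOCS_MCP_EMBEDDING_MODEL || "text-embedding-3-small";
    const [provider, model] = spec.includes(":") ? spec.split(":") : ["openai", spec];
    console.log(provider, model); // "google" "text-embedding-004"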
-  /**
-   * Escapes a query string for use with SQLite FTS5 MATCH operator.
-   * Wraps the query in double quotes and escapes internal double quotes.
-   */
-  escapeFtsQuery(query) {
-    const escapedQuotes = query.replace(/"/g, '""');
-    return `"${escapedQuotes}"`;
-  }
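The effect of that escaping on a query containing double quotes, which FTS5 would otherwise parse as its own syntax:

    const escapeFtsQuery = (query) => `"${query.replace(/"/g, '""')}"`;
    console.log(escapeFtsQuery('vector "hybrid" search'));
    // -> "vector ""hybrid"" search" (a single quoted phrase, safe to pass to MATCH)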
-  /**
-   * Initializes database connection and ensures readiness
-   */
-  async initialize() {
-    try {
-      sqliteVec.load(this.db);
-      applyMigrations(this.db);
-      this.prepareStatements();
-      await this.initializeEmbeddings();
-    } catch (error) {
-      if (error instanceof StoreError) {
-        throw error;
-      }
-      throw new ConnectionError("Failed to initialize database connection", error);
-    }
-  }
-  /**
-   * Gracefully closes database connections
-   */
-  async shutdown() {
-    this.db.close();
-  }
-  /**
-   * Retrieves all unique versions for a specific library
-   */
-  async queryUniqueVersions(library) {
-    try {
-      const rows = this.statements.queryVersions.all(library.toLowerCase());
-      return rows.map((row) => row.version);
-    } catch (error) {
-      throw new ConnectionError("Failed to query versions", error);
-    }
-  }
-  /**
-   * Verifies existence of documents for a specific library version
-   */
-  async checkDocumentExists(library, version) {
-    try {
-      const result = this.statements.checkExists.get(
-        library.toLowerCase(),
-        version.toLowerCase()
-      );
-      return result !== void 0;
-    } catch (error) {
-      throw new ConnectionError("Failed to check document existence", error);
-    }
-  }
-  /**
-   * Retrieves a mapping of all libraries to their available versions with details.
-   */
-  async queryLibraryVersions() {
-    try {
-      const rows = this.statements.queryLibraryVersions.all();
-      const libraryMap = /* @__PURE__ */ new Map();
-      for (const row of rows) {
-        const library = row.library;
-        if (!libraryMap.has(library)) {
-          libraryMap.set(library, []);
-        }
-        const indexedAtISO = row.indexedAt ? new Date(row.indexedAt).toISOString() : null;
-        libraryMap.get(library)?.push({
-          version: row.version,
-          documentCount: row.documentCount,
-          uniqueUrlCount: row.uniqueUrlCount,
-          indexedAt: indexedAtISO
-        });
-      }
-      for (const versions of libraryMap.values()) {
-        versions.sort((a, b) => {
-          if (a.version === "" && b.version !== "") {
-            return -1;
-          }
-          if (a.version !== "" && b.version === "") {
-            return 1;
-          }
-          if (a.version === "" && b.version === "") {
-            return 0;
-          }
-          return semver__default.compare(a.version, b.version);
-        });
-      }
-      return libraryMap;
-    } catch (error) {
-      throw new ConnectionError("Failed to query library versions", error);
-    }
-  }
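The comparator above puts the unversioned entry ("") first and sorts the rest ascending by semver, numerically rather than lexicographically. With illustrative data:

    import semver from "semver";

    const versions = [{ version: "1.10.0" }, { version: "" }, { version: "1.2.0" }];
    versions.sort((a, b) => {
      if (a.version === "" && b.version !== "") return -1;
      if (a.version !== "" && b.version === "") return 1;
      if (a.version === "" && b.version === "") return 0;
      return semver.compare(a.version, b.version);
    });
    console.log(versions.map((v) => v.version)); // ["", "1.2.0", "1.10.0"]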
-  /**
-   * Stores documents with library and version metadata, generating embeddings
-   * for vector similarity search
-   */
-  async addDocuments(library, version, documents) {
-    try {
-      const texts = documents.map((doc) => {
-        const header = `<title>${doc.metadata.title}</title>
-<url>${doc.metadata.url}</url>
-<path>${doc.metadata.path.join(" / ")}</path>
-`;
-        return `${header}${doc.pageContent}`;
-      });
-      const rawEmbeddings = [];
-      for (let i = 0; i < texts.length; i += EMBEDDING_BATCH_SIZE) {
-        const batchTexts = texts.slice(i, i + EMBEDDING_BATCH_SIZE);
-        const batchEmbeddings = await this.embeddings.embedDocuments(batchTexts);
-        rawEmbeddings.push(...batchEmbeddings);
-      }
-      const paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
-      const transaction = this.db.transaction((docs) => {
-        for (let i = 0; i < docs.length; i++) {
-          const doc = docs[i];
-          const url = doc.metadata.url;
-          if (!url || typeof url !== "string" || !url.trim()) {
-            throw new StoreError("Document metadata must include a valid URL");
-          }
-          const result = this.statements.insertDocument.run(
-            library.toLowerCase(),
-            version.toLowerCase(),
-            url,
-            doc.pageContent,
-            JSON.stringify(doc.metadata),
-            i,
-            (/* @__PURE__ */ new Date()).toISOString()
-            // Pass current timestamp for indexed_at
-          );
-          const rowId = result.lastInsertRowid;
-          this.statements.insertEmbedding.run(
-            BigInt(rowId),
-            library.toLowerCase(),
-            version.toLowerCase(),
-            JSON.stringify(paddedEmbeddings[i])
-          );
-        }
-      });
-      transaction(documents);
-    } catch (error) {
-      throw new ConnectionError("Failed to add documents to store", error);
-    }
-  }
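Note what actually gets embedded: not the bare chunk text but the chunk prefixed with a small header carrying its title, URL, and heading path, so those terms influence vector similarity. A sketch with made-up metadata:

    const doc = {
      pageContent: "Install the package with npm...",
      metadata: { title: "Getting Started", url: "https://example.com/start", path: ["Guide", "Install"] },
    };
    const header = `<title>${doc.metadata.title}</title>\n<url>${doc.metadata.url}</url>\n<path>${doc.metadata.path.join(" / ")}</path>\n`;
    const textToEmbed = `${header}${doc.pageContent}`;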
-  /**
-   * Removes documents matching specified library and version
-   * @returns Number of documents deleted
-   */
-  async deleteDocuments(library, version) {
-    try {
-      const result = this.statements.deleteDocuments.run(
-        library.toLowerCase(),
-        version.toLowerCase()
-      );
-      return result.changes;
-    } catch (error) {
-      throw new ConnectionError("Failed to delete documents", error);
-    }
-  }
-  /**
-   * Retrieves a document by its ID.
-   * @param id The ID of the document.
-   * @returns The document, or null if not found.
-   */
-  async getById(id) {
-    try {
-      const row = this.statements.getById.get(id);
-      if (!row) {
-        return null;
-      }
-      return mapDbDocumentToDocument(row);
-    } catch (error) {
-      throw new ConnectionError(`Failed to get document by ID ${id}`, error);
-    }
-  }
-  /**
-   * Finds documents matching a text query using hybrid search.
-   * Combines vector similarity search with full-text search using Reciprocal Rank Fusion.
-   */
-  async findByContent(library, version, query, limit) {
-    try {
-      const rawEmbedding = await this.embeddings.embedQuery(query);
-      const embedding = this.padVector(rawEmbedding);
-      const ftsQuery = this.escapeFtsQuery(query);
-      const stmt = this.db.prepare(`
-        WITH vec_scores AS (
-          SELECT
-            rowid as id,
-            distance as vec_score
-          FROM documents_vec
-          WHERE library = ?
-          AND version = ?
-          AND embedding MATCH ?
-          ORDER BY vec_score
-          LIMIT ?
-        ),
-        fts_scores AS (
-          SELECT
-            f.rowid as id,
-            bm25(documents_fts, 10.0, 1.0, 5.0, 1.0) as fts_score
-          FROM documents_fts f
-          JOIN documents d ON f.rowid = d.rowid
-          WHERE d.library = ?
-          AND d.version = ?
-          AND documents_fts MATCH ?
-          ORDER BY fts_score
-          LIMIT ?
-        )
-        SELECT
-          d.id,
-          d.content,
-          d.metadata,
-          COALESCE(1 / (1 + v.vec_score), 0) as vec_score,
-          COALESCE(1 / (1 + f.fts_score), 0) as fts_score
-        FROM documents d
-        LEFT JOIN vec_scores v ON d.id = v.id
-        LEFT JOIN fts_scores f ON d.id = f.id
-        WHERE v.id IS NOT NULL OR f.id IS NOT NULL
-      `);
-      const rawResults = stmt.all(
-        library.toLowerCase(),
-        version.toLowerCase(),
-        JSON.stringify(embedding),
-        limit,
-        library.toLowerCase(),
-        version.toLowerCase(),
-        ftsQuery,
-        // Use the escaped query
-        limit
-      );
-      const rankedResults = this.assignRanks(rawResults);
-      const topResults = rankedResults.sort((a, b) => b.rrf_score - a.rrf_score).slice(0, limit);
-      return topResults.map((row) => ({
-        ...mapDbDocumentToDocument(row),
-        metadata: {
-          ...JSON.parse(row.metadata),
-          score: row.rrf_score,
-          vec_rank: row.vec_rank,
-          fts_rank: row.fts_rank
-        }
-      }));
-    } catch (error) {
-      throw new ConnectionError(
-        `Failed to find documents by content with query "${query}"`,
-        error
-      );
-    }
-  }
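The SQL above gathers candidates from both indexes; the final ordering comes from assignRanks, which is defined elsewhere in this bundle and not visible in this hunk. A conventional Reciprocal Rank Fusion, which is what the docblock names, would look roughly like this (the k = 60 constant is the textbook default, an assumption here):

    const K = 60;
    function assignRanks(rows) {
      // Rank each row separately within the vector list and the FTS list.
      const byVec = rows.filter((r) => r.vec_score > 0).sort((a, b) => b.vec_score - a.vec_score);
      const byFts = rows.filter((r) => r.fts_score > 0).sort((a, b) => b.fts_score - a.fts_score);
      for (const row of rows) {
        const vecRank = byVec.indexOf(row) + 1; // 0 when absent from the vector list
        const ftsRank = byFts.indexOf(row) + 1; // 0 when absent from the FTS list
        row.vec_rank = vecRank || void 0;
        row.fts_rank = ftsRank || void 0;
        row.rrf_score = (vecRank ? 1 / (K + vecRank) : 0) + (ftsRank ? 1 / (K + ftsRank) : 0);
      }
      return rows;
    }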
-  /**
-   * Finds child chunks of a given document based on path hierarchy.
-   */
-  async findChildChunks(library, version, id, limit) {
-    try {
-      const parent = await this.getById(id);
-      if (!parent) {
-        return [];
-      }
-      const parentPath = parent.metadata.path ?? [];
-      const parentUrl = parent.metadata.url;
-      const result = this.statements.getChildChunks.all(
-        library.toLowerCase(),
-        version.toLowerCase(),
-        parentUrl,
-        parentPath.length + 1,
-        JSON.stringify(parentPath),
-        id,
-        limit
-      );
-      return result.map((row) => mapDbDocumentToDocument(row));
-    } catch (error) {
-      throw new ConnectionError(`Failed to find child chunks for ID ${id}`, error);
-    }
-  }
-  /**
-   * Finds preceding sibling chunks of a given document.
-   */
-  async findPrecedingSiblingChunks(library, version, id, limit) {
-    try {
-      const reference = await this.getById(id);
-      if (!reference) {
-        return [];
-      }
-      const refMetadata = reference.metadata;
-      const result = this.statements.getPrecedingSiblings.all(
-        library.toLowerCase(),
-        version.toLowerCase(),
-        refMetadata.url,
-        id,
-        JSON.stringify(refMetadata.path),
-        limit
-      );
-      return result.reverse().map((row) => mapDbDocumentToDocument(row));
-    } catch (error) {
-      throw new ConnectionError(
-        `Failed to find preceding sibling chunks for ID ${id}`,
-        error
-      );
-    }
-  }
-  /**
-   * Finds subsequent sibling chunks of a given document.
-   */
-  async findSubsequentSiblingChunks(library, version, id, limit) {
-    try {
-      const reference = await this.getById(id);
-      if (!reference) {
-        return [];
-      }
-      const refMetadata = reference.metadata;
-      const result = this.statements.getSubsequentSiblings.all(
-        library.toLowerCase(),
-        version.toLowerCase(),
-        refMetadata.url,
-        id,
-        JSON.stringify(refMetadata.path),
-        limit
-      );
-      return result.map((row) => mapDbDocumentToDocument(row));
-    } catch (error) {
-      throw new ConnectionError(
-        `Failed to find subsequent sibling chunks for ID ${id}`,
-        error
-      );
-    }
-  }
-  /**
-   * Finds the parent chunk of a given document.
-   */
-  async findParentChunk(library, version, id) {
-    try {
-      const child = await this.getById(id);
-      if (!child) {
-        return null;
-      }
-      const childMetadata = child.metadata;
-      const path2 = childMetadata.path ?? [];
-      const parentPath = path2.slice(0, -1);
-      if (parentPath.length === 0) {
-        return null;
-      }
-      const result = this.statements.getParentChunk.get(
-        library.toLowerCase(),
-        version.toLowerCase(),
-        childMetadata.url,
-        JSON.stringify(parentPath),
-        id
-      );
-      if (!result) {
-        return null;
-      }
-      return mapDbDocumentToDocument(result);
-    } catch (error) {
-      throw new ConnectionError(`Failed to find parent chunk for ID ${id}`, error);
-    }
-  }
-  /**
-   * Fetches multiple documents by their IDs in a single call.
-   * Returns an array of Document objects, sorted by their sort_order.
-   */
-  async findChunksByIds(library, version, ids) {
-    if (!ids.length) return [];
-    try {
-      const placeholders = ids.map(() => "?").join(",");
-      const stmt = this.db.prepare(
-        `SELECT * FROM documents WHERE library = ? AND version = ? AND id IN (${placeholders}) ORDER BY sort_order`
-      );
-      const rows = stmt.all(
-        library.toLowerCase(),
-        version.toLowerCase(),
-        ...ids
-      );
-      return rows.map((row) => mapDbDocumentToDocument(row));
-    } catch (error) {
-      throw new ConnectionError("Failed to fetch documents by IDs", error);
-    }
-  }
-}
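findChunksByIds builds its IN clause by expanding one "?" per id, since better-sqlite3 does not bind arrays directly; the ids are then spread as individual parameters:

    const ids = [3, 7, 12];
    const placeholders = ids.map(() => "?").join(","); // "?,?,?"
    // stmt.all(library, version, ...ids) binds each id to its own placeholder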
-class DocumentManagementService {
-  store;
-  documentRetriever;
-  splitter;
-  /**
-   * Normalizes a version string, converting null or undefined to an empty string
-   * and converting to lowercase.
-   */
-  normalizeVersion(version) {
-    return (version ?? "").toLowerCase();
-  }
-  constructor() {
-    let dbPath;
-    let dbDir;
-    const envStorePath = process.env.DOCS_MCP_STORE_PATH;
-    if (envStorePath) {
-      dbDir = envStorePath;
-      dbPath = path.join(dbDir, "documents.db");
-      logger.debug(`💾 Using database directory from DOCS_MCP_STORE_PATH: ${dbDir}`);
-    } else {
-      const projectRoot2 = getProjectRoot();
-      const oldDbDir = path.join(projectRoot2, ".store");
-      const oldDbPath = path.join(oldDbDir, "documents.db");
-      const oldDbExists = fs$1.existsSync(oldDbPath);
-      if (oldDbExists) {
-        dbPath = oldDbPath;
-        dbDir = oldDbDir;
-        logger.debug(`💾 Using legacy database path: ${dbPath}`);
-      } else {
-        const standardPaths = envPaths("docs-mcp-server", { suffix: "" });
-        dbDir = standardPaths.data;
-        dbPath = path.join(dbDir, "documents.db");
-        logger.debug(`💾 Using standard database directory: ${dbDir}`);
-      }
-    }
-    try {
-      fs$1.mkdirSync(dbDir, { recursive: true });
-    } catch (error) {
-      logger.error(`⚠️ Failed to create database directory ${dbDir}: ${error}`);
-    }
-    this.store = new DocumentStore(dbPath);
-    this.documentRetriever = new DocumentRetrieverService(this.store);
-    const semanticSplitter = new SemanticMarkdownSplitter(
-      SPLITTER_PREFERRED_CHUNK_SIZE,
-      SPLITTER_MAX_CHUNK_SIZE
-    );
-    const greedySplitter = new GreedySplitter(
-      semanticSplitter,
-      SPLITTER_MIN_CHUNK_SIZE,
-      SPLITTER_PREFERRED_CHUNK_SIZE
-    );
-    this.splitter = greedySplitter;
-  }
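The constructor resolves the SQLite file in three steps: an explicit DOCS_MCP_STORE_PATH wins, an existing legacy <projectRoot>/.store/documents.db is kept for backwards compatibility, and otherwise the env-paths data directory is used. A sketch of the first and last branches, omitting the legacy check (the Linux path shown for env-paths is typical, not guaranteed):

    import path from "node:path";
    import envPaths from "env-paths";

    // e.g. ~/.local/share/docs-mcp-server/documents.db on Linux when the env
    // variable is unset and no legacy .store directory exists.
    const dbPath = process.env.DOCS_MCP_STORE_PATH
      ? path.join(process.env.DOCS_MCP_STORE_PATH, "documents.db")
      : path.join(envPaths("docs-mcp-server", { suffix: "" }).data, "documents.db");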
-  /**
-   * Initializes the underlying document store.
-   */
-  async initialize() {
-    await this.store.initialize();
-  }
-  /**
-   * Shuts down the underlying document store.
-   */
-  async shutdown() {
-    logger.info("🔌 Shutting down store manager");
-    await this.store.shutdown();
-  }
-  /**
-   * Validates if a library exists in the store (either versioned or unversioned).
-   * Throws LibraryNotFoundError with suggestions if the library is not found.
-   * @param library The name of the library to validate.
-   * @throws {LibraryNotFoundError} If the library does not exist.
-   */
-  async validateLibraryExists(library) {
-    logger.info(`🔎 Validating existence of library: ${library}`);
-    const normalizedLibrary = library.toLowerCase();
-    const versions = await this.listVersions(normalizedLibrary);
-    const hasUnversioned = await this.exists(normalizedLibrary, "");
-    if (versions.length === 0 && !hasUnversioned) {
-      logger.warn(`⚠️ Library '${library}' not found.`);
-      const allLibraries = await this.listLibraries();
-      const libraryNames = allLibraries.map((lib) => lib.library);
-      let suggestions = [];
-      if (libraryNames.length > 0) {
-        const fuse = new Fuse(libraryNames, {
-          // Configure fuse.js options if needed (e.g., threshold)
-          // isCaseSensitive: false, // Handled by normalizing library names
-          // includeScore: true,
-          threshold: 0.4
-          // Adjust threshold for desired fuzziness (0=exact, 1=match anything)
-        });
-        const results = fuse.search(normalizedLibrary);
-        suggestions = results.slice(0, 3).map((result) => result.item);
-        logger.info(`🔍 Found suggestions: ${suggestions.join(", ")}`);
-      }
-      throw new LibraryNotFoundError(library, suggestions);
-    }
-    logger.info(`✅ Library '${library}' confirmed to exist.`);
-  }
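The "did you mean" suggestions come from a straightforward Fuse.js fuzzy search over the known library names; with sample names:

    import Fuse from "fuse.js";

    const libraryNames = ["react", "redux", "remix"];
    const fuse = new Fuse(libraryNames, { threshold: 0.4 });
    const suggestions = fuse.search("reakt").slice(0, 3).map((result) => result.item);
    console.log(suggestions); // likely ["react"]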
-  /**
-   * Returns a list of all available semantic versions for a library.
-   */
-  async listVersions(library) {
-    const versions = await this.store.queryUniqueVersions(library);
-    return versions.filter((v) => semver__default.valid(v)).map((version) => ({ version }));
-  }
-  /**
-   * Checks if documents exist for a given library and optional version.
-   * If version is omitted, checks for documents without a specific version.
-   */
-  async exists(library, version) {
-    const normalizedVersion = this.normalizeVersion(version);
-    return this.store.checkDocumentExists(library, normalizedVersion);
-  }
-  /**
-   * Finds the most appropriate version of documentation based on the requested version.
-   * When no target version is specified, returns the latest version.
-   *
-   * Version matching behavior:
-   * - Exact versions (e.g., "18.0.0"): Matches that version or any earlier version
-   * - X-Range patterns (e.g., "5.x", "5.2.x"): Matches within the specified range
-   * - "latest" or no version: Returns the latest available version
-   *
-   * For documentation, we prefer matching older versions over no match at all,
-   * since older docs are often still relevant and useful.
-   * Also checks if unversioned documents exist for the library.
-   */
-  async findBestVersion(library, targetVersion) {
-    logger.info(
-      `🔍 Finding best version for ${library}${targetVersion ? `@${targetVersion}` : ""}`
-    );
-    const hasUnversioned = await this.store.checkDocumentExists(library, "");
-    const validSemverVersions = await this.listVersions(library);
-    if (validSemverVersions.length === 0) {
-      if (hasUnversioned) {
-        logger.info(`ℹ️ Unversioned documents exist for ${library}`);
-        return { bestMatch: null, hasUnversioned: true };
-      }
-      logger.warn(`⚠️ No valid versions found for ${library}`);
-      const allLibraryDetails = await this.store.queryLibraryVersions();
-      const libraryDetails = allLibraryDetails.get(library) ?? [];
-      throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
-    }
-    const versionStrings = validSemverVersions.map((v) => v.version);
-    let bestMatch = null;
-    if (!targetVersion || targetVersion === "latest") {
-      bestMatch = semver__default.maxSatisfying(versionStrings, "*");
-    } else {
-      const versionRegex = /^(\d+)(?:\.(?:x(?:\.x)?|\d+(?:\.(?:x|\d+))?))?$|^$/;
-      if (!versionRegex.test(targetVersion)) {
-        logger.warn(`⚠️ Invalid target version format: ${targetVersion}`);
-      } else {
-        let range = targetVersion;
-        if (!semver__default.validRange(targetVersion)) {
-          range = `~${targetVersion}`;
-        } else if (semver__default.valid(targetVersion)) {
-          range = `${range} || <=${targetVersion}`;
-        }
-        bestMatch = semver__default.maxSatisfying(versionStrings, range);
-      }
-    }
-    if (bestMatch) {
-      logger.info(
-        `✅ Found best match version ${bestMatch} for ${library}@${targetVersion}`
-      );
-    } else {
-      logger.warn(`⚠️ No matching semver version found for ${library}@${targetVersion}`);
-    }
-    if (!bestMatch && !hasUnversioned) {
-      const allLibraryDetails = await this.store.queryLibraryVersions();
-      const libraryDetails = allLibraryDetails.get(library) ?? [];
-      throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
-    }
-    return { bestMatch, hasUnversioned };
-  }
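Concretely, for an exact target the constructed range `X || <=X` makes semver fall back to the requested version or anything older, matching the docblock's "that version or any earlier version". With illustrative versions:

    import semver from "semver";

    const available = ["17.0.2", "18.0.0", "18.2.0"];
    console.log(semver.maxSatisfying(available, "*"));                  // "18.2.0" (latest)
    console.log(semver.maxSatisfying(available, "18.0.0 || <=18.0.0")); // "18.0.0"
    console.log(semver.maxSatisfying(available, "~5.2.0"));             // null (no match)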
-  /**
-   * Removes all documents for a specific library and optional version.
-   * If version is omitted, removes documents without a specific version.
-   */
-  async removeAllDocuments(library, version) {
-    const normalizedVersion = this.normalizeVersion(version);
-    logger.info(
-      `🗑️ Removing all documents from ${library}@${normalizedVersion || "[no version]"} store`
-    );
-    const count = await this.store.deleteDocuments(library, normalizedVersion);
-    logger.info(`📊 Deleted ${count} documents`);
-  }
-  /**
-   * Adds a document to the store, splitting it into smaller chunks for better search results.
-   * Uses SemanticMarkdownSplitter to maintain markdown structure and content types during splitting.
-   * Preserves hierarchical structure of documents and distinguishes between text and code segments.
-   * If version is omitted, the document is added without a specific version.
-   */
-  async addDocument(library, version, document) {
-    const normalizedVersion = this.normalizeVersion(version);
-    const url = document.metadata.url;
-    if (!url || typeof url !== "string" || !url.trim()) {
-      throw new StoreError("Document metadata must include a valid URL");
-    }
-    logger.info(`📚 Adding document: ${document.metadata.title}`);
-    if (!document.pageContent.trim()) {
-      throw new Error("Document content cannot be empty");
-    }
-    const chunks = await this.splitter.splitText(document.pageContent);
-    const splitDocs = chunks.map((chunk) => ({
-      pageContent: chunk.content,
-      metadata: {
-        ...document.metadata,
-        level: chunk.section.level,
-        path: chunk.section.path
-      }
-    }));
-    logger.info(`📄 Split document into ${splitDocs.length} chunks`);
-    await this.store.addDocuments(library, normalizedVersion, splitDocs);
-  }
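Each chunk handed to the store keeps the page's metadata plus its position in the heading hierarchy, which is what the path-based navigation queries earlier in this file rely on. A shape sketch with illustrative values:

    const splitDoc = {
      pageContent: "## Installation\nRun npm install...",
      metadata: {
        title: "Getting Started",                 // inherited from the page
        url: "https://example.com/start",         // inherited from the page
        level: 2,                                 // chunk.section.level
        path: ["Getting Started", "Installation"] // chunk.section.path
      }
    };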
-  /**
-   * Searches for documentation content across versions.
-   * Uses hybrid search (vector + FTS).
-   * If version is omitted, searches documents without a specific version.
-   */
-  async searchStore(library, version, query, limit = 5) {
-    const normalizedVersion = this.normalizeVersion(version);
-    return this.documentRetriever.search(library, normalizedVersion, query, limit);
-  }
-  async listLibraries() {
-    const libraryMap = await this.store.queryLibraryVersions();
-    return Array.from(libraryMap.entries()).map(([library, versions]) => ({
-      library,
-      versions
-      // The versions array already contains LibraryVersionDetails
-    }));
-  }
-}
-export {
-  DocumentManagementService as D,
-  FileFetcher as F,
-  HttpFetcher as H,
-  LibraryNotFoundError as L,
-  MarkdownPipeline as M,
-  PipelineJobStatus as P,
-  SearchTool as S,
-  ToolError as T,
-  VersionNotFoundError as V,
-  PipelineManager as a,
-  DEFAULT_MAX_DEPTH$1 as b,
-  DEFAULT_MAX_PAGES$1 as c,
-  LogLevel as d,
-  ScrapeTool as e,
-  ListLibrariesTool as f,
-  DEFAULT_PROTOCOL as g,
-  DEFAULT_HTTP_PORT as h,
-  DEFAULT_MAX_CONCURRENCY as i,
-  ScrapeMode as j,
-  HtmlPipeline as k,
-  logger as l,
-  ScraperError as m,
-  createJSDOM as n,
-  getProjectRoot as o,
-  DEFAULT_WEB_PORT as p,
-  DimensionError as q,
-  VECTOR_DIMENSION as r,
-  setLogLevel as s
-};
-//# sourceMappingURL=DocumentManagementService-BGW9iWNn.js.map