@arabold/docs-mcp-server 1.13.0 → 1.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3481 +0,0 @@
- import { v4 } from "uuid";
- import psl from "psl";
- import axios from "axios";
- import { HeaderGenerator } from "header-generator";
- import fs from "node:fs/promises";
- import path from "node:path";
- import * as cheerio from "cheerio";
- import "node:vm";
- import { VirtualConsole, JSDOM } from "jsdom";
- import { chromium } from "playwright";
- import { gfm } from "@joplin/turndown-plugin-gfm";
- import TurndownService from "turndown";
- import { TextDecoder } from "node:util";
- import { URL as URL$1, fileURLToPath } from "node:url";
- import * as semver from "semver";
- import semver__default from "semver";
- import fs$1 from "node:fs";
- import envPaths from "env-paths";
- import Fuse from "fuse.js";
- import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
- import remarkGfm from "remark-gfm";
- import remarkHtml from "remark-html";
- import remarkParse from "remark-parse";
- import { unified } from "unified";
- import Database from "better-sqlite3";
- import * as sqliteVec from "sqlite-vec";
- var LogLevel = /* @__PURE__ */ ((LogLevel2) => {
- LogLevel2[LogLevel2["ERROR"] = 0] = "ERROR";
- LogLevel2[LogLevel2["WARN"] = 1] = "WARN";
- LogLevel2[LogLevel2["INFO"] = 2] = "INFO";
- LogLevel2[LogLevel2["DEBUG"] = 3] = "DEBUG";
- return LogLevel2;
- })(LogLevel || {});
- let currentLogLevel = 2;
- function setLogLevel(level) {
- currentLogLevel = level;
- }
- const logger = {
- /**
- * Logs a debug message if the current log level is DEBUG or higher.
- * @param message - The message to log.
- */
- debug: (message) => {
- if (currentLogLevel >= 3) {
- console.debug(message);
- }
- },
- /**
- * Logs an info message if the current log level is INFO or higher.
- * @param message - The message to log.
- */
- info: (message) => {
- if (currentLogLevel >= 2) {
- console.log(message);
- }
- },
- /**
- * Logs a warning message if the current log level is WARN or higher.
- * @param message - The message to log.
- */
- warn: (message) => {
- if (currentLogLevel >= 1) {
- console.warn(message);
- }
- },
- /**
- * Logs an error message if the current log level is ERROR or higher (always logs).
- * @param message - The message to log.
- */
- error: (message) => {
- if (currentLogLevel >= 0) {
- console.error(message);
- }
- }
- };
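
The logger above gates console output by a module-level threshold set via setLogLevel. A minimal usage sketch (hypothetical; the bundle does not necessarily export these names):

// Hypothetical usage; levels are numeric, lower = more severe.
setLogLevel(LogLevel.DEBUG);  // currentLogLevel = 3: everything prints
logger.debug("cache miss");   // printed, DEBUG (3) <= 3
setLogLevel(LogLevel.WARN);   // currentLogLevel = 1
logger.info("starting up");   // suppressed, INFO (2) > 1
logger.error("fetch failed"); // always printed, ERROR (0) passes every threshold
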
- class ScraperError extends Error {
- constructor(message, isRetryable = false, cause) {
- super(message);
- this.isRetryable = isRetryable;
- this.cause = cause;
- this.name = this.constructor.name;
- if (cause?.stack) {
- this.stack = `${this.stack}
- Caused by: ${cause.stack}`;
- }
- }
- }
- class InvalidUrlError extends ScraperError {
- constructor(url, cause) {
- super(`Invalid URL: ${url}`, false, cause);
- }
- }
- class RedirectError extends ScraperError {
- constructor(originalUrl, redirectUrl, statusCode) {
- super(
- `Redirect detected from ${originalUrl} to ${redirectUrl} (status: ${statusCode})`,
- false
- );
- this.originalUrl = originalUrl;
- this.redirectUrl = redirectUrl;
- this.statusCode = statusCode;
- }
- }
- const defaultNormalizerOptions = {
- ignoreCase: true,
- removeHash: true,
- removeTrailingSlash: true,
- removeQuery: false,
- removeIndex: true
- };
- function normalizeUrl(url, options = defaultNormalizerOptions) {
- try {
- const parsedUrl = new URL(url);
- const finalOptions = { ...defaultNormalizerOptions, ...options };
- const normalized = new URL(parsedUrl.origin + parsedUrl.pathname);
- if (finalOptions.removeIndex) {
- normalized.pathname = normalized.pathname.replace(
- /\/index\.(html|htm|asp|php|jsp)$/i,
- "/"
- );
- }
- if (finalOptions.removeTrailingSlash && normalized.pathname.length > 1) {
- normalized.pathname = normalized.pathname.replace(/\/+$/, "");
- }
- const preservedHash = !finalOptions.removeHash ? parsedUrl.hash : "";
- const preservedSearch = !finalOptions.removeQuery ? parsedUrl.search : "";
- let result = normalized.origin + normalized.pathname;
- if (preservedSearch) {
- result += preservedSearch;
- }
- if (preservedHash) {
- result += preservedHash;
- }
- if (finalOptions.ignoreCase) {
- result = result.toLowerCase();
- }
- return result;
- } catch {
- return url;
- }
- }
- function validateUrl(url) {
- try {
- new URL(url);
- } catch (error) {
- throw new InvalidUrlError(url, error instanceof Error ? error : void 0);
- }
- }
- function hasSameHostname(urlA, urlB) {
- return urlA.hostname.toLowerCase() === urlB.hostname.toLowerCase();
- }
- function hasSameDomain(urlA, urlB) {
- const domainA = psl.get(urlA.hostname.toLowerCase());
- const domainB = psl.get(urlB.hostname.toLowerCase());
- return domainA !== null && domainA === domainB;
- }
- function isSubpath(baseUrl, targetUrl) {
- const basePath = baseUrl.pathname.endsWith("/") ? baseUrl.pathname : `${baseUrl.pathname}/`;
- return targetUrl.pathname.startsWith(basePath);
- }
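
With the defaults above (ignoreCase, removeHash, removeTrailingSlash, and removeIndex on; removeQuery off), normalizeUrl behaves roughly like this (illustrative):

normalizeUrl("https://Example.com/Docs/index.html#intro");
// => "https://example.com/docs"   (index page, hash, and case removed)
normalizeUrl("https://example.com/docs/?page=2");
// => "https://example.com/docs?page=2"   (query kept, trailing slash dropped)
normalizeUrl("not a url");
// => "not a url"   (unparseable input is returned unchanged)
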
- const DEFAULT_MAX_PAGES$1 = 1e3;
- const DEFAULT_MAX_DEPTH$1 = 3;
- const DEFAULT_MAX_CONCURRENCY = 3;
- const DEFAULT_PROTOCOL = "stdio";
- const DEFAULT_HTTP_PORT = 6280;
- const DEFAULT_WEB_PORT = 6281;
- const FETCHER_MAX_RETRIES = 6;
- const FETCHER_BASE_DELAY = 1e3;
- const SPLITTER_MIN_CHUNK_SIZE = 500;
- const SPLITTER_PREFERRED_CHUNK_SIZE = 1500;
- const SPLITTER_MAX_CHUNK_SIZE = 5e3;
- const EMBEDDING_BATCH_SIZE = 300;
- class MimeTypeUtils {
- /**
- * Parses a Content-Type header string into its MIME type and charset.
- * @param contentTypeHeader The Content-Type header string (e.g., "text/html; charset=utf-8").
- * @returns A ParsedContentType object, or a default if parsing fails.
- */
- static parseContentType(contentTypeHeader) {
- if (!contentTypeHeader) {
- return { mimeType: "application/octet-stream" };
- }
- const parts = contentTypeHeader.split(";").map((part) => part.trim());
- const mimeType = parts[0].toLowerCase();
- let charset;
- for (let i = 1; i < parts.length; i++) {
- const param = parts[i];
- if (param.toLowerCase().startsWith("charset=")) {
- charset = param.substring("charset=".length).toLowerCase();
- break;
- }
- }
- return { mimeType, charset };
- }
- /**
- * Checks if a MIME type represents HTML content.
- */
- static isHtml(mimeType) {
- return mimeType === "text/html" || mimeType === "application/xhtml+xml";
- }
- /**
- * Checks if a MIME type represents Markdown content.
- */
- static isMarkdown(mimeType) {
- return mimeType === "text/markdown" || mimeType === "text/x-markdown";
- }
- /**
- * Checks if a MIME type represents plain text content.
- */
- static isText(mimeType) {
- return mimeType.startsWith("text/");
- }
- // Extend with more helpers as needed (isJson, isXml, isPdf, etc.)
- }
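
parseContentType splits the header on ";", lowercases the MIME type, and picks up an optional charset parameter. Illustrative calls:

MimeTypeUtils.parseContentType("text/HTML; charset=UTF-8");
// => { mimeType: "text/html", charset: "utf-8" }
MimeTypeUtils.parseContentType(undefined);
// => { mimeType: "application/octet-stream" }
MimeTypeUtils.isHtml("application/xhtml+xml"); // => true
MimeTypeUtils.isText("text/markdown");         // => true (any text/* prefix)
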
- class FingerprintGenerator {
- headerGenerator;
- /**
- * Creates an instance of FingerprintGenerator.
- * @param options Optional configuration for the header generator.
- */
- constructor(options) {
- const defaultOptions = {
- browsers: [{ name: "chrome", minVersion: 100 }, "firefox", "safari"],
- devices: ["desktop", "mobile"],
- operatingSystems: ["windows", "linux", "macos", "android", "ios"],
- locales: ["en-US", "en"],
- httpVersion: "2"
- };
- this.headerGenerator = new HeaderGenerator({
- ...defaultOptions,
- ...options
- });
- }
- /**
- * Generates a set of realistic HTTP headers.
- * @returns A set of realistic HTTP headers.
- */
- generateHeaders() {
- return this.headerGenerator.getHeaders();
- }
- }
- class HttpFetcher {
- retryableStatusCodes = [
- 408,
- // Request Timeout
- 429,
- // Too Many Requests
- 500,
- // Internal Server Error
- 502,
- // Bad Gateway
- 503,
- // Service Unavailable
- 504,
- // Gateway Timeout
- 525
- // SSL Handshake Failed (Cloudflare specific)
- ];
- fingerprintGenerator;
- constructor() {
- this.fingerprintGenerator = new FingerprintGenerator();
- }
- canFetch(source) {
- return source.startsWith("http://") || source.startsWith("https://");
- }
- async delay(ms) {
- return new Promise((resolve) => setTimeout(resolve, ms));
- }
- async fetch(source, options) {
- const maxRetries = options?.maxRetries ?? FETCHER_MAX_RETRIES;
- const baseDelay = options?.retryDelay ?? FETCHER_BASE_DELAY;
- const followRedirects = options?.followRedirects ?? true;
- for (let attempt = 0; attempt <= maxRetries; attempt++) {
- try {
- const fingerprint = this.fingerprintGenerator.generateHeaders();
- const headers = {
- ...fingerprint,
- ...options?.headers
- // User-provided headers override generated ones
- };
- const config = {
- responseType: "arraybuffer",
- // For handling both text and binary
- headers,
- timeout: options?.timeout,
- signal: options?.signal,
- // Pass signal to axios
- // Axios follows redirects by default, we need to explicitly disable it if needed
- maxRedirects: followRedirects ? 5 : 0
- };
- const response = await axios.get(source, config);
- const contentTypeHeader = response.headers["content-type"];
- const { mimeType, charset } = MimeTypeUtils.parseContentType(contentTypeHeader);
- const contentEncoding = response.headers["content-encoding"];
- return {
- content: response.data,
- mimeType,
- charset,
- encoding: contentEncoding,
- source
- };
- } catch (error) {
- const axiosError = error;
- const status = axiosError.response?.status;
- const code = axiosError.code;
- if (!followRedirects && status && status >= 300 && status < 400) {
- const location = axiosError.response?.headers?.location;
- if (location) {
- throw new RedirectError(source, location, status);
- }
- }
- if (attempt < maxRetries && (status === void 0 || this.retryableStatusCodes.includes(status))) {
- const delay = baseDelay * 2 ** attempt;
- logger.warn(
- `Attempt ${attempt + 1}/${maxRetries + 1} failed for ${source} (Status: ${status}, Code: ${code}). Retrying in ${delay}ms...`
- );
- await this.delay(delay);
- continue;
- }
- throw new ScraperError(
- `Failed to fetch ${source} after ${attempt + 1} attempts: ${axiosError.message ?? "Unknown error"}`,
- true,
- error instanceof Error ? error : void 0
- );
- }
- }
- throw new ScraperError(
- `Failed to fetch ${source} after ${maxRetries + 1} attempts`,
- true
- );
- }
- }
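
HttpFetcher retries network errors (status undefined) and the retryable status codes above with exponential backoff, delay = retryDelay * 2 ** attempt, so the defaults (6 retries, 1000 ms base) wait 1 s, 2 s, 4 s, 8 s, 16 s, then 32 s between up to seven attempts. A hedged usage sketch (hypothetical caller):

const fetcher = new HttpFetcher();
if (fetcher.canFetch("https://example.com/docs")) {
  const raw = await fetcher.fetch("https://example.com/docs", {
    maxRetries: 2,          // overrides FETCHER_MAX_RETRIES
    followRedirects: false, // a 3xx with a Location header now throws RedirectError
  });
  console.log(raw.mimeType, raw.charset); // e.g. "text/html", "utf-8"
}
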
- class FileFetcher {
- canFetch(source) {
- return source.startsWith("file://");
- }
- async fetch(source, options) {
- const filePath = source.replace(/^file:\/\//, "");
- logger.info(`Fetching file: ${filePath}`);
- try {
- const content = await fs.readFile(filePath);
- const ext = path.extname(filePath).toLowerCase();
- const mimeType = this.getMimeType(ext);
- return {
- content,
- mimeType,
- source,
- encoding: "utf-8"
- // Assume UTF-8 for text files
- };
- } catch (error) {
- throw new ScraperError(
- `Failed to read file ${filePath}: ${error.message ?? "Unknown error"}`,
- false,
- error instanceof Error ? error : void 0
- );
- }
- }
- getMimeType(ext) {
- switch (ext) {
- case ".html":
- case ".htm":
- return "text/html";
- case ".md":
- return "text/markdown";
- case ".txt":
- return "text/plain";
- default:
- return "application/octet-stream";
- }
- }
- }
- class HtmlCheerioParserMiddleware {
- async process(context, next) {
- try {
- logger.debug(`Parsing HTML content with Cheerio from ${context.source}`);
- const $ = cheerio.load(context.content);
- context.dom = $;
- await next();
- } catch (error) {
- logger.error(`Failed to parse HTML with Cheerio for ${context.source}: ${error}`);
- context.errors.push(
- error instanceof Error ? error : new Error(`Cheerio HTML parsing failed: ${String(error)}`)
- );
- return;
- }
- }
- }
- function createJSDOM(html, options) {
- const virtualConsole = new VirtualConsole();
- virtualConsole.on("error", () => {
- });
- virtualConsole.on("warn", () => {
- });
- virtualConsole.on("info", () => {
- });
- virtualConsole.on("debug", () => {
- });
- virtualConsole.on("log", () => {
- });
- const defaultOptions = {
- virtualConsole
- };
- const finalOptions = { ...defaultOptions, ...options };
- return new JSDOM(html, finalOptions);
- }
- class HtmlLinkExtractorMiddleware {
- /**
- * Processes the context to extract links from the sanitized HTML body.
- * @param context The current middleware context.
- * @param next Function to call the next middleware.
- */
- async process(context, next) {
- const $ = context.dom;
- if (!$) {
- logger.warn(
- `Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`
- );
- await next();
- return;
- }
- try {
- const linkElements = $("a[href]");
- logger.debug(`Found ${linkElements.length} potential links in ${context.source}`);
- const extractedLinks = [];
- linkElements.each((index, element) => {
- const href = $(element).attr("href");
- if (href && href.trim() !== "") {
- try {
- const urlObj = new URL(href, context.source);
- if (!["http:", "https:", "file:"].includes(urlObj.protocol)) {
- logger.debug(`Ignoring link with invalid protocol: ${href}`);
- return;
- }
- extractedLinks.push(urlObj.href);
- } catch (e) {
- logger.debug(`Ignoring invalid URL syntax: ${href}`);
- }
- }
- });
- context.links = [...new Set(extractedLinks)];
- logger.debug(
- `Extracted ${context.links.length} unique, valid links from ${context.source}`
- );
- } catch (error) {
- logger.error(`Error extracting links from ${context.source}: ${error}`);
- context.errors.push(
- new Error(
- `Failed to extract links from HTML: ${error instanceof Error ? error.message : String(error)}`
- )
- );
- }
- await next();
- }
- }
- class HtmlMetadataExtractorMiddleware {
- /**
- * Processes the context to extract the HTML title.
- * @param context The current processing context.
- * @param next Function to call the next middleware.
- */
- async process(context, next) {
- const $ = context.dom;
- if (!$) {
- logger.warn(
- `Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`
- );
- await next();
- return;
- }
- try {
- let title = $("title").first().text().trim();
- if (!title) {
- title = $("h1").first().text().trim();
- }
- title = title || "Untitled";
- title = title.replace(/\s+/g, " ").trim();
- context.metadata.title = title;
- logger.debug(`Extracted title: "${title}" from ${context.source}`);
- } catch (error) {
- logger.error(`Error extracting metadata from ${context.source}: ${error}`);
- context.errors.push(
- new Error(
- `Failed to extract metadata from HTML: ${error instanceof Error ? error.message : String(error)}`
- )
- );
- }
- await next();
- }
- }
- var ScrapeMode = /* @__PURE__ */ ((ScrapeMode2) => {
- ScrapeMode2["Fetch"] = "fetch";
- ScrapeMode2["Playwright"] = "playwright";
- ScrapeMode2["Auto"] = "auto";
- return ScrapeMode2;
- })(ScrapeMode || {});
- class HtmlPlaywrightMiddleware {
- browser = null;
- /**
- * Initializes the Playwright browser instance.
- * Consider making this more robust (e.g., lazy initialization, singleton).
- */
- async ensureBrowser() {
- if (!this.browser || !this.browser.isConnected()) {
- const launchArgs = process.env.PLAYWRIGHT_LAUNCH_ARGS?.split(" ") ?? [];
- logger.debug(
- `Launching new Playwright browser instance (Chromium) with args: ${launchArgs.join(" ") || "none"}...`
- );
- this.browser = await chromium.launch({ channel: "chromium", args: launchArgs });
- this.browser.on("disconnected", () => {
- logger.debug("Playwright browser instance disconnected.");
- this.browser = null;
- });
- }
- return this.browser;
- }
- /**
- * Closes the Playwright browser instance if it exists.
- * Should be called during application shutdown.
- */
- async closeBrowser() {
- if (this.browser?.isConnected()) {
- logger.debug("Closing Playwright browser instance...");
- await this.browser.close();
- this.browser = null;
- }
- }
- async process(context, next) {
- const scrapeMode = context.options?.scrapeMode ?? ScrapeMode.Auto;
- const shouldRunPlaywright = scrapeMode === ScrapeMode.Playwright || scrapeMode === ScrapeMode.Auto;
- if (!shouldRunPlaywright) {
- logger.debug(
- `Skipping Playwright rendering for ${context.source} as scrapeMode is '${scrapeMode}'.`
- );
- await next();
- return;
- }
- logger.debug(
- `Running Playwright rendering for ${context.source} (scrapeMode: '${scrapeMode}')`
- );
- let page = null;
- let renderedHtml = null;
- try {
- const browser = await this.ensureBrowser();
- page = await browser.newPage();
- logger.debug(`Playwright: Processing ${context.source}`);
- await page.route("**/*", (route) => {
- if (route.request().url() === context.source) {
- return route.fulfill({
- status: 200,
- contentType: "text/html",
- body: context.content
- });
- }
- const resourceType = route.request().resourceType();
- if (["image", "stylesheet", "font", "media"].includes(resourceType)) {
- return route.abort();
- }
- return route.continue();
- });
- await page.goto(context.source, {
- waitUntil: "load"
- });
- renderedHtml = await page.content();
- logger.debug(`Playwright: Successfully rendered content for ${context.source}`);
- } catch (error) {
- logger.error(`Playwright failed to render ${context.source}: ${error}`);
- context.errors.push(
- error instanceof Error ? error : new Error(`Playwright rendering failed: ${String(error)}`)
- );
- } finally {
- if (page) {
- await page.unroute("**/*");
- await page.close();
- }
- }
- if (renderedHtml !== null) {
- context.content = renderedHtml;
- logger.debug(
- `Playwright middleware updated content for ${context.source}. Proceeding.`
- );
- } else {
- logger.warn(
- `Playwright rendering resulted in null content for ${context.source}. Proceeding without content update.`
- );
- }
- await next();
- }
- }
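
The Playwright middleware only runs for scrapeMode 'playwright' or 'auto'; its route handler replays the already-fetched HTML for the page URL itself and aborts image, stylesheet, font, and media requests, so only scripts and data requests reach the network. A sketch of driving it directly (hypothetical context object, matching the fields the middleware reads and writes):

const mw = new HtmlPlaywrightMiddleware();
const context = {
  source: "https://example.com/app",
  content: '<html><body><div id="root"></div></body></html>', // pre-fetched HTML
  options: { scrapeMode: ScrapeMode.Auto },
  errors: [],
};
await mw.process(context, async () => {
  // next(): context.content now holds the browser-rendered markup (on success)
});
await mw.closeBrowser(); // release the shared Chromium instance on shutdown
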
- class HtmlSanitizerMiddleware {
- // Default selectors to remove
- defaultSelectorsToRemove = [
- "nav",
- "footer",
- "script",
- "style",
- "noscript",
- "svg",
- "link",
- "meta",
- "iframe",
- "header",
- "button",
- "input",
- "textarea",
- "select",
- // "form", // Keep commented
- ".ads",
- ".advertisement",
- ".banner",
- ".cookie-banner",
- ".cookie-consent",
- ".hidden",
- ".hide",
- ".modal",
- ".nav-bar",
- ".overlay",
- ".popup",
- ".promo",
- ".mw-editsection",
- ".side-bar",
- ".social-share",
- ".sticky",
- "#ads",
- "#banner",
- "#cookieBanner",
- "#modal",
- "#nav",
- "#overlay",
- "#popup",
- "#sidebar",
- "#socialMediaBox",
- "#stickyHeader",
- "#ad-container",
- ".ad-container",
- ".login-form",
- ".signup-form",
- ".tooltip",
- ".dropdown-menu",
- // ".alert", // Keep commented
- ".breadcrumb",
- ".pagination",
- // '[role="alert"]', // Keep commented
- '[role="banner"]',
- '[role="dialog"]',
- '[role="alertdialog"]',
- '[role="region"][aria-label*="skip" i]',
- '[aria-modal="true"]',
- ".noprint"
- ];
- async process(context, next) {
- const $ = context.dom;
- if (!$) {
- logger.warn(
- `Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`
- );
- await next();
- return;
- }
- try {
- const selectorsToRemove = [
- ...context.options.excludeSelectors || [],
- // Use options from the context
- ...this.defaultSelectorsToRemove
- ];
- logger.debug(
- `Removing elements matching ${selectorsToRemove.length} selectors for ${context.source}`
- );
- let removedCount = 0;
- for (const selector of selectorsToRemove) {
- try {
- const elements = $(selector);
- const count = elements.length;
- if (count > 0) {
- elements.remove();
- removedCount += count;
- }
- } catch (selectorError) {
- logger.warn(
- `Potentially invalid selector "${selector}" during element removal: ${selectorError}`
- );
- context.errors.push(
- new Error(`Invalid selector "${selector}": ${selectorError}`)
- );
- }
- }
- logger.debug(`Removed ${removedCount} elements for ${context.source}`);
- } catch (error) {
- logger.error(`Error during HTML element removal for ${context.source}: ${error}`);
- context.errors.push(
- error instanceof Error ? error : new Error(`HTML element removal failed: ${String(error)}`)
- );
- }
- await next();
- }
- }
- class HtmlToMarkdownMiddleware {
- turndownService;
- constructor() {
- this.turndownService = new TurndownService({
- headingStyle: "atx",
- hr: "---",
- bulletListMarker: "-",
- codeBlockStyle: "fenced",
- emDelimiter: "_",
- strongDelimiter: "**",
- linkStyle: "inlined"
- });
- this.turndownService.use(gfm);
- this.addCustomRules();
- }
- addCustomRules() {
- this.turndownService.addRule("pre", {
- filter: ["pre"],
- replacement: (content, node) => {
- const element = node;
- let language = element.getAttribute("data-language") || "";
- if (!language) {
- const highlightElement = element.closest(
- '[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]'
- ) || element.querySelector(
- '[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]'
- );
- if (highlightElement) {
- const className = highlightElement.className;
- const match = className.match(
- /(?:highlight-source-|highlight-|language-)(\w+)/
- );
- if (match) language = match[1];
- }
- }
- const brElements = Array.from(element.querySelectorAll("br"));
- for (const br of brElements) {
- br.replaceWith("\n");
- }
- const text = element.textContent || "";
- return `
- \`\`\`${language}
- ${text.replace(/^\n+|\n+$/g, "")}
- \`\`\`
- `;
- }
- });
- this.turndownService.addRule("anchor", {
- filter: ["a"],
- replacement: (content, node) => {
- const href = node.getAttribute("href");
- if (!content || content === "#") {
- return "";
- }
- if (!href) {
- return content;
- }
- return `[${content}](${href})`;
- }
- });
- }
- /**
- * Processes the context to convert the sanitized HTML body node to Markdown.
- * @param context The current processing context.
- * @param next Function to call the next middleware.
- */
- async process(context, next) {
- const $ = context.dom;
- if (!$) {
- logger.warn(
- `Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware ran correctly.`
- );
- await next();
- return;
- }
- try {
- logger.debug(`Converting HTML content to Markdown for ${context.source}`);
- const htmlToConvert = $("body").html() || $.html();
- const markdown = this.turndownService.turndown(htmlToConvert).trim();
- if (!markdown) {
- const warnMsg = `HTML to Markdown conversion resulted in empty content for ${context.source}.`;
- logger.warn(warnMsg);
- context.content = "";
- } else {
- context.content = markdown;
- logger.debug(`Successfully converted HTML to Markdown for ${context.source}`);
- }
- } catch (error) {
- logger.error(`Error converting HTML to Markdown for ${context.source}: ${error}`);
- context.errors.push(
- new Error(
- `Failed to convert HTML to Markdown: ${error instanceof Error ? error.message : String(error)}`
- )
- );
- }
- await next();
- }
- }
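
Given the Turndown settings above (atx headings, fenced code blocks, "-" bullets) and the custom pre rule that recovers a language from data-language or highlight-*/language-* class names, a snippet converts roughly like this (illustrative only):

// Input HTML:
//   <h2>Install</h2>
//   <div class="highlight-source-shell"><pre>npm i foo</pre></div>
// Output Markdown (approximately):
//   ## Install
//
//   ```shell
//   npm i foo
//   ```
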
- class MarkdownLinkExtractorMiddleware {
- /**
- * Processes the context. Currently a no-op regarding link extraction.
- * @param context The current processing context.
- * @param next Function to call the next middleware.
- */
- async process(context, next) {
- if (!Array.isArray(context.links)) {
- context.links = [];
- }
- await next();
- }
- }
- class MarkdownMetadataExtractorMiddleware {
- /**
- * Processes the context to extract the title from Markdown.
- * @param context The current processing context.
- * @param next Function to call the next middleware.
- */
- async process(context, next) {
- try {
- let title = "Untitled";
- const match = context.content.match(/^#\s+(.*)$/m);
- if (match?.[1]) {
- title = match[1].trim();
- }
- context.metadata.title = title;
- } catch (error) {
- context.errors.push(
- new Error(
- `Failed to extract metadata from Markdown: ${error instanceof Error ? error.message : String(error)}`
- )
- );
- }
- await next();
- }
- }
- function convertToString(content, charset) {
- if (Buffer.isBuffer(content)) {
- const decoder = new TextDecoder(charset || "utf-8");
- return decoder.decode(content);
- }
- return content;
- }
- class BasePipeline {
- /**
- * Determines if this pipeline can process the given content.
- * Must be implemented by derived classes.
- */
- canProcess(_rawContent) {
- throw new Error("Method not implemented.");
- }
- /**
- * Processes the raw content through the pipeline.
- * Must be implemented by derived classes.
- */
- async process(_rawContent, _options, _fetcher) {
- throw new Error("Method not implemented.");
- }
- /**
- * Executes a middleware stack on the given context.
- * This is a utility method used by derived pipeline classes.
- *
- * @param middleware - The middleware stack to execute
- * @param context - The context to process
- */
- async executeMiddlewareStack(middleware, context) {
- let index = -1;
- const dispatch = async (i) => {
- if (i <= index) throw new Error("next() called multiple times");
- index = i;
- const mw = middleware[i];
- if (!mw) return;
- await mw.process(context, dispatch.bind(null, i + 1));
- };
- try {
- await dispatch(0);
- } catch (error) {
- context.errors.push(error instanceof Error ? error : new Error(String(error)));
- }
- }
- /**
- * Cleans up resources when the pipeline is no longer needed.
- * Default implementation does nothing.
- */
- async close() {
- }
- }
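
executeMiddlewareStack is the familiar koa-style onion: each middleware gets the shared context plus a next() that dispatches the following middleware, and calling next() twice in the same frame throws. Any object with an async process(context, next) fits, e.g. this hypothetical timing middleware:

class TimingMiddleware {
  async process(context, next) {
    const started = Date.now();
    await next(); // run the rest of the stack first
    console.log(`${context.source} processed in ${Date.now() - started} ms`);
  }
}
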
- class HtmlPipeline extends BasePipeline {
- playwrightMiddleware;
- standardMiddleware;
- constructor() {
- super();
- this.playwrightMiddleware = new HtmlPlaywrightMiddleware();
- this.standardMiddleware = [
- new HtmlCheerioParserMiddleware(),
- new HtmlMetadataExtractorMiddleware(),
- new HtmlLinkExtractorMiddleware(),
- new HtmlSanitizerMiddleware(),
- new HtmlToMarkdownMiddleware()
- ];
- }
- canProcess(rawContent) {
- return MimeTypeUtils.isHtml(rawContent.mimeType);
- }
- async process(rawContent, options, fetcher) {
- const contentString = convertToString(rawContent.content, rawContent.charset);
- const context = {
- content: contentString,
- source: rawContent.source,
- metadata: {},
- links: [],
- errors: [],
- options,
- fetcher
- };
- let middleware = [...this.standardMiddleware];
- if (options.scrapeMode === "playwright" || options.scrapeMode === "auto") {
- middleware = [this.playwrightMiddleware, ...middleware];
- }
- await this.executeMiddlewareStack(middleware, context);
- return {
- textContent: typeof context.content === "string" ? context.content : "",
- metadata: context.metadata,
- links: context.links,
- errors: context.errors
- };
- }
- async close() {
- await this.playwrightMiddleware.closeBrowser();
- }
- }
- class MarkdownPipeline extends BasePipeline {
- middleware;
- constructor() {
- super();
- this.middleware = [
- new MarkdownMetadataExtractorMiddleware(),
- new MarkdownLinkExtractorMiddleware()
- ];
- }
- canProcess(rawContent) {
- if (!rawContent.mimeType) return false;
- return MimeTypeUtils.isMarkdown(rawContent.mimeType) || MimeTypeUtils.isText(rawContent.mimeType);
- }
- async process(rawContent, options, fetcher) {
- const contentString = convertToString(rawContent.content, rawContent.charset);
- const context = {
- content: contentString,
- source: rawContent.source,
- metadata: {},
- links: [],
- errors: [],
- options,
- fetcher
- };
- await this.executeMiddlewareStack(this.middleware, context);
- return {
- textContent: typeof context.content === "string" ? context.content : "",
- metadata: context.metadata,
- links: context.links,
- errors: context.errors
- };
- }
- async close() {
- }
- }
- class PipelineError extends Error {
- constructor(message, cause) {
- super(message);
- this.cause = cause;
- this.name = this.constructor.name;
- if (cause?.stack) {
- this.stack = `${this.stack}
- Caused by: ${cause.stack}`;
- }
- }
- }
- class PipelineStateError extends PipelineError {
- }
- class CancellationError extends PipelineError {
- constructor(message = "Operation cancelled") {
- super(message);
- }
- }
- const DEFAULT_MAX_PAGES = 100;
- const DEFAULT_MAX_DEPTH = 3;
- const DEFAULT_CONCURRENCY$1 = 3;
- class BaseScraperStrategy {
- visited = /* @__PURE__ */ new Set();
- pageCount = 0;
- options;
- constructor(options = {}) {
- this.options = options;
- }
- // Removed getProcessor method as processing is now handled by strategies using middleware pipelines
- async processBatch(batch, baseUrl, options, progressCallback, signal) {
- const results = await Promise.all(
- batch.map(async (item) => {
- if (signal?.aborted) {
- throw new CancellationError("Scraping cancelled during batch processing");
- }
- const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH;
- if (item.depth > maxDepth) {
- return [];
- }
- try {
- const result = await this.processItem(item, options, void 0, signal);
- if (result.document) {
- this.pageCount++;
- const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
- logger.info(
- `🌐 Scraping page ${this.pageCount}/${maxPages} (depth ${item.depth}/${maxDepth}): ${item.url}`
- );
- await progressCallback({
- pagesScraped: this.pageCount,
- maxPages,
- currentUrl: item.url,
- depth: item.depth,
- maxDepth,
- document: result.document
- });
- }
- const nextItems = result.links || [];
- return nextItems.map((value) => {
- try {
- const targetUrl = new URL$1(value, baseUrl);
- return {
- url: targetUrl.href,
- depth: item.depth + 1
- };
- } catch (error) {
- logger.warn(`❌ Invalid URL: ${value}`);
- }
- return null;
- }).filter((item2) => item2 !== null);
- } catch (error) {
- if (options.ignoreErrors) {
- logger.error(`❌ Failed to process ${item.url}: ${error}`);
- return [];
- }
- throw error;
- }
- })
- );
- const allLinks = results.flat();
- const uniqueLinks = [];
- for (const item of allLinks) {
- const normalizedUrl = normalizeUrl(item.url, this.options.urlNormalizerOptions);
- if (!this.visited.has(normalizedUrl)) {
- this.visited.add(normalizedUrl);
- uniqueLinks.push(item);
- }
- }
- return uniqueLinks;
- }
- async scrape(options, progressCallback, signal) {
- this.visited.clear();
- this.pageCount = 0;
- const baseUrl = new URL$1(options.url);
- const queue = [{ url: options.url, depth: 0 }];
- this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions));
- const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
- const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY$1;
- while (queue.length > 0 && this.pageCount < maxPages) {
- if (signal?.aborted) {
- logger.info("Scraping cancelled by signal.");
- throw new CancellationError("Scraping cancelled by signal");
- }
- const remainingPages = maxPages - this.pageCount;
- if (remainingPages <= 0) {
- break;
- }
- const batchSize = Math.min(
- maxConcurrency,
- // Use variable
- remainingPages,
- queue.length
- );
- const batch = queue.splice(0, batchSize);
- const newUrls = await this.processBatch(
- batch,
- baseUrl,
- options,
- progressCallback,
- signal
- );
- queue.push(...newUrls);
- }
- }
- }
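
BaseScraperStrategy.scrape is a breadth-first crawl with bounded fan-out. In outline (pseudocode, not an exact transcription):

// queue = [{ url: start, depth: 0 }]; visited = { normalize(start) }
// while queue is non-empty and pageCount < maxPages:
//   batch = queue.splice(0, min(maxConcurrency, maxPages - pageCount, queue.length))
//   results = processBatch(batch)   // Promise.all over fetch + pipeline per item
//   queue.push(...links from results that normalize to an unvisited URL,
//              each at depth = parent depth + 1)
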
- class WebScraperStrategy extends BaseScraperStrategy {
- httpFetcher = new HttpFetcher();
- shouldFollowLinkFn;
- htmlPipeline;
- markdownPipeline;
- pipelines;
- constructor(options = {}) {
- super({ urlNormalizerOptions: options.urlNormalizerOptions });
- this.shouldFollowLinkFn = options.shouldFollowLink;
- this.htmlPipeline = new HtmlPipeline();
- this.markdownPipeline = new MarkdownPipeline();
- this.pipelines = [this.htmlPipeline, this.markdownPipeline];
- }
- canHandle(url) {
- try {
- const parsedUrl = new URL(url);
- return parsedUrl.protocol === "http:" || parsedUrl.protocol === "https:";
- } catch {
- return false;
- }
- }
- /**
- * Determines if a target URL should be followed based on the scope setting.
- */
- isInScope(baseUrl, targetUrl, scope) {
- try {
- if (scope === "domain") {
- return hasSameDomain(baseUrl, targetUrl);
- }
- if (scope === "hostname") {
- return hasSameHostname(baseUrl, targetUrl);
- }
- return hasSameHostname(baseUrl, targetUrl) && isSubpath(baseUrl, targetUrl);
- } catch {
- return false;
- }
- }
- async processItem(item, options, _progressCallback, signal) {
- const { url } = item;
- try {
- const fetchOptions = {
- signal,
- followRedirects: options.followRedirects
- };
- const rawContent = await this.httpFetcher.fetch(url, fetchOptions);
- let processed;
- for (const pipeline of this.pipelines) {
- if (pipeline.canProcess(rawContent)) {
- processed = await pipeline.process(rawContent, options, this.httpFetcher);
- break;
- }
- }
- if (!processed) {
- logger.warn(
- `Unsupported content type "${rawContent.mimeType}" for URL ${url}. Skipping processing.`
- );
- return { document: void 0, links: [] };
- }
- for (const err of processed.errors) {
- logger.warn(`Processing error for ${url}: ${err.message}`);
- }
- if (!processed.textContent || !processed.textContent.trim()) {
- logger.warn(`No processable content found for ${url} after pipeline execution.`);
- return { document: void 0, links: processed.links };
- }
- const baseUrl = new URL(options.url);
- const filteredLinks = processed.links.filter((link) => {
- try {
- const targetUrl = new URL(link);
- const scope = options.scope || "subpages";
- return this.isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
- } catch {
- return false;
- }
- });
- return {
- document: {
- content: processed.textContent,
- metadata: {
- url,
- title: typeof processed.metadata.title === "string" ? processed.metadata.title : "Untitled",
- library: options.library,
- version: options.version,
- ...processed.metadata
- }
- },
- links: filteredLinks
- };
- } catch (error) {
- logger.error(`Failed processing page ${url}: ${error}`);
- throw error;
- }
- }
- /**
- * Overrides the base scrape method to ensure the Playwright browser is closed
- * after the scraping process completes or errors out.
- */
- async scrape(options, progressCallback, signal) {
- try {
- await super.scrape(options, progressCallback, signal);
- } finally {
- await this.htmlPipeline.close();
- await this.markdownPipeline.close();
- }
- }
- }
- class GitHubScraperStrategy {
- defaultStrategy;
- canHandle(url) {
- const { hostname } = new URL(url);
- return ["github.com", "www.github.com"].includes(hostname);
- }
- constructor() {
- const shouldFollowLink = (baseUrl, targetUrl) => {
- if (this.getRepoPath(baseUrl) !== this.getRepoPath(targetUrl)) {
- return false;
- }
- const path2 = targetUrl.pathname;
- if (path2 === this.getRepoPath(targetUrl)) {
- return true;
- }
- if (path2.startsWith(`${this.getRepoPath(targetUrl)}/wiki`)) {
- return true;
- }
- if (path2.startsWith(`${this.getRepoPath(targetUrl)}/blob/`) && path2.endsWith(".md")) {
- return true;
- }
- return false;
- };
- this.defaultStrategy = new WebScraperStrategy({
- urlNormalizerOptions: {
- ignoreCase: true,
- removeHash: true,
- removeTrailingSlash: true,
- removeQuery: true
- // Remove query parameters like ?tab=readme-ov-file
- },
- shouldFollowLink
- });
- }
- getRepoPath(url) {
- const match = url.pathname.match(/^\/[^/]+\/[^/]+/);
- return match?.[0] || "";
- }
- async scrape(options, progressCallback, signal) {
- const url = new URL(options.url);
- if (!url.hostname.includes("github.com")) {
- throw new Error("URL must be a GitHub URL");
- }
- await this.defaultStrategy.scrape(options, progressCallback, signal);
- }
- }
- class LocalFileStrategy extends BaseScraperStrategy {
- fileFetcher = new FileFetcher();
- htmlPipeline;
- markdownPipeline;
- pipelines;
- constructor() {
- super();
- this.htmlPipeline = new HtmlPipeline();
- this.markdownPipeline = new MarkdownPipeline();
- this.pipelines = [this.htmlPipeline, this.markdownPipeline];
- }
- canHandle(url) {
- return url.startsWith("file://");
- }
- async processItem(item, options, _progressCallback, _signal) {
- const filePath = item.url.replace(/^file:\/\//, "");
- const stats = await fs.stat(filePath);
- if (stats.isDirectory()) {
- const contents = await fs.readdir(filePath);
- return {
- links: contents.map((name) => `file://${path.join(filePath, name)}`)
- };
- }
- logger.info(`📄 Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`);
- const rawContent = await this.fileFetcher.fetch(item.url);
- let processed;
- for (const pipeline of this.pipelines) {
- if (pipeline.canProcess(rawContent)) {
- processed = await pipeline.process(rawContent, options, this.fileFetcher);
- break;
- }
- }
- if (!processed) {
- logger.warn(
- `Unsupported content type "${rawContent.mimeType}" for file ${filePath}. Skipping processing.`
- );
- return { document: void 0, links: [] };
- }
- for (const err of processed.errors) {
- logger.warn(`Processing error for ${filePath}: ${err.message}`);
- }
- return {
- document: {
- content: typeof processed.textContent === "string" ? processed.textContent : "",
- metadata: {
- url: rawContent.source,
- title: typeof processed.metadata.title === "string" ? processed.metadata.title : "Untitled",
- library: options.library,
- version: options.version
- }
- }
- };
- }
- async scrape(options, progressCallback, signal) {
- try {
- await super.scrape(options, progressCallback, signal);
- } finally {
- await this.htmlPipeline.close();
- await this.markdownPipeline.close();
- }
- }
- }
- class NpmScraperStrategy {
- defaultStrategy;
- canHandle(url) {
- const { hostname } = new URL(url);
- return ["npmjs.org", "npmjs.com", "www.npmjs.com"].includes(hostname);
- }
- constructor() {
- this.defaultStrategy = new WebScraperStrategy({
- urlNormalizerOptions: {
- ignoreCase: true,
- removeHash: true,
- removeTrailingSlash: true,
- removeQuery: true
- // Enable removeQuery for NPM packages
- }
- });
- }
- async scrape(options, progressCallback, signal) {
- await this.defaultStrategy.scrape(options, progressCallback, signal);
- }
- }
- class PyPiScraperStrategy {
- defaultStrategy;
- canHandle(url) {
- const { hostname } = new URL(url);
- return ["pypi.org", "www.pypi.org"].includes(hostname);
- }
- constructor() {
- this.defaultStrategy = new WebScraperStrategy({
- urlNormalizerOptions: {
- ignoreCase: true,
- removeHash: true,
- removeTrailingSlash: true,
- removeQuery: true
- // Enable removeQuery for PyPI packages
- }
- });
- }
- async scrape(options, progressCallback, signal) {
- await this.defaultStrategy.scrape(options, progressCallback, signal);
- }
- }
- class ScraperRegistry {
- strategies;
- constructor() {
- this.strategies = [
- new NpmScraperStrategy(),
- new PyPiScraperStrategy(),
- new GitHubScraperStrategy(),
- new WebScraperStrategy(),
- new LocalFileStrategy()
- ];
- }
- getStrategy(url) {
- validateUrl(url);
- const strategy = this.strategies.find((s) => s.canHandle(url));
- if (!strategy) {
- throw new ScraperError(`No strategy found for URL: ${url}`);
- }
- return strategy;
- }
- }
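
Because getStrategy returns the first match and the specialized strategies are registered before the generic ones, registration order matters. Hypothetical lookups:

const registry = new ScraperRegistry();
registry.getStrategy("https://www.npmjs.com/package/react"); // NpmScraperStrategy
registry.getStrategy("https://github.com/owner/repo");       // GitHubScraperStrategy
registry.getStrategy("file:///home/user/docs");              // LocalFileStrategy
registry.getStrategy("https://example.com/docs");            // WebScraperStrategy
registry.getStrategy("ftp://example.com"); // throws ScraperError (no strategy found)
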
- class ScraperService {
- registry;
- constructor(registry) {
- this.registry = registry;
- }
- /**
- * Scrapes content from the provided URL using the appropriate strategy.
- * Reports progress via callback and handles errors.
- */
- async scrape(options, progressCallback, signal) {
- const strategy = this.registry.getStrategy(options.url);
- if (!strategy) {
- throw new ScraperError(`No scraper strategy found for URL: ${options.url}`, false);
- }
- await strategy.scrape(options, progressCallback, signal);
- }
- }
- class PipelineWorker {
- // Dependencies are passed in, making the worker stateless regarding specific jobs
- store;
- scraperService;
- // Constructor accepts dependencies needed for execution
- constructor(store, scraperService) {
- this.store = store;
- this.scraperService = scraperService;
- }
- /**
- * Executes the given pipeline job.
- * @param job - The job to execute.
- * @param callbacks - Callbacks provided by the manager for reporting.
- */
- async executeJob(job, callbacks) {
- const { id: jobId, library, version, options, abortController } = job;
- const signal = abortController.signal;
- logger.debug(`[${jobId}] Worker starting job for ${library}@${version}`);
- try {
- await this.scraperService.scrape(
- options,
- async (progress) => {
- if (signal.aborted) {
- throw new CancellationError("Job cancelled during scraping progress");
- }
- job.progress = progress;
- await callbacks.onJobProgress?.(job, progress);
- if (progress.document) {
- try {
- await this.store.addDocument(library, version, {
- pageContent: progress.document.content,
- metadata: progress.document.metadata
- });
- logger.debug(
- `[${jobId}] Stored document: ${progress.document.metadata.url}`
- );
- } catch (docError) {
- logger.error(
- `[${jobId}] Failed to store document ${progress.document.metadata.url}: ${docError}`
- );
- await callbacks.onJobError?.(
- job,
- docError instanceof Error ? docError : new Error(String(docError)),
- progress.document
- );
- }
- }
- },
- signal
- // Pass signal to scraper service
- );
- if (signal.aborted) {
- throw new CancellationError("Job cancelled shortly after scraping finished");
- }
- logger.debug(`[${jobId}] Worker finished job successfully.`);
- } catch (error) {
- logger.warn(`[${jobId}] Worker encountered error: ${error}`);
- throw error;
- }
- }
- // --- Old methods removed ---
- // process()
- // stop()
- // setCallbacks()
- // handleScrapingProgress()
- }
- var PipelineJobStatus = /* @__PURE__ */ ((PipelineJobStatus2) => {
- PipelineJobStatus2["QUEUED"] = "queued";
- PipelineJobStatus2["RUNNING"] = "running";
- PipelineJobStatus2["COMPLETED"] = "completed";
- PipelineJobStatus2["FAILED"] = "failed";
- PipelineJobStatus2["CANCELLING"] = "cancelling";
- PipelineJobStatus2["CANCELLED"] = "cancelled";
- return PipelineJobStatus2;
- })(PipelineJobStatus || {});
- const DEFAULT_CONCURRENCY = 3;
- class PipelineManager {
- jobMap = /* @__PURE__ */ new Map();
- jobQueue = [];
- activeWorkers = /* @__PURE__ */ new Set();
- isRunning = false;
- concurrency;
- callbacks = {};
- store;
- scraperService;
- constructor(store, concurrency = DEFAULT_CONCURRENCY) {
- this.store = store;
- this.concurrency = concurrency;
- const registry = new ScraperRegistry();
- this.scraperService = new ScraperService(registry);
- }
- /**
- * Registers callback handlers for pipeline manager events.
- */
- setCallbacks(callbacks) {
- this.callbacks = callbacks;
- }
- /**
- * Starts the pipeline manager's worker processing.
- */
- async start() {
- if (this.isRunning) {
- logger.warn("PipelineManager is already running.");
- return;
- }
- this.isRunning = true;
- logger.debug(`PipelineManager started with concurrency ${this.concurrency}.`);
- this._processQueue();
- }
- /**
- * Stops the pipeline manager and attempts to gracefully shut down workers.
- * Currently, it just stops processing new jobs. Cancellation of active jobs
- * needs explicit `cancelJob` calls.
- */
- async stop() {
- if (!this.isRunning) {
- logger.warn("PipelineManager is not running.");
- return;
- }
- this.isRunning = false;
- logger.debug("PipelineManager stopping. No new jobs will be started.");
- }
- /**
- * Enqueues a new document processing job.
- */
- async enqueueJob(library, version, options) {
- const jobId = v4();
- const abortController = new AbortController();
- let resolveCompletion;
- let rejectCompletion;
- const completionPromise = new Promise((resolve, reject) => {
- resolveCompletion = resolve;
- rejectCompletion = reject;
- });
- const job = {
- id: jobId,
- library,
- version,
- options,
- status: PipelineJobStatus.QUEUED,
- progress: null,
- error: null,
- createdAt: /* @__PURE__ */ new Date(),
- startedAt: null,
- finishedAt: null,
- abortController,
- completionPromise,
- resolveCompletion,
- rejectCompletion
- };
- this.jobMap.set(jobId, job);
- this.jobQueue.push(jobId);
- logger.info(`📝 Job enqueued: ${jobId} for ${library}@${version}`);
- await this.callbacks.onJobStatusChange?.(job);
- if (this.isRunning) {
- this._processQueue();
- }
- return jobId;
- }
- /**
- * Retrieves the current state of a specific job.
- */
- async getJob(jobId) {
- return this.jobMap.get(jobId);
- }
- /**
- * Retrieves the current state of all jobs (or a subset based on status).
- */
- async getJobs(status) {
- const allJobs = Array.from(this.jobMap.values());
- if (status) {
- return allJobs.filter((job) => job.status === status);
- }
- return allJobs;
- }
- /**
- * Returns a promise that resolves when the specified job completes, fails, or is cancelled.
- */
- async waitForJobCompletion(jobId) {
- const job = this.jobMap.get(jobId);
- if (!job) {
- throw new PipelineStateError(`Job not found: ${jobId}`);
- }
- await job.completionPromise;
- }
- /**
- * Attempts to cancel a queued or running job.
- */
- async cancelJob(jobId) {
- const job = this.jobMap.get(jobId);
- if (!job) {
- logger.warn(`Attempted to cancel non-existent job: ${jobId}`);
- return;
- }
- switch (job.status) {
- case PipelineJobStatus.QUEUED:
- this.jobQueue = this.jobQueue.filter((id) => id !== jobId);
- job.status = PipelineJobStatus.CANCELLED;
- job.finishedAt = /* @__PURE__ */ new Date();
- logger.info(`🚫 Job cancelled (was queued): ${jobId}`);
- await this.callbacks.onJobStatusChange?.(job);
- job.rejectCompletion(new PipelineStateError("Job cancelled before starting"));
- break;
- case PipelineJobStatus.RUNNING:
- job.status = PipelineJobStatus.CANCELLING;
- job.abortController.abort();
- logger.info(`🚫 Signalling cancellation for running job: ${jobId}`);
- await this.callbacks.onJobStatusChange?.(job);
- break;
- case PipelineJobStatus.COMPLETED:
- case PipelineJobStatus.FAILED:
- case PipelineJobStatus.CANCELLED:
- case PipelineJobStatus.CANCELLING:
- logger.warn(
- `Job ${jobId} cannot be cancelled in its current state: ${job.status}`
- );
- break;
- default:
- logger.error(`Unhandled job status for cancellation: ${job.status}`);
- break;
- }
- }
- // --- Private Methods ---
- /**
- * Processes the job queue, starting new workers if capacity allows.
- */
- _processQueue() {
- if (!this.isRunning) return;
- while (this.activeWorkers.size < this.concurrency && this.jobQueue.length > 0) {
- const jobId = this.jobQueue.shift();
- if (!jobId) continue;
- const job = this.jobMap.get(jobId);
- if (!job || job.status !== PipelineJobStatus.QUEUED) {
- logger.warn(`Skipping job ${jobId} in queue (not found or not queued).`);
- continue;
- }
- this.activeWorkers.add(jobId);
- job.status = PipelineJobStatus.RUNNING;
- job.startedAt = /* @__PURE__ */ new Date();
- this.callbacks.onJobStatusChange?.(job);
- this._runJob(job).catch((error) => {
- logger.error(`Unhandled error during job ${jobId} execution: ${error}`);
- if (job.status !== PipelineJobStatus.FAILED && job.status !== PipelineJobStatus.CANCELLED) {
- job.status = PipelineJobStatus.FAILED;
- job.error = error instanceof Error ? error : new Error(String(error));
- job.finishedAt = /* @__PURE__ */ new Date();
- this.callbacks.onJobStatusChange?.(job);
- job.rejectCompletion(job.error);
- }
- this.activeWorkers.delete(jobId);
- this._processQueue();
- });
- }
- }
- /**
- * Executes a single pipeline job by delegating to a PipelineWorker.
- * Handles final status updates and promise resolution/rejection.
- */
- async _runJob(job) {
- const { id: jobId, abortController } = job;
- const signal = abortController.signal;
- const worker = new PipelineWorker(this.store, this.scraperService);
- try {
- await worker.executeJob(job, this.callbacks);
- if (signal.aborted) {
- throw new CancellationError("Job cancelled just before completion");
- }
- job.status = PipelineJobStatus.COMPLETED;
- job.finishedAt = /* @__PURE__ */ new Date();
- await this.callbacks.onJobStatusChange?.(job);
- job.resolveCompletion();
- } catch (error) {
- if (error instanceof CancellationError || signal.aborted) {
- job.status = PipelineJobStatus.CANCELLED;
- job.finishedAt = /* @__PURE__ */ new Date();
- job.error = error instanceof CancellationError ? error : new CancellationError("Job cancelled by signal");
- logger.info(`🚫 Job execution cancelled: ${jobId}: ${job.error.message}`);
- await this.callbacks.onJobStatusChange?.(job);
- job.rejectCompletion(job.error);
- } else {
- job.status = PipelineJobStatus.FAILED;
- job.error = error instanceof Error ? error : new Error(String(error));
- job.finishedAt = /* @__PURE__ */ new Date();
- logger.error(`❌ Job failed: ${jobId}: ${job.error}`);
- await this.callbacks.onJobStatusChange?.(job);
- job.rejectCompletion(job.error);
- }
- } finally {
- this.activeWorkers.delete(jobId);
- this._processQueue();
- }
- }
- }
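
PipelineManager keeps at most `concurrency` jobs running and settles each job's completionPromise when it finishes. A hedged end-to-end sketch, assuming a store object implementing addDocument(library, version, doc):

const manager = new PipelineManager(store, 2); // at most two concurrent jobs
await manager.start();
const jobId = await manager.enqueueJob("react", "18.2.0", {
  url: "https://react.dev/learn",
  library: "react",
  version: "18.2.0",
});
await manager.waitForJobCompletion(jobId); // resolves on COMPLETED, rejects on FAILED/CANCELLED
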
1672
- const fullTrim = (str) => {
1673
- return str.replace(/^[\s\r\n\t]+|[\s\r\n\t]+$/g, "");
1674
- };
1675
- class SplitterError extends Error {
1676
- }
1677
- class MinimumChunkSizeError extends SplitterError {
1678
- constructor(size, maxSize) {
1679
- super(
1680
- `Cannot split content any further. Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`
1681
- );
1682
- }
1683
- }
1684
- class ContentSplitterError extends SplitterError {
1685
- }
1686
- class CodeContentSplitter {
1687
- constructor(options) {
1688
- this.options = options;
1689
- }
1690
- async split(content) {
1691
- const language = content.match(/^```(\w+)\n/)?.[1];
1692
- const strippedContent = content.replace(/^```(\w*)\n/, "").replace(/```\s*$/, "");
1693
- const lines = strippedContent.split("\n");
1694
- const chunks = [];
1695
- let currentChunkLines = [];
1696
- for (const line of lines) {
1697
- const singleLineSize = this.wrap(line, language).length;
1698
- if (singleLineSize > this.options.chunkSize) {
1699
- throw new MinimumChunkSizeError(singleLineSize, this.options.chunkSize);
1700
- }
1701
- currentChunkLines.push(line);
1702
- const newChunkContent = this.wrap(currentChunkLines.join("\n"), language);
1703
- const newChunkSize = newChunkContent.length;
1704
- if (newChunkSize > this.options.chunkSize && currentChunkLines.length > 1) {
1705
- const lastLine = currentChunkLines.pop();
1706
- chunks.push(this.wrap(currentChunkLines.join("\n"), language));
1707
- currentChunkLines = [lastLine];
1708
- }
1709
- }
1710
- if (currentChunkLines.length > 0) {
1711
- chunks.push(this.wrap(currentChunkLines.join("\n"), language));
1712
- }
1713
- return chunks;
1714
- }
1715
- wrap(content, language) {
1716
- return `\`\`\`${language || ""}
1717
- ${content.replace(/\n+$/, "")}
1718
- \`\`\``;
1719
- }
1720
- }
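CodeContentSplitter strips the fences, splits line by line, and re-wraps every chunk, so the language tag survives the split. A hypothetical usage (values invented):

const codeSplitter = new CodeContentSplitter({ chunkSize: 80 });
const codeChunks = await codeSplitter.split("```ts\nconst a = 1;\nconst b = 2;\n```");
// Each returned chunk is itself a complete ```ts ... ``` block; a single
// line that exceeds chunkSize after wrapping raises MinimumChunkSizeError.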
1721
- class TableContentSplitter {
1722
- constructor(options) {
1723
- this.options = options;
1724
- }
1725
- /**
1726
- * Splits table content into chunks while preserving table structure
1727
- */
1728
- async split(content) {
1729
- const parsedTable = this.parseTable(content);
1730
- if (!parsedTable) {
1731
- return [content];
1732
- }
1733
- const { headers, rows } = parsedTable;
1734
- const chunks = [];
1735
- let currentRows = [];
1736
- for (const row of rows) {
1737
- const singleRowSize = this.wrap(row, headers).length;
1738
- if (singleRowSize > this.options.chunkSize) {
1739
- throw new MinimumChunkSizeError(singleRowSize, this.options.chunkSize);
1740
- }
1741
- const newChunkContent = this.wrap([...currentRows, row].join("\n"), headers);
1742
- const newChunkSize = newChunkContent.length;
1743
- if (newChunkSize > this.options.chunkSize && currentRows.length > 0) {
1744
- chunks.push(this.wrap(currentRows.join("\n"), headers));
1745
- currentRows = [row];
1746
- } else {
1747
- currentRows.push(row);
1748
- }
1749
- }
1750
- if (currentRows.length > 0) {
1751
- chunks.push(this.wrap(currentRows.join("\n"), headers));
1752
- }
1753
- return chunks;
1754
- }
1755
- wrap(content, headers) {
1756
- const headerRow = `| ${headers.join(" | ")} |`;
1757
- const separatorRow = `|${headers.map(() => "---").join("|")}|`;
1758
- return [headerRow, separatorRow, content].join("\n");
1759
- }
1760
- parseTable(content) {
1761
- const lines = content.trim().split("\n");
1762
- if (lines.length < 3) return null;
1763
- const headers = this.parseRow(lines[0]);
1764
- if (!headers) return null;
1765
- const separator = lines[1];
1766
- if (!this.isValidSeparator(separator)) return null;
1767
- const rows = lines.slice(2).filter((row) => row.trim() !== "");
1768
- return { headers, separator, rows };
1769
- }
1770
- /**
1771
- * Parses a table row into cells
1772
- */
1773
- parseRow(row) {
1774
- if (!row.includes("|")) return null;
1775
- return row.split("|").map((cell) => cell.trim()).filter((cell) => cell !== "");
1776
- }
1777
- /**
1778
- * Validates the separator row of the table
1779
- */
1780
- isValidSeparator(separator) {
1781
- return separator.includes("|") && /^\|?[\s-|]+\|?$/.test(separator);
1782
- }
1783
- }
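Because wrap() re-emits the header and separator rows, every chunk TableContentSplitter returns is a standalone, renderable markdown table. A hypothetical example (chunkSize chosen to force a split):

const tableSplitter = new TableContentSplitter({ chunkSize: 60 });
const rowsSplit = await tableSplitter.split(
  "| Name | Role |\n|---|---|\n| Ada | Engineer |\n| Grace | Admiral |"
);
// => two chunks, each starting with "| Name | Role |" and "|---|---|"
//    before its data row.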
1784
- class TextContentSplitter {
1785
- constructor(options) {
1786
- this.options = options;
1787
- }
1788
- /**
1789
- * Splits text content into chunks while trying to preserve semantic boundaries.
1790
- * Prefers paragraph breaks, then line breaks, finally falling back to word boundaries.
1791
- */
1792
- async split(content) {
1793
- const trimmedContent = fullTrim(content);
1794
- if (trimmedContent.length <= this.options.chunkSize) {
1795
- return [trimmedContent];
1796
- }
1797
- const words = trimmedContent.split(/\s+/);
1798
- const longestWord = words.reduce(
1799
- (max, word) => word.length > max.length ? word : max
1800
- );
1801
- if (longestWord.length > this.options.chunkSize) {
1802
- throw new MinimumChunkSizeError(longestWord.length, this.options.chunkSize);
1803
- }
1804
- const paragraphChunks = this.splitByParagraphs(trimmedContent);
1805
- if (this.areChunksValid(paragraphChunks)) {
1806
- return paragraphChunks;
1807
- }
1808
- const lineChunks = this.splitByLines(trimmedContent);
1809
- if (this.areChunksValid(lineChunks)) {
1810
- return this.mergeChunks(lineChunks, "\n");
1811
- }
1812
- const wordChunks = await this.splitByWords(trimmedContent);
1813
- return this.mergeChunks(wordChunks, " ");
1814
- }
1815
- /**
1816
- * Checks if all chunks are within the maximum size limit
1817
- */
1818
- areChunksValid(chunks) {
1819
- return chunks.every((chunk) => chunk.length <= this.options.chunkSize);
1820
- }
1821
- /**
1822
- * Splits text into chunks by paragraph boundaries (double newlines)
1823
- */
1824
- splitByParagraphs(text) {
1825
- const paragraphs = text.split(/\n\s*\n/).map((p) => fullTrim(p)).filter(Boolean);
1826
- return paragraphs.filter((chunk) => chunk.length > 2);
1827
- }
1828
- /**
1829
- * Splits text into chunks by line boundaries
1830
- */
1831
- splitByLines(text) {
1832
- const lines = text.split(/\n/).map((line) => fullTrim(line)).filter(Boolean);
1833
- return lines.filter((chunk) => chunk.length > 1);
1834
- }
1835
- /**
1836
- * Uses LangChain's recursive splitter for word-based splitting as a last resort
1837
- */
1838
- async splitByWords(text) {
1839
- const splitter = new RecursiveCharacterTextSplitter({
1840
- chunkSize: this.options.chunkSize,
1841
- chunkOverlap: 0
1842
- });
1843
- const chunks = await splitter.splitText(text);
1844
- return chunks;
1845
- }
1846
- /**
1847
- * Attempts to merge small chunks with previous chunks to minimize fragmentation.
1848
- * Only merges if combined size is within maxChunkSize.
1849
- */
1850
- mergeChunks(chunks, separator) {
1851
- const mergedChunks = [];
1852
- let currentChunk = null;
1853
- for (const chunk of chunks) {
1854
- if (currentChunk === null) {
1855
- currentChunk = chunk;
1856
- continue;
1857
- }
1858
- const currentChunkSize = this.getChunkSize(currentChunk);
1859
- const nextChunkSize = this.getChunkSize(chunk);
1860
- if (currentChunkSize + nextChunkSize + separator.length <= this.options.chunkSize) {
1861
- currentChunk = `${currentChunk}${separator}${chunk}`;
1862
- } else {
1863
- mergedChunks.push(currentChunk);
1864
- currentChunk = chunk;
1865
- }
1866
- }
1867
- if (currentChunk) {
1868
- mergedChunks.push(currentChunk);
1869
- }
1870
- return mergedChunks;
1871
- }
1872
- getChunkSize(chunk) {
1873
- return chunk.length;
1874
- }
1875
- wrap(content) {
1876
- return content;
1877
- }
1878
- }
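The split() cascade above degrades gracefully: paragraphs first, then lines, then LangChain's recursive word-level splitter, with small fragments merged back together afterwards. A hypothetical usage:

const textSplitter = new TextContentSplitter({ chunkSize: 100 });
const textChunks = await textSplitter.split(articleText); // articleText assumed
// Returns paragraph-sized chunks when every paragraph fits in 100
// characters; otherwise falls back to line, then word boundaries.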
1879
- class SemanticMarkdownSplitter {
1880
- constructor(preferredChunkSize, maxChunkSize) {
1881
- this.preferredChunkSize = preferredChunkSize;
1882
- this.maxChunkSize = maxChunkSize;
1883
- this.turndownService = new TurndownService({
1884
- headingStyle: "atx",
1885
- hr: "---",
1886
- bulletListMarker: "-",
1887
- codeBlockStyle: "fenced",
1888
- emDelimiter: "_",
1889
- strongDelimiter: "**",
1890
- linkStyle: "inlined"
1891
- });
1892
- this.turndownService.addRule("table", {
1893
- filter: ["table"],
1894
- replacement: (content, node) => {
1895
- const table = node;
1896
- const headers = Array.from(table.querySelectorAll("th")).map(
1897
- (th) => th.textContent?.trim() || ""
1898
- );
1899
- const rows = Array.from(table.querySelectorAll("tr")).filter(
1900
- (tr) => !tr.querySelector("th")
1901
- );
1902
- if (headers.length === 0 && rows.length === 0) return "";
1903
- let markdown = "\n";
1904
- if (headers.length > 0) {
1905
- markdown += `| ${headers.join(" | ")} |
1906
- `;
1907
- markdown += `|${headers.map(() => "---").join("|")}|
1908
- `;
1909
- }
1910
- for (const row of rows) {
1911
- const cells = Array.from(row.querySelectorAll("td")).map(
1912
- (td) => td.textContent?.trim() || ""
1913
- );
1914
- markdown += `| ${cells.join(" | ")} |
1915
- `;
1916
- }
1917
- return markdown;
1918
- }
1919
- });
1920
- this.textSplitter = new TextContentSplitter({
1921
- chunkSize: this.preferredChunkSize
1922
- });
1923
- this.codeSplitter = new CodeContentSplitter({
1924
- chunkSize: this.maxChunkSize
1925
- });
1926
- this.tableSplitter = new TableContentSplitter({
1927
- chunkSize: this.maxChunkSize
1928
- });
1929
- }
1930
- turndownService;
1931
- textSplitter;
1932
- codeSplitter;
1933
- tableSplitter;
1934
- /**
1935
- * Main entry point for splitting markdown content
1936
- */
1937
- async splitText(markdown) {
1938
- const html = await this.markdownToHtml(markdown);
1939
- const dom = await this.parseHtml(html);
1940
- const sections = await this.splitIntoSections(dom);
1941
- return this.splitSectionContent(sections);
1942
- }
1943
- /**
1944
- * Step 1: Split document into sections based on H1-H6 headings,
1945
- * as well as code blocks and tables.
1946
- */
1947
- async splitIntoSections(dom) {
1948
- const body = dom.querySelector("body");
1949
- if (!body) {
1950
- throw new Error("Invalid HTML structure: no body element found");
1951
- }
1952
- let currentSection = this.createRootSection();
1953
- const sections = [];
1954
- const stack = [currentSection];
1955
- for (const element of Array.from(body.children)) {
1956
- const headingMatch = element.tagName.match(/H([1-6])/);
1957
- if (headingMatch) {
1958
- const level = Number.parseInt(headingMatch[1], 10);
1959
- const title = fullTrim(element.textContent || "");
1960
- while (stack.length > 1 && stack[stack.length - 1].level >= level) {
1961
- stack.pop();
1962
- }
1963
- currentSection = {
1964
- level,
1965
- path: [
1966
- ...stack.slice(1).reduce((acc, s) => {
1967
- const lastPath = s.path[s.path.length - 1];
1968
- if (lastPath) acc.push(lastPath);
1969
- return acc;
1970
- }, []),
1971
- title
1972
- ],
1973
- content: [
1974
- {
1975
- type: "heading",
1976
- text: `${"#".repeat(level)} ${title}`
1977
- }
1978
- ]
1979
- };
1980
- sections.push(currentSection);
1981
- stack.push(currentSection);
1982
- } else if (element.tagName === "PRE") {
1983
- const code = element.querySelector("code");
1984
- const language = code?.className.replace("language-", "") || "";
1985
- const content = code?.textContent || element.textContent || "";
1986
- const markdown = `${"```"}${language}
1987
- ${content}
1988
- ${"```"}`;
1989
- currentSection = {
1990
- level: currentSection.level,
1991
- path: currentSection.path,
1992
- content: [
1993
- {
1994
- type: "code",
1995
- text: markdown
1996
- }
1997
- ]
1998
- };
1999
- sections.push(currentSection);
2000
- } else if (element.tagName === "TABLE") {
2001
- const markdown = fullTrim(this.turndownService.turndown(element.outerHTML));
2002
- currentSection = {
2003
- level: currentSection.level,
2004
- path: currentSection.path,
2005
- content: [
2006
- {
2007
- type: "table",
2008
- text: markdown
2009
- }
2010
- ]
2011
- };
2012
- sections.push(currentSection);
2013
- } else {
2014
- const markdown = fullTrim(this.turndownService.turndown(element.innerHTML));
2015
- if (markdown) {
2016
- currentSection = {
2017
- level: currentSection.level,
2018
- path: currentSection.path,
2019
- content: [
2020
- {
2021
- type: "text",
2022
- text: markdown
2023
- }
2024
- ]
2025
- };
2026
- sections.push(currentSection);
2027
- }
2028
- }
2029
- }
2030
- return sections;
2031
- }
2032
- /**
2033
- * Step 2: Split section content into smaller chunks
2034
- */
2035
- async splitSectionContent(sections) {
2036
- const chunks = [];
2037
- for (const section of sections) {
2038
- for (const content of section.content) {
2039
- let splitContent = [];
2040
- try {
2041
- switch (content.type) {
2042
- case "heading":
2043
- case "text": {
2044
- splitContent = await this.textSplitter.split(content.text);
2045
- break;
2046
- }
2047
- case "code": {
2048
- splitContent = await this.codeSplitter.split(content.text);
2049
- break;
2050
- }
2051
- case "table": {
2052
- splitContent = await this.tableSplitter.split(content.text);
2053
- break;
2054
- }
2055
- }
2056
- } catch (err) {
2057
- if (err instanceof MinimumChunkSizeError) {
2058
- logger.warn(
2059
- `⚠ Cannot split ${content.type} chunk normally, using RecursiveCharacterTextSplitter: ${err.message}`
2060
- );
2061
- const splitter = new RecursiveCharacterTextSplitter({
2062
- chunkSize: this.maxChunkSize,
2063
- chunkOverlap: Math.min(20, Math.floor(this.maxChunkSize * 0.1)),
2064
- // Use more aggressive separators including empty string as last resort
2065
- separators: [
2066
- "\n\n",
2067
- "\n",
2068
- " ",
2069
- " ",
2070
- ".",
2071
- ",",
2072
- ";",
2073
- ":",
2074
- "-",
2075
- "(",
2076
- ")",
2077
- "[",
2078
- "]",
2079
- "{",
2080
- "}",
2081
- ""
2082
- ]
2083
- });
2084
- const chunks2 = await splitter.splitText(content.text);
2085
- if (chunks2.length === 0) {
2086
- splitContent = [content.text.substring(0, this.maxChunkSize)];
2087
- } else {
2088
- splitContent = chunks2;
2089
- }
2090
- } else {
2091
- const errMessage = err instanceof Error ? err.message : String(err);
2092
- throw new ContentSplitterError(
2093
- `Failed to split ${content.type} content: ${errMessage}`
2094
- );
2095
- }
2096
- }
2097
- chunks.push(
2098
- ...splitContent.map(
2099
- (text) => ({
2100
- types: [content.type],
2101
- content: text,
2102
- section: {
2103
- level: section.level,
2104
- path: section.path
2105
- }
2106
- })
2107
- )
2108
- );
2109
- }
2110
- }
2111
- return chunks;
2112
- }
2113
- /**
2114
- * Helper to create the root section
2115
- */
2116
- createRootSection() {
2117
- return {
2118
- level: 0,
2119
- path: [],
2120
- content: []
2121
- };
2122
- }
2123
- /**
2124
- * Convert markdown to HTML using remark
2125
- */
2126
- async markdownToHtml(markdown) {
2127
- const html = await unified().use(remarkParse).use(remarkGfm).use(remarkHtml).process(markdown);
2128
- return `<!DOCTYPE html>
2129
- <html>
2130
- <body>
2131
- ${String(html)}
2132
- </body>
2133
- </html>`;
2134
- }
2135
- /**
2136
- * Parse HTML
2137
- */
2138
- async parseHtml(html) {
2139
- const { window } = createJSDOM(html);
2140
- return window.document;
2141
- }
2142
- }
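End to end, SemanticMarkdownSplitter renders markdown to HTML via remark, parses it with JSDOM, sections it by headings, and routes each section's content to the matching splitter. A hypothetical usage (the size arguments are illustrative; the real constants are defined elsewhere in the bundle):

const mdSplitter = new SemanticMarkdownSplitter(1500, 4000);
const mdChunks = await mdSplitter.splitText("# API\n\nIntro text.\n\n## Usage\n\nDetails.");
// Each chunk carries { types, content, section: { level, path } };
// e.g. the "Details." chunk has section.path ["API", "Usage"].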
2143
- class GreedySplitter {
2144
- baseSplitter;
2145
- minChunkSize;
2146
- preferredChunkSize;
2147
- /**
2148
- * Combines a base document splitter with size constraints to produce optimally-sized chunks.
2149
- * The base splitter handles the initial semantic splitting, while this class handles
2150
- * the concatenation strategy.
2151
- */
2152
- constructor(baseSplitter, minChunkSize, preferredChunkSize) {
2153
- this.baseSplitter = baseSplitter;
2154
- this.minChunkSize = minChunkSize;
2155
- this.preferredChunkSize = preferredChunkSize;
2156
- }
2157
- /**
2158
- * Uses a greedy concatenation strategy to build optimally-sized chunks. Small chunks
2159
- * are combined until they reach the minimum size, but splits are preserved at major
2160
- * section boundaries to maintain document structure. This balances the need for
2161
- * context with semantic coherence.
2162
- */
2163
- async splitText(markdown) {
2164
- const initialChunks = await this.baseSplitter.splitText(markdown);
2165
- const concatenatedChunks = [];
2166
- let currentChunk = null;
2167
- for (const nextChunk of initialChunks) {
2168
- if (currentChunk) {
2169
- if (this.wouldExceedMaxSize(currentChunk, nextChunk)) {
2170
- concatenatedChunks.push(currentChunk);
2171
- currentChunk = this.cloneChunk(nextChunk);
2172
- continue;
2173
- }
2174
- if (currentChunk.content.length >= this.minChunkSize && this.startsNewMajorSection(nextChunk)) {
2175
- concatenatedChunks.push(currentChunk);
2176
- currentChunk = this.cloneChunk(nextChunk);
2177
- continue;
2178
- }
2179
- currentChunk.content += `
2180
- ${nextChunk.content}`;
2181
- currentChunk.section = this.mergeSectionInfo(currentChunk, nextChunk);
2182
- currentChunk.types = this.mergeTypes(currentChunk.types, nextChunk.types);
2183
- } else {
2184
- currentChunk = this.cloneChunk(nextChunk);
2185
- }
2186
- }
2187
- if (currentChunk) {
2188
- concatenatedChunks.push(currentChunk);
2189
- }
2190
- return concatenatedChunks;
2191
- }
2192
- cloneChunk(chunk) {
2193
- return {
2194
- types: [...chunk.types],
2195
- content: chunk.content,
2196
- section: {
2197
- level: chunk.section.level,
2198
- path: [...chunk.section.path]
2199
- }
2200
- };
2201
- }
2202
- /**
2203
- * H1 and H2 headings represent major conceptual breaks in the document.
2204
- * Preserving these splits helps maintain the document's logical structure.
2205
- */
2206
- startsNewMajorSection(chunk) {
2207
- return chunk.section.level === 1 || chunk.section.level === 2;
2208
- }
2209
- /**
2210
- * Size limit check to ensure chunks remain within embedding model constraints.
2211
- * Essential for maintaining consistent embedding quality and avoiding truncation.
2212
- */
2213
- wouldExceedMaxSize(currentChunk, nextChunk) {
2214
- if (!currentChunk) {
2215
- return false;
2216
- }
2217
- return currentChunk.content.length + nextChunk.content.length > this.preferredChunkSize;
2218
- }
2219
- /**
2220
- * Checks if one path is a prefix of another path, indicating a parent-child relationship
2221
- */
2222
- isPathIncluded(parentPath, childPath) {
2223
- if (parentPath.length >= childPath.length) return false;
2224
- return parentPath.every((part, i) => part === childPath[i]);
2225
- }
2226
- /**
2227
- * Merges section metadata when concatenating chunks, following these rules:
2228
- * 1. Level: Always uses the lowest (most general) level between chunks
2229
- * 2. Path selection:
2230
- * - For parent-child relationships (one path includes the other), uses the child's path
2231
- * - For siblings/unrelated sections, uses the common parent path
2232
- * - If no common path exists, uses the root path ([])
2233
- */
2234
- mergeSectionInfo(currentChunk, nextChunk) {
2235
- const level = Math.min(currentChunk.section.level, nextChunk.section.level);
2236
- if (currentChunk.section.level === nextChunk.section.level && currentChunk.section.path.length === nextChunk.section.path.length && currentChunk.section.path.every((p, i) => p === nextChunk.section.path[i])) {
2237
- return currentChunk.section;
2238
- }
2239
- if (this.isPathIncluded(currentChunk.section.path, nextChunk.section.path)) {
2240
- return {
2241
- path: nextChunk.section.path,
2242
- level
2243
- };
2244
- }
2245
- if (this.isPathIncluded(nextChunk.section.path, currentChunk.section.path)) {
2246
- return {
2247
- path: currentChunk.section.path,
2248
- level
2249
- };
2250
- }
2251
- const commonPath = this.findCommonPrefix(
2252
- currentChunk.section.path,
2253
- nextChunk.section.path
2254
- );
2255
- return {
2256
- path: commonPath,
2257
- level
2258
- };
2259
- }
2260
- mergeTypes(currentTypes, nextTypes) {
2261
- return [.../* @__PURE__ */ new Set([...currentTypes, ...nextTypes])];
2262
- }
2263
- /**
2264
- * Returns longest common prefix between two paths
2265
- */
2266
- findCommonPrefix(path1, path2) {
2267
- const common = [];
2268
- for (let i = 0; i < Math.min(path1.length, path2.length); i++) {
2269
- if (path1[i] === path2[i]) {
2270
- common.push(path1[i]);
2271
- } else {
2272
- break;
2273
- }
2274
- }
2275
- return common;
2276
- }
2277
- }
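Two consequences of the merge rules above, shown with assumed sizes:

const greedy = new GreedySplitter(mdSplitter, 500, 1500); // min 500, preferred 1500 (assumed)
// - Merging a chunk at path ["API"] with one at ["API", "Usage"] keeps
//   the child path ["API", "Usage"] (parent-child rule).
// - Merging ["API", "Usage"] with ["API", "Types"] keeps only the
//   common prefix ["API"] (sibling rule).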
2278
- class ToolError extends Error {
2279
- constructor(message, toolName) {
2280
- super(message);
2281
- this.toolName = toolName;
2282
- this.name = this.constructor.name;
2283
- }
2284
- }
2285
- class VersionNotFoundError extends ToolError {
2286
- constructor(library, requestedVersion, availableVersions) {
2287
- super(
2288
- `Version ${requestedVersion} not found for ${library}. Available versions: ${availableVersions.map((v) => v.version).join(", ")}`,
2289
- "SearchTool"
2290
- );
2291
- this.library = library;
2292
- this.requestedVersion = requestedVersion;
2293
- this.availableVersions = availableVersions;
2294
- }
2295
- getLatestVersion() {
2296
- return this.availableVersions.sort((a, b) => semver__default.compare(b.version, a.version))[0];
2297
- }
2298
- }
2299
- class LibraryNotFoundError extends ToolError {
2300
- constructor(requestedLibrary, suggestions = []) {
2301
- let message = `Library '${requestedLibrary}' not found.`;
2302
- if (suggestions.length > 0) {
2303
- message += ` Did you mean one of these: ${suggestions.join(", ")}?`;
2304
- }
2305
- super(message, "SearchTool");
2306
- this.requestedLibrary = requestedLibrary;
2307
- this.suggestions = suggestions;
2308
- }
2309
- }
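A hypothetical caller can use these error types to degrade gracefully (searchTool here is assumed):

try {
  await searchTool.execute({ library: "react", version: "99.0.0", query: "hooks" });
} catch (err) {
  if (err instanceof VersionNotFoundError) {
    const latest = err.getLatestVersion(); // highest semver in availableVersions
    console.warn(`Falling back to ${latest?.version}`);
  } else if (err instanceof LibraryNotFoundError) {
    console.warn(`Try one of: ${err.suggestions.join(", ")}`);
  }
}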
2310
- class ListLibrariesTool {
2311
- docService;
2312
- constructor(docService) {
2313
- this.docService = docService;
2314
- }
2315
- async execute(options) {
2316
- const rawLibraries = await this.docService.listLibraries();
2317
- const libraries = rawLibraries.map(({ library, versions }) => ({
2318
- name: library,
2319
- versions
2320
- // Directly assign the detailed versions array
2321
- }));
2322
- return { libraries };
2323
- }
2324
- }
2325
- class ScrapeTool {
2326
- docService;
2327
- manager;
2328
- // Add manager property
2329
- constructor(docService, manager) {
2330
- this.docService = docService;
2331
- this.manager = manager;
2332
- }
2333
- async execute(options) {
2334
- const {
2335
- library,
2336
- version,
2337
- url,
2338
- options: scraperOptions,
2339
- waitForCompletion = true
2340
- } = options;
2341
- let internalVersion;
2342
- const partialVersionRegex = /^\d+(\.\d+)?$/;
2343
- if (version === null || version === void 0) {
2344
- internalVersion = "";
2345
- } else {
2346
- const validFullVersion = semver.valid(version);
2347
- if (validFullVersion) {
2348
- internalVersion = validFullVersion;
2349
- } else if (partialVersionRegex.test(version)) {
2350
- const coercedVersion = semver.coerce(version);
2351
- if (coercedVersion) {
2352
- internalVersion = coercedVersion.version;
2353
- } else {
2354
- throw new Error(
2355
- `Invalid version format for scraping: '${version}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`
2356
- );
2357
- }
2358
- } else {
2359
- throw new Error(
2360
- `Invalid version format for scraping: '${version}'. Use 'X.Y.Z', 'X.Y.Z-prerelease', 'X.Y', 'X', or omit.`
2361
- );
2362
- }
2363
- }
2364
- internalVersion = internalVersion.toLowerCase();
2365
- await this.docService.removeAllDocuments(library, internalVersion);
2366
- logger.info(
2367
- `💾 Cleared store for ${library}@${internalVersion || "[no version]"} before scraping.`
2368
- );
2369
- const manager = this.manager;
2370
- const jobId = await manager.enqueueJob(library, internalVersion, {
2371
- url,
2372
- library,
2373
- version: internalVersion,
2374
- scope: scraperOptions?.scope ?? "subpages",
2375
- followRedirects: scraperOptions?.followRedirects ?? true,
2376
- maxPages: scraperOptions?.maxPages ?? DEFAULT_MAX_PAGES$1,
2377
- maxDepth: scraperOptions?.maxDepth ?? DEFAULT_MAX_DEPTH$1,
2378
- maxConcurrency: scraperOptions?.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY,
2379
- ignoreErrors: scraperOptions?.ignoreErrors ?? true,
2380
- scrapeMode: scraperOptions?.scrapeMode ?? ScrapeMode.Auto
2381
- // Pass scrapeMode enum
2382
- });
2383
- if (waitForCompletion) {
2384
- try {
2385
- await manager.waitForJobCompletion(jobId);
2386
- const finalJob = await manager.getJob(jobId);
2387
- const finalPagesScraped = finalJob?.progress?.pagesScraped ?? 0;
2388
- logger.debug(
2389
- `Job ${jobId} finished with status ${finalJob?.status}. Pages scraped: ${finalPagesScraped}`
2390
- );
2391
- return {
2392
- pagesScraped: finalPagesScraped
2393
- };
2394
- } catch (error) {
2395
- logger.error(`Job ${jobId} failed or was cancelled: ${error}`);
2396
- throw error;
2397
- }
2398
- }
2399
- return { jobId };
2400
- }
2401
- }
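The version normalization above accepts full semver strings directly and expands partial ones with semver.coerce; anything else is rejected. Illustrative values:

import semver from "semver";
semver.valid("18.2.0");         // => "18.2.0" (used as-is)
semver.coerce("18")?.version;   // => "18.0.0"
semver.coerce("18.2")?.version; // => "18.2.0"
// "18.x" matches neither semver.valid nor the partial-version regex,
// so ScrapeTool throws for it.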
2402
- class SearchTool {
2403
- docService;
2404
- constructor(docService) {
2405
- this.docService = docService;
2406
- }
2407
- async execute(options) {
2408
- const { library, version, query, limit = 5, exactMatch = false } = options;
2409
- if (exactMatch && (!version || version === "latest")) {
2410
- await this.docService.validateLibraryExists(library);
2411
- const allLibraries = await this.docService.listLibraries();
2412
- const libraryInfo = allLibraries.find((lib) => lib.library === library);
2413
- const detailedVersions = libraryInfo ? libraryInfo.versions : [];
2414
- throw new VersionNotFoundError(
2415
- library,
2416
- "latest",
2417
- // Or perhaps the original 'version' if it wasn't 'latest'? Check logic.
2418
- detailedVersions
2419
- );
2420
- }
2421
- const resolvedVersion = version || "latest";
2422
- logger.info(
2423
- `🔍 Searching ${library}@${resolvedVersion} for: ${query}${exactMatch ? " (exact match)" : ""}`
2424
- );
2425
- try {
2426
- await this.docService.validateLibraryExists(library);
2427
- let versionToSearch = resolvedVersion;
2428
- if (!exactMatch) {
2429
- const versionResult = await this.docService.findBestVersion(library, version);
2430
- versionToSearch = versionResult.bestMatch;
2431
- }
2432
- const results = await this.docService.searchStore(
2433
- library,
2434
- versionToSearch,
2435
- query,
2436
- limit
2437
- );
2438
- logger.info(`✅ Found ${results.length} matching results`);
2439
- return { results };
2440
- } catch (error) {
2441
- logger.error(
2442
- `❌ Search failed: ${error instanceof Error ? error.message : "Unknown error"}`
2443
- );
2444
- throw error;
2445
- }
2446
- }
2447
- }
2448
- let projectRoot = null;
2449
- function getProjectRoot() {
2450
- if (projectRoot) {
2451
- return projectRoot;
2452
- }
2453
- const currentFilePath = fileURLToPath(import.meta.url);
2454
- let currentDir = path.dirname(currentFilePath);
2455
- while (true) {
2456
- const packageJsonPath = path.join(currentDir, "package.json");
2457
- if (fs$1.existsSync(packageJsonPath)) {
2458
- projectRoot = currentDir;
2459
- return projectRoot;
2460
- }
2461
- const parentDir = path.dirname(currentDir);
2462
- if (parentDir === currentDir) {
2463
- throw new Error("Could not find project root containing package.json.");
2464
- }
2465
- currentDir = parentDir;
2466
- }
2467
- }
2468
- const CHILD_LIMIT = 5;
2469
- const SIBLING_LIMIT = 2;
2470
- class DocumentRetrieverService {
2471
- documentStore;
2472
- constructor(documentStore) {
2473
- this.documentStore = documentStore;
2474
- }
2475
- /**
2476
- * Collects all related chunk IDs for a given initial hit.
2477
- * Returns an object with url, hitId, relatedIds (Set), and score.
2478
- */
2479
- async getRelatedChunkIds(library, version, doc, siblingLimit = SIBLING_LIMIT, childLimit = CHILD_LIMIT) {
2480
- const id = doc.id;
2481
- const url = doc.metadata.url;
2482
- const score = doc.metadata.score;
2483
- const relatedIds = /* @__PURE__ */ new Set();
2484
- relatedIds.add(id);
2485
- const parent = await this.documentStore.findParentChunk(library, version, id);
2486
- if (parent) {
2487
- relatedIds.add(parent.id);
2488
- }
2489
- const precedingSiblings = await this.documentStore.findPrecedingSiblingChunks(
2490
- library,
2491
- version,
2492
- id,
2493
- siblingLimit
2494
- );
2495
- for (const sib of precedingSiblings) {
2496
- relatedIds.add(sib.id);
2497
- }
2498
- const childChunks = await this.documentStore.findChildChunks(
2499
- library,
2500
- version,
2501
- id,
2502
- childLimit
2503
- );
2504
- for (const child of childChunks) {
2505
- relatedIds.add(child.id);
2506
- }
2507
- const subsequentSiblings = await this.documentStore.findSubsequentSiblingChunks(
2508
- library,
2509
- version,
2510
- id,
2511
- siblingLimit
2512
- );
2513
- for (const sib of subsequentSiblings) {
2514
- relatedIds.add(sib.id);
2515
- }
2516
- return { url, hitId: id, relatedIds, score };
2517
- }
2518
- /**
2519
- * Groups related chunk info by URL, deduplicates IDs, and finds max score per URL.
2520
- */
2521
- groupAndPrepareFetch(relatedInfos) {
2522
- const urlMap = /* @__PURE__ */ new Map();
2523
- for (const info of relatedInfos) {
2524
- let entry = urlMap.get(info.url);
2525
- if (!entry) {
2526
- entry = { uniqueChunkIds: /* @__PURE__ */ new Set(), maxScore: info.score };
2527
- urlMap.set(info.url, entry);
2528
- }
2529
- for (const id of info.relatedIds) {
2530
- entry.uniqueChunkIds.add(id);
2531
- }
2532
- if (info.score > entry.maxScore) {
2533
- entry.maxScore = info.score;
2534
- }
2535
- }
2536
- return urlMap;
2537
- }
2538
- /**
2539
- * Finalizes the merged result for a URL group by fetching, sorting, and joining content.
2540
- */
2541
- async finalizeResult(library, version, url, uniqueChunkIds, maxScore) {
2542
- const ids = Array.from(uniqueChunkIds);
2543
- const docs = await this.documentStore.findChunksByIds(library, version, ids);
2544
- const content = docs.map((d) => d.pageContent).join("\n\n");
2545
- return {
2546
- url,
2547
- content,
2548
- score: maxScore
2549
- };
2550
- }
2551
- /**
2552
- * Searches for documents and expands the context around the matches.
2553
- * @param library The library name.
2554
- * @param version The library version (optional; defaults to searching documents stored without a version).
2555
- * @param query The search query.
2558
- * @param limit The optional limit for the initial search results.
2559
- * @returns An array of result objects ({ url, content, score }) aggregating the retrieved chunks per URL.
2560
- */
2561
- async search(library, version, query, limit) {
2562
- const normalizedVersion = (version ?? "").toLowerCase();
2563
- const initialResults = await this.documentStore.findByContent(
2564
- library,
2565
- normalizedVersion,
2566
- query,
2567
- limit ?? 10
2568
- );
2569
- const relatedInfos = await Promise.all(
2570
- initialResults.map(
2571
- (doc) => this.getRelatedChunkIds(library, normalizedVersion, doc)
2572
- )
2573
- );
2574
- const urlMap = this.groupAndPrepareFetch(relatedInfos);
2575
- const results = [];
2576
- for (const [url, { uniqueChunkIds, maxScore }] of urlMap.entries()) {
2577
- const result = await this.finalizeResult(
2578
- library,
2579
- normalizedVersion,
2580
- url,
2581
- uniqueChunkIds,
2582
- maxScore
2583
- );
2584
- results.push(result);
2585
- }
2586
- return results;
2587
- }
2588
- }
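A hypothetical call, tying the pieces above together:

const results = await retriever.search("react", "18.2.0", "useState", 5);
// Each initial hit is expanded with its parent chunk, up to 2 preceding
// and 2 subsequent siblings, and up to 5 children; chunks sharing a URL
// are then merged into one { url, content, score } result, with content
// joined in sort_order and score taken from the best hit for that URL.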
2589
- class StoreError extends Error {
2590
- constructor(message, cause) {
2591
- super(cause ? `${message} caused by ${cause}` : message);
2592
- this.cause = cause;
2593
- this.name = this.constructor.name;
2594
- const causeError = cause instanceof Error ? cause : cause ? new Error(String(cause)) : void 0;
2595
- if (causeError?.stack) {
2596
- this.stack = causeError.stack;
2597
- }
2598
- }
2599
- }
2600
- class DimensionError extends StoreError {
2601
- constructor(modelName, modelDimension, dbDimension) {
2602
- super(
2603
- `Model "${modelName}" produces ${modelDimension}-dimensional vectors, which exceeds the database's fixed dimension of ${dbDimension}. Please use a model with dimension ≤ ${dbDimension}.`
2604
- );
2605
- this.modelName = modelName;
2606
- this.modelDimension = modelDimension;
2607
- this.dbDimension = dbDimension;
2608
- }
2609
- }
2610
- class ConnectionError extends StoreError {
2611
- }
2612
- const MIGRATIONS_DIR = path.join(getProjectRoot(), "db", "migrations");
2613
- const MIGRATIONS_TABLE = "_schema_migrations";
2614
- function ensureMigrationsTable(db) {
2615
- db.exec(`
2616
- CREATE TABLE IF NOT EXISTS ${MIGRATIONS_TABLE} (
2617
- id TEXT PRIMARY KEY,
2618
- applied_at DATETIME DEFAULT CURRENT_TIMESTAMP
2619
- );
2620
- `);
2621
- }
2622
- function getAppliedMigrations(db) {
2623
- const stmt = db.prepare(`SELECT id FROM ${MIGRATIONS_TABLE}`);
2624
- const rows = stmt.all();
2625
- return new Set(rows.map((row) => row.id));
2626
- }
2627
- function applyMigrations(db) {
2628
- try {
2629
- logger.debug("Applying database migrations...");
2630
- ensureMigrationsTable(db);
2631
- const appliedMigrations = getAppliedMigrations(db);
2632
- if (!fs$1.existsSync(MIGRATIONS_DIR)) {
2633
- throw new StoreError("Migrations directory not found");
2634
- }
2635
- const migrationFiles = fs$1.readdirSync(MIGRATIONS_DIR).filter((file) => file.endsWith(".sql")).sort();
2636
- let appliedCount = 0;
2637
- for (const filename of migrationFiles) {
2638
- if (!appliedMigrations.has(filename)) {
2639
- logger.debug(`Applying migration: ${filename}`);
2640
- const filePath = path.join(MIGRATIONS_DIR, filename);
2641
- const sql = fs$1.readFileSync(filePath, "utf8");
2642
- const transaction = db.transaction(() => {
2643
- db.exec(sql);
2644
- const insertStmt = db.prepare(
2645
- `INSERT INTO ${MIGRATIONS_TABLE} (id) VALUES (?)`
2646
- );
2647
- insertStmt.run(filename);
2648
- });
2649
- try {
2650
- transaction();
2651
- logger.debug(`Successfully applied migration: ${filename}`);
2652
- appliedCount++;
2653
- } catch (error) {
2654
- logger.error(`Failed to apply migration: ${filename} - ${error}`);
2655
- throw new StoreError(`Migration failed: ${filename} - ${error}`);
2656
- }
2657
- }
2658
- }
2659
- if (appliedCount > 0) {
2660
- logger.debug(`Applied ${appliedCount} new migration(s).`);
2661
- } else {
2662
- logger.debug("Database schema is up to date.");
2663
- }
2664
- } catch (error) {
2665
- if (error instanceof StoreError) {
2666
- throw error;
2667
- }
2668
- throw new StoreError("Failed during migration process", error);
2669
- }
2670
- }
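Migrations are plain .sql files applied once each, in lexicographic filename order, inside a per-file transaction; applied names are recorded in _schema_migrations. A hypothetical layout (filenames invented):

db/migrations/
  0001-initial-schema.sql   applied first (lexicographic sort)
  0002-add-indexed-at.sql   recorded in _schema_migrations on success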
2671
- const VECTOR_DIMENSION = 1536;
2672
- function mapDbDocumentToDocument(doc) {
2673
- return {
2674
- id: doc.id,
2675
- pageContent: doc.content,
2676
- metadata: JSON.parse(doc.metadata)
2677
- };
2678
- }
2679
- class DocumentStore {
2680
- db;
2681
- embeddings;
2682
- dbDimension = VECTOR_DIMENSION;
2683
- modelDimension;
2684
- statements;
2685
- /**
2686
- * Calculates Reciprocal Rank Fusion score for a result
2687
- */
2688
- calculateRRF(vecRank, ftsRank, k = 60) {
2689
- let rrf = 0;
2690
- if (vecRank !== void 0) {
2691
- rrf += 1 / (k + vecRank);
2692
- }
2693
- if (ftsRank !== void 0) {
2694
- rrf += 1 / (k + ftsRank);
2695
- }
2696
- return rrf;
2697
- }
2698
- /**
2699
- * Assigns ranks to search results based on their scores
2700
- */
2701
- assignRanks(results) {
2702
- const vecRanks = /* @__PURE__ */ new Map();
2703
- const ftsRanks = /* @__PURE__ */ new Map();
2704
- results.filter((r) => r.vec_score !== void 0).sort((a, b) => (a.vec_score ?? 0) - (b.vec_score ?? 0)).forEach((result, index) => {
2705
- vecRanks.set(Number(result.id), index + 1);
2706
- });
2707
- results.filter((r) => r.fts_score !== void 0).sort((a, b) => (a.fts_score ?? 0) - (b.fts_score ?? 0)).forEach((result, index) => {
2708
- ftsRanks.set(Number(result.id), index + 1);
2709
- });
2710
- return results.map((result) => ({
2711
- ...result,
2712
- vec_rank: vecRanks.get(Number(result.id)),
2713
- fts_rank: ftsRanks.get(Number(result.id)),
2714
- rrf_score: this.calculateRRF(
2715
- vecRanks.get(Number(result.id)),
2716
- ftsRanks.get(Number(result.id))
2717
- )
2718
- }));
2719
- }
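A worked example of the fusion above, with the default k = 60: a document ranked 1st by vector distance and 3rd by BM25 gets

rrf = 1/(60 + 1) + 1/(60 + 3) = 1/61 + 1/63 ≈ 0.0164 + 0.0159 ≈ 0.0323

A document found by only one of the two searches contributes a single 1/(k + rank) term, so a strong single-mode match can still outrank a weak dual match.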
2720
- constructor(dbPath) {
2721
- if (!dbPath) {
2722
- throw new StoreError("Missing required database path");
2723
- }
2724
- this.db = new Database(dbPath);
2725
- }
2726
- /**
2727
- * Sets up prepared statements for database queries
2728
- */
2729
- prepareStatements() {
2730
- const statements = {
2731
- getById: this.db.prepare("SELECT * FROM documents WHERE id = ?"),
2732
- insertDocument: this.db.prepare(
2733
- "INSERT INTO documents (library, version, url, content, metadata, sort_order, indexed_at) VALUES (?, ?, ?, ?, ?, ?, ?)"
2734
- // Added indexed_at
2735
- ),
2736
- insertEmbedding: this.db.prepare(
2737
- "INSERT INTO documents_vec (rowid, library, version, embedding) VALUES (?, ?, ?, ?)"
2738
- ),
2739
- deleteDocuments: this.db.prepare(
2740
- "DELETE FROM documents WHERE library = ? AND version = ?"
2741
- ),
2742
- queryVersions: this.db.prepare(
2743
- "SELECT DISTINCT version FROM documents WHERE library = ? ORDER BY version"
2744
- ),
2745
- checkExists: this.db.prepare(
2746
- "SELECT id FROM documents WHERE library = ? AND version = ? LIMIT 1"
2747
- ),
2748
- queryLibraryVersions: this.db.prepare(
2749
- `SELECT
2750
- library,
2751
- version,
2752
- COUNT(*) as documentCount,
2753
- COUNT(DISTINCT url) as uniqueUrlCount,
2754
- MIN(indexed_at) as indexedAt
2755
- FROM documents
2756
- GROUP BY library, version
2757
- ORDER BY library, version`
2758
- ),
2759
- getChildChunks: this.db.prepare(`
2760
- SELECT * FROM documents
2761
- WHERE library = ?
2762
- AND version = ?
2763
- AND url = ?
2764
- AND json_array_length(json_extract(metadata, '$.path')) = ?
2765
- AND json_extract(metadata, '$.path') LIKE ? || '%'
2766
- AND sort_order > (SELECT sort_order FROM documents WHERE id = ?)
2767
- ORDER BY sort_order
2768
- LIMIT ?
2769
- `),
2770
- getPrecedingSiblings: this.db.prepare(`
2771
- SELECT * FROM documents
2772
- WHERE library = ?
2773
- AND version = ?
2774
- AND url = ?
2775
- AND sort_order < (SELECT sort_order FROM documents WHERE id = ?)
2776
- AND json_extract(metadata, '$.path') = ?
2777
- ORDER BY sort_order DESC
2778
- LIMIT ?
2779
- `),
2780
- getSubsequentSiblings: this.db.prepare(`
2781
- SELECT * FROM documents
2782
- WHERE library = ?
2783
- AND version = ?
2784
- AND url = ?
2785
- AND sort_order > (SELECT sort_order FROM documents WHERE id = ?)
2786
- AND json_extract(metadata, '$.path') = ?
2787
- ORDER BY sort_order
2788
- LIMIT ?
2789
- `),
2790
- getParentChunk: this.db.prepare(`
2791
- SELECT * FROM documents
2792
- WHERE library = ?
2793
- AND version = ?
2794
- AND url = ?
2795
- AND json_extract(metadata, '$.path') = ?
2796
- AND sort_order < (SELECT sort_order FROM documents WHERE id = ?)
2797
- ORDER BY sort_order DESC
2798
- LIMIT 1
2799
- `)
2800
- };
2801
- this.statements = statements;
2802
- }
2803
- /**
2804
- * Pads a vector to the fixed database dimension by appending zeros.
2805
- * Throws an error if the input vector is longer than the database dimension.
2806
- */
2807
- padVector(vector) {
2808
- if (vector.length > this.dbDimension) {
2809
- throw new Error(
2810
- `Vector dimension ${vector.length} exceeds database dimension ${this.dbDimension}`
2811
- );
2812
- }
2813
- if (vector.length === this.dbDimension) {
2814
- return vector;
2815
- }
2816
- return [...vector, ...new Array(this.dbDimension - vector.length).fill(0)];
2817
- }
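For example, a 768-dimensional embedding from a smaller model is stored as the original 768 values followed by 768 zeros, so every row in documents_vec has exactly 1536 components regardless of the model in use:

// Illustrative values for the padding above:
// padVector(vec768)  => [...vec768, 0, 0, ..., 0]  (length 1536)
// padVector(vec2048) => throws (exceeds the fixed database dimension)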
2818
- /**
2819
- * Initializes embeddings client using environment variables for configuration.
2820
- *
2821
- * The embedding model is configured using DOCS_MCP_EMBEDDING_MODEL environment variable.
2822
- * Format: "provider:model_name" (e.g., "google:text-embedding-004") or just "model_name"
2823
- * for OpenAI (default).
2824
- *
2825
- * Supported providers and their required environment variables:
2826
- * - openai: OPENAI_API_KEY (and optionally OPENAI_API_BASE, OPENAI_ORG_ID)
2827
- * - google: GOOGLE_APPLICATION_CREDENTIALS (path to service account JSON)
2828
- * - aws: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION (or BEDROCK_AWS_REGION)
2829
- * - microsoft: Azure OpenAI credentials (AZURE_OPENAI_API_*)
2830
- */
2831
- async initializeEmbeddings() {
2832
- const modelSpec = process.env.DOCS_MCP_EMBEDDING_MODEL || "text-embedding-3-small";
2833
- const { createEmbeddingModel } = await import("./EmbeddingFactory-0Z5e_g1J.js");
2834
- this.embeddings = createEmbeddingModel(modelSpec);
2835
- const testVector = await this.embeddings.embedQuery("test");
2836
- this.modelDimension = testVector.length;
2837
- if (this.modelDimension > this.dbDimension) {
2838
- throw new DimensionError(modelSpec, this.modelDimension, this.dbDimension);
2839
- }
2840
- }
2841
- /**
2842
- * Escapes a query string for use with SQLite FTS5 MATCH operator.
2843
- * Wraps the query in double quotes and escapes internal double quotes.
2844
- */
2845
- escapeFtsQuery(query) {
2846
- const escapedQuotes = query.replace(/"/g, '""');
2847
- return `"${escapedQuotes}"`;
2848
- }
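Example of the escaping above:

escapeFtsQuery('hello "world"')  // => '"hello ""world"""'
// The outer quotes force FTS5 to treat the input as a phrase; doubling
// the inner quotes keeps user input from breaking MATCH syntax.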
2849
- /**
2850
- * Initializes database connection and ensures readiness
2851
- */
2852
- async initialize() {
2853
- try {
2854
- sqliteVec.load(this.db);
2855
- applyMigrations(this.db);
2856
- this.prepareStatements();
2857
- await this.initializeEmbeddings();
2858
- } catch (error) {
2859
- if (error instanceof StoreError) {
2860
- throw error;
2861
- }
2862
- throw new ConnectionError("Failed to initialize database connection", error);
2863
- }
2864
- }
2865
- /**
2866
- * Gracefully closes database connections
2867
- */
2868
- async shutdown() {
2869
- this.db.close();
2870
- }
2871
- /**
2872
- * Retrieves all unique versions for a specific library
2873
- */
2874
- async queryUniqueVersions(library) {
2875
- try {
2876
- const rows = this.statements.queryVersions.all(library.toLowerCase());
2877
- return rows.map((row) => row.version);
2878
- } catch (error) {
2879
- throw new ConnectionError("Failed to query versions", error);
2880
- }
2881
- }
2882
- /**
2883
- * Verifies existence of documents for a specific library version
2884
- */
2885
- async checkDocumentExists(library, version) {
2886
- try {
2887
- const result = this.statements.checkExists.get(
2888
- library.toLowerCase(),
2889
- version.toLowerCase()
2890
- );
2891
- return result !== void 0;
2892
- } catch (error) {
2893
- throw new ConnectionError("Failed to check document existence", error);
2894
- }
2895
- }
2896
- /**
2897
- * Retrieves a mapping of all libraries to their available versions with details.
2898
- */
2899
- async queryLibraryVersions() {
2900
- try {
2901
- const rows = this.statements.queryLibraryVersions.all();
2902
- const libraryMap = /* @__PURE__ */ new Map();
2903
- for (const row of rows) {
2904
- const library = row.library;
2905
- if (!libraryMap.has(library)) {
2906
- libraryMap.set(library, []);
2907
- }
2908
- const indexedAtISO = row.indexedAt ? new Date(row.indexedAt).toISOString() : null;
2909
- libraryMap.get(library)?.push({
2910
- version: row.version,
2911
- documentCount: row.documentCount,
2912
- uniqueUrlCount: row.uniqueUrlCount,
2913
- indexedAt: indexedAtISO
2914
- });
2915
- }
2916
- for (const versions of libraryMap.values()) {
2917
- versions.sort((a, b) => {
2918
- if (a.version === "" && b.version !== "") {
2919
- return -1;
2920
- }
2921
- if (a.version !== "" && b.version === "") {
2922
- return 1;
2923
- }
2924
- if (a.version === "" && b.version === "") {
2925
- return 0;
2926
- }
2927
- return semver__default.compare(a.version, b.version);
2928
- });
2929
- }
2930
- return libraryMap;
2931
- } catch (error) {
2932
- throw new ConnectionError("Failed to query library versions", error);
2933
- }
2934
- }
2935
- /**
2936
- * Stores documents with library and version metadata, generating embeddings
2937
- * for vector similarity search
2938
- */
2939
- async addDocuments(library, version, documents) {
2940
- try {
2941
- const texts = documents.map((doc) => {
2942
- const header = `<title>${doc.metadata.title}</title>
2943
- <url>${doc.metadata.url}</url>
2944
- <path>${doc.metadata.path.join(" / ")}</path>
2945
- `;
2946
- return `${header}${doc.pageContent}`;
2947
- });
2948
- const rawEmbeddings = [];
2949
- for (let i = 0; i < texts.length; i += EMBEDDING_BATCH_SIZE) {
2950
- const batchTexts = texts.slice(i, i + EMBEDDING_BATCH_SIZE);
2951
- const batchEmbeddings = await this.embeddings.embedDocuments(batchTexts);
2952
- rawEmbeddings.push(...batchEmbeddings);
2953
- }
2954
- const paddedEmbeddings = rawEmbeddings.map((vector) => this.padVector(vector));
2955
- const transaction = this.db.transaction((docs) => {
2956
- for (let i = 0; i < docs.length; i++) {
2957
- const doc = docs[i];
2958
- const url = doc.metadata.url;
2959
- if (!url || typeof url !== "string" || !url.trim()) {
2960
- throw new StoreError("Document metadata must include a valid URL");
2961
- }
2962
- const result = this.statements.insertDocument.run(
2963
- library.toLowerCase(),
2964
- version.toLowerCase(),
2965
- url,
2966
- doc.pageContent,
2967
- JSON.stringify(doc.metadata),
2968
- i,
2969
- (/* @__PURE__ */ new Date()).toISOString()
2970
- // Pass current timestamp for indexed_at
2971
- );
2972
- const rowId = result.lastInsertRowid;
2973
- this.statements.insertEmbedding.run(
2974
- BigInt(rowId),
2975
- library.toLowerCase(),
2976
- version.toLowerCase(),
2977
- JSON.stringify(paddedEmbeddings[i])
2978
- );
2979
- }
2980
- });
2981
- transaction(documents);
2982
- } catch (error) {
2983
- throw new ConnectionError("Failed to add documents to store", error);
2984
- }
2985
- }
2986
- /**
2987
- * Removes documents matching specified library and version
2988
- * @returns Number of documents deleted
2989
- */
2990
- async deleteDocuments(library, version) {
2991
- try {
2992
- const result = this.statements.deleteDocuments.run(
2993
- library.toLowerCase(),
2994
- version.toLowerCase()
2995
- );
2996
- return result.changes;
2997
- } catch (error) {
2998
- throw new ConnectionError("Failed to delete documents", error);
2999
- }
3000
- }
3001
- /**
3002
- * Retrieves a document by its ID.
3003
- * @param id The ID of the document.
3004
- * @returns The document, or null if not found.
3005
- */
3006
- async getById(id) {
3007
- try {
3008
- const row = this.statements.getById.get(id);
3009
- if (!row) {
3010
- return null;
3011
- }
3012
- return mapDbDocumentToDocument(row);
3013
- } catch (error) {
3014
- throw new ConnectionError(`Failed to get document by ID ${id}`, error);
3015
- }
3016
- }
3017
- /**
3018
- * Finds documents matching a text query using hybrid search.
3019
- * Combines vector similarity search with full-text search using Reciprocal Rank Fusion.
3020
- */
3021
- async findByContent(library, version, query, limit) {
3022
- try {
3023
- const rawEmbedding = await this.embeddings.embedQuery(query);
3024
- const embedding = this.padVector(rawEmbedding);
3025
- const ftsQuery = this.escapeFtsQuery(query);
3026
- const stmt = this.db.prepare(`
3027
- WITH vec_scores AS (
3028
- SELECT
3029
- rowid as id,
3030
- distance as vec_score
3031
- FROM documents_vec
3032
- WHERE library = ?
3033
- AND version = ?
3034
- AND embedding MATCH ?
3035
- ORDER BY vec_score
3036
- LIMIT ?
3037
- ),
3038
- fts_scores AS (
3039
- SELECT
3040
- f.rowid as id,
3041
- bm25(documents_fts, 10.0, 1.0, 5.0, 1.0) as fts_score
3042
- FROM documents_fts f
3043
- JOIN documents d ON f.rowid = d.rowid
3044
- WHERE d.library = ?
3045
- AND d.version = ?
3046
- AND documents_fts MATCH ?
3047
- ORDER BY fts_score
3048
- LIMIT ?
3049
- )
3050
- SELECT
3051
- d.id,
3052
- d.content,
3053
- d.metadata,
3054
- COALESCE(1 / (1 + v.vec_score), 0) as vec_score,
3055
- COALESCE(1 / (1 + f.fts_score), 0) as fts_score
3056
- FROM documents d
3057
- LEFT JOIN vec_scores v ON d.id = v.id
3058
- LEFT JOIN fts_scores f ON d.id = f.id
3059
- WHERE v.id IS NOT NULL OR f.id IS NOT NULL
3060
- `);
3061
- const rawResults = stmt.all(
3062
- library.toLowerCase(),
3063
- version.toLowerCase(),
3064
- JSON.stringify(embedding),
3065
- limit,
3066
- library.toLowerCase(),
3067
- version.toLowerCase(),
3068
- ftsQuery,
3069
- // Use the escaped query
3070
- limit
3071
- );
3072
- const rankedResults = this.assignRanks(rawResults);
3073
- const topResults = rankedResults.sort((a, b) => b.rrf_score - a.rrf_score).slice(0, limit);
3074
- return topResults.map((row) => ({
3075
- ...mapDbDocumentToDocument(row),
3076
- metadata: {
3077
- ...JSON.parse(row.metadata),
3078
- score: row.rrf_score,
3079
- vec_rank: row.vec_rank,
3080
- fts_rank: row.fts_rank
3081
- }
3082
- }));
3083
- } catch (error) {
3084
- throw new ConnectionError(
3085
- `Failed to find documents by content with query "${query}"`,
3086
- error
3087
- );
3088
- }
3089
- }
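A hypothetical call through the hybrid path above:

const hits = await store.findByContent("react", "18.2.0", "useState hook", 5);
// Vector and FTS candidates are fetched separately (LIMIT 5 each),
// ranked and fused via assignRanks/calculateRRF, and the top 5 by
// rrf_score come back with score, vec_rank and fts_rank in metadata.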
3090
- /**
3091
- * Finds child chunks of a given document based on path hierarchy.
3092
- */
3093
- async findChildChunks(library, version, id, limit) {
3094
- try {
3095
- const parent = await this.getById(id);
3096
- if (!parent) {
3097
- return [];
3098
- }
3099
- const parentPath = parent.metadata.path ?? [];
3100
- const parentUrl = parent.metadata.url;
3101
- const result = this.statements.getChildChunks.all(
3102
- library.toLowerCase(),
3103
- version.toLowerCase(),
3104
- parentUrl,
3105
- parentPath.length + 1,
3106
- JSON.stringify(parentPath),
3107
- id,
3108
- limit
3109
- );
3110
- return result.map((row) => mapDbDocumentToDocument(row));
3111
- } catch (error) {
3112
- throw new ConnectionError(`Failed to find child chunks for ID ${id}`, error);
3113
- }
3114
- }
3115
- /**
3116
- * Finds preceding sibling chunks of a given document.
3117
- */
3118
- async findPrecedingSiblingChunks(library, version, id, limit) {
3119
- try {
3120
- const reference = await this.getById(id);
3121
- if (!reference) {
3122
- return [];
3123
- }
3124
- const refMetadata = reference.metadata;
3125
- const result = this.statements.getPrecedingSiblings.all(
3126
- library.toLowerCase(),
3127
- version.toLowerCase(),
3128
- refMetadata.url,
3129
- id,
3130
- JSON.stringify(refMetadata.path),
3131
- limit
3132
- );
3133
- return result.reverse().map((row) => mapDbDocumentToDocument(row));
3134
- } catch (error) {
3135
- throw new ConnectionError(
3136
- `Failed to find preceding sibling chunks for ID ${id}`,
3137
- error
3138
- );
3139
- }
3140
- }
3141
- /**
3142
- * Finds subsequent sibling chunks of a given document.
3143
- */
3144
- async findSubsequentSiblingChunks(library, version, id, limit) {
3145
- try {
3146
- const reference = await this.getById(id);
3147
- if (!reference) {
3148
- return [];
3149
- }
3150
- const refMetadata = reference.metadata;
3151
- const result = this.statements.getSubsequentSiblings.all(
3152
- library.toLowerCase(),
3153
- version.toLowerCase(),
3154
- refMetadata.url,
3155
- id,
3156
- JSON.stringify(refMetadata.path),
3157
- limit
3158
- );
3159
- return result.map((row) => mapDbDocumentToDocument(row));
3160
- } catch (error) {
3161
- throw new ConnectionError(
3162
- `Failed to find subsequent sibling chunks for ID ${id}`,
3163
- error
3164
- );
3165
- }
3166
- }
3167
- /**
3168
- * Finds the parent chunk of a given document.
3169
- */
3170
- async findParentChunk(library, version, id) {
3171
- try {
3172
- const child = await this.getById(id);
3173
- if (!child) {
3174
- return null;
3175
- }
3176
- const childMetadata = child.metadata;
3177
- const path2 = childMetadata.path ?? [];
3178
- const parentPath = path2.slice(0, -1);
3179
- if (parentPath.length === 0) {
3180
- return null;
3181
- }
3182
- const result = this.statements.getParentChunk.get(
3183
- library.toLowerCase(),
3184
- version.toLowerCase(),
3185
- childMetadata.url,
3186
- JSON.stringify(parentPath),
3187
- id
3188
- );
3189
- if (!result) {
3190
- return null;
3191
- }
3192
- return mapDbDocumentToDocument(result);
3193
- } catch (error) {
3194
- throw new ConnectionError(`Failed to find parent chunk for ID ${id}`, error);
3195
- }
3196
- }
3197
- /**
3198
- * Fetches multiple documents by their IDs in a single call.
3199
- * Returns an array of Document objects, sorted by their sort_order.
3200
- */
3201
- async findChunksByIds(library, version, ids) {
3202
- if (!ids.length) return [];
3203
- try {
3204
- const placeholders = ids.map(() => "?").join(",");
3205
- const stmt = this.db.prepare(
3206
- `SELECT * FROM documents WHERE library = ? AND version = ? AND id IN (${placeholders}) ORDER BY sort_order`
3207
- );
3208
- const rows = stmt.all(
3209
- library.toLowerCase(),
3210
- version.toLowerCase(),
3211
- ...ids
3212
- );
3213
- return rows.map((row) => mapDbDocumentToDocument(row));
3214
- } catch (error) {
3215
- throw new ConnectionError("Failed to fetch documents by IDs", error);
3216
- }
3217
- }
3218
- }
3219
- class DocumentManagementService {
3220
- store;
3221
- documentRetriever;
3222
- splitter;
3223
- /**
3224
- * Normalizes a version string, converting null or undefined to an empty string
3225
- * and converting to lowercase.
3226
- */
3227
- normalizeVersion(version) {
3228
- return (version ?? "").toLowerCase();
3229
- }
3230
- constructor() {
3231
- let dbPath;
3232
- let dbDir;
3233
- const envStorePath = process.env.DOCS_MCP_STORE_PATH;
3234
- if (envStorePath) {
3235
- dbDir = envStorePath;
3236
- dbPath = path.join(dbDir, "documents.db");
3237
- logger.debug(`💾 Using database directory from DOCS_MCP_STORE_PATH: ${dbDir}`);
3238
- } else {
3239
- const projectRoot2 = getProjectRoot();
3240
- const oldDbDir = path.join(projectRoot2, ".store");
3241
- const oldDbPath = path.join(oldDbDir, "documents.db");
3242
- const oldDbExists = fs$1.existsSync(oldDbPath);
3243
- if (oldDbExists) {
3244
- dbPath = oldDbPath;
3245
- dbDir = oldDbDir;
3246
- logger.debug(`💾 Using legacy database path: ${dbPath}`);
3247
- } else {
3248
- const standardPaths = envPaths("docs-mcp-server", { suffix: "" });
3249
- dbDir = standardPaths.data;
3250
- dbPath = path.join(dbDir, "documents.db");
3251
- logger.debug(`💾 Using standard database directory: ${dbDir}`);
3252
- }
3253
- }
3254
- try {
3255
- fs$1.mkdirSync(dbDir, { recursive: true });
3256
- } catch (error) {
3257
- logger.error(`⚠️ Failed to create database directory ${dbDir}: ${error}`);
3258
- }
3259
- this.store = new DocumentStore(dbPath);
3260
- this.documentRetriever = new DocumentRetrieverService(this.store);
3261
- const semanticSplitter = new SemanticMarkdownSplitter(
3262
- SPLITTER_PREFERRED_CHUNK_SIZE,
3263
- SPLITTER_MAX_CHUNK_SIZE
3264
- );
3265
- const greedySplitter = new GreedySplitter(
3266
- semanticSplitter,
3267
- SPLITTER_MIN_CHUNK_SIZE,
3268
- SPLITTER_PREFERRED_CHUNK_SIZE
3269
- );
3270
- this.splitter = greedySplitter;
3271
- }
3272
- /**
3273
- * Initializes the underlying document store.
3274
- */
3275
- async initialize() {
3276
- await this.store.initialize();
3277
- }
3278
- /**
3279
- * Shuts down the underlying document store.
3280
- */
3281
- async shutdown() {
3282
- logger.info("🔌 Shutting down store manager");
3283
- await this.store.shutdown();
3284
- }
3285
- /**
3286
- * Validates if a library exists in the store (either versioned or unversioned).
3287
- * Throws LibraryNotFoundError with suggestions if the library is not found.
3288
- * @param library The name of the library to validate.
3289
- * @throws {LibraryNotFoundError} If the library does not exist.
3290
- */
3291
- async validateLibraryExists(library) {
3292
- logger.info(`🔎 Validating existence of library: ${library}`);
3293
- const normalizedLibrary = library.toLowerCase();
3294
- const versions = await this.listVersions(normalizedLibrary);
3295
- const hasUnversioned = await this.exists(normalizedLibrary, "");
3296
- if (versions.length === 0 && !hasUnversioned) {
3297
- logger.warn(`⚠️ Library '${library}' not found.`);
3298
- const allLibraries = await this.listLibraries();
3299
- const libraryNames = allLibraries.map((lib) => lib.library);
3300
- let suggestions = [];
3301
- if (libraryNames.length > 0) {
3302
- const fuse = new Fuse(libraryNames, {
3303
- // Configure fuse.js options if needed (e.g., threshold)
3304
- // isCaseSensitive: false, // Handled by normalizing library names
3305
- // includeScore: true,
3306
- threshold: 0.4
3307
- // Adjust threshold for desired fuzziness (0=exact, 1=match anything)
3308
- });
3309
- const results = fuse.search(normalizedLibrary);
3310
- suggestions = results.slice(0, 3).map((result) => result.item);
3311
- logger.info(`🔍 Found suggestions: ${suggestions.join(", ")}`);
3312
- }
3313
- throw new LibraryNotFoundError(library, suggestions);
3314
- }
3315
- logger.info(`✅ Library '${library}' confirmed to exist.`);
3316
- }
3317
- /**
3318
- * Returns a list of all available semantic versions for a library.
3319
- */
3320
- async listVersions(library) {
3321
- const versions = await this.store.queryUniqueVersions(library);
3322
- return versions.filter((v) => semver__default.valid(v)).map((version) => ({ version }));
3323
- }
3324
- /**
3325
- * Checks if documents exist for a given library and optional version.
3326
- * If version is omitted, checks for documents without a specific version.
3327
- */
3328
- async exists(library, version) {
3329
- const normalizedVersion = this.normalizeVersion(version);
3330
- return this.store.checkDocumentExists(library, normalizedVersion);
3331
- }
3332
-  /**
-   * Finds the most appropriate version of documentation based on the requested version.
-   * When no target version is specified, returns the latest version.
-   *
-   * Version matching behavior:
-   * - Exact versions (e.g., "18.0.0"): Matches that version or any earlier version
-   * - X-Range patterns (e.g., "5.x", "5.2.x"): Matches within the specified range
-   * - "latest" or no version: Returns the latest available version
-   *
-   * For documentation, we prefer matching older versions over no match at all,
-   * since older docs are often still relevant and useful.
-   * Also checks if unversioned documents exist for the library.
-   */
-  async findBestVersion(library, targetVersion) {
-    logger.info(
-      `🔍 Finding best version for ${library}${targetVersion ? `@${targetVersion}` : ""}`
-    );
-    const hasUnversioned = await this.store.checkDocumentExists(library, "");
-    const validSemverVersions = await this.listVersions(library);
-    if (validSemverVersions.length === 0) {
-      if (hasUnversioned) {
-        logger.info(`ℹ️ Unversioned documents exist for ${library}`);
-        return { bestMatch: null, hasUnversioned: true };
-      }
-      logger.warn(`⚠️ No valid versions found for ${library}`);
-      const allLibraryDetails = await this.store.queryLibraryVersions();
-      const libraryDetails = allLibraryDetails.get(library) ?? [];
-      throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
-    }
-    const versionStrings = validSemverVersions.map((v) => v.version);
-    let bestMatch = null;
-    if (!targetVersion || targetVersion === "latest") {
-      bestMatch = semver__default.maxSatisfying(versionStrings, "*");
-    } else {
-      const versionRegex = /^(\d+)(?:\.(?:x(?:\.x)?|\d+(?:\.(?:x|\d+))?))?$|^$/;
-      if (!versionRegex.test(targetVersion)) {
-        logger.warn(`⚠️ Invalid target version format: ${targetVersion}`);
-      } else {
-        let range = targetVersion;
-        if (!semver__default.validRange(targetVersion)) {
-          range = `~${targetVersion}`;
-        } else if (semver__default.valid(targetVersion)) {
-          range = `${range} || <=${targetVersion}`;
-        }
-        bestMatch = semver__default.maxSatisfying(versionStrings, range);
-      }
-    }
-    if (bestMatch) {
-      logger.info(
-        `✅ Found best match version ${bestMatch} for ${library}@${targetVersion}`
-      );
-    } else {
-      logger.warn(`⚠️ No matching semver version found for ${library}@${targetVersion}`);
-    }
-    if (!bestMatch && !hasUnversioned) {
-      const allLibraryDetails = await this.store.queryLibraryVersions();
-      const libraryDetails = allLibraryDetails.get(library) ?? [];
-      throw new VersionNotFoundError(library, targetVersion ?? "", libraryDetails);
-    }
-    return { bestMatch, hasUnversioned };
-  }
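The range resolution above reduces to semver.maxSatisfying calls. A standalone sketch with a hypothetical version list, one line per documented matching case:

import semver from "semver";

const available = ["1.2.0", "5.1.0", "5.2.3", "6.0.0"]; // hypothetical versions

semver.maxSatisfying(available, "*");                // "6.0.0" — "latest" / no version
semver.maxSatisfying(available, "5.x");              // "5.2.3" — X-range stays within 5.x
semver.maxSatisfying(available, "5.1.0 || <=5.1.0"); // "5.1.0" — exact version is present
semver.maxSatisfying(available, "5.1.5 || <=5.1.5"); // "5.1.0" — missing exact version falls back to the closest earlier release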
-  /**
-   * Removes all documents for a specific library and optional version.
-   * If version is omitted, removes documents without a specific version.
-   */
-  async removeAllDocuments(library, version) {
-    const normalizedVersion = this.normalizeVersion(version);
-    logger.info(
-      `🗑️ Removing all documents from ${library}@${normalizedVersion || "[no version]"} store`
-    );
-    const count = await this.store.deleteDocuments(library, normalizedVersion);
-    logger.info(`📊 Deleted ${count} documents`);
-  }
-  /**
-   * Adds a document to the store, splitting it into smaller chunks for better search results.
-   * Uses SemanticMarkdownSplitter to maintain markdown structure and content types during splitting.
-   * Preserves hierarchical structure of documents and distinguishes between text and code segments.
-   * If version is omitted, the document is added without a specific version.
-   */
-  async addDocument(library, version, document) {
-    const normalizedVersion = this.normalizeVersion(version);
-    const url = document.metadata.url;
-    if (!url || typeof url !== "string" || !url.trim()) {
-      throw new StoreError("Document metadata must include a valid URL");
-    }
-    logger.info(`📚 Adding document: ${document.metadata.title}`);
-    if (!document.pageContent.trim()) {
-      throw new Error("Document content cannot be empty");
-    }
-    const chunks = await this.splitter.splitText(document.pageContent);
-    const splitDocs = chunks.map((chunk) => ({
-      pageContent: chunk.content,
-      metadata: {
-        ...document.metadata,
-        level: chunk.section.level,
-        path: chunk.section.path
-      }
-    }));
-    logger.info(`📄 Split document into ${splitDocs.length} chunks`);
-    await this.store.addDocuments(library, normalizedVersion, splitDocs);
-  }
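A hypothetical call shape for addDocument (the docService instance and field values are illustrative; per the checks above, a non-empty metadata.url and non-empty pageContent are the hard requirements):

await docService.addDocument("react", "18.2.0", {
  pageContent: "# Hooks\n\nuseEffect lets you synchronize with external systems...",
  metadata: {
    url: "https://react.dev/reference/react/useEffect", // must be a non-empty string
    title: "useEffect",
  },
});
// Each stored chunk inherits the document metadata plus the splitter's
// section info, i.e. { ...metadata, level, path }.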
-  /**
-   * Searches for documentation content across versions.
-   * Uses hybrid search (vector + FTS).
-   * If version is omitted, searches documents without a specific version.
-   */
-  async searchStore(library, version, query, limit = 5) {
-    const normalizedVersion = this.normalizeVersion(version);
-    return this.documentRetriever.search(library, normalizedVersion, query, limit);
-  }
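Hypothetical usage sketch (instance name illustrative); per the comment above, omitting the version searches documents stored without one:

const results = await docService.searchStore("react", "18.2.0", "useEffect cleanup", 5);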
-  async listLibraries() {
-    const libraryMap = await this.store.queryLibraryVersions();
-    return Array.from(libraryMap.entries()).map(([library, versions]) => ({
-      library,
-      versions
-      // The versions array already contains LibraryVersionDetails
-    }));
-  }
- }
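Shape of the listLibraries result, following the mapping above (library name hypothetical):

const libraries = await docService.listLibraries();
// [
//   { library: "react", versions: [/* LibraryVersionDetails */] },
//   ...
// ]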
- export {
-   DocumentManagementService as D,
-   FileFetcher as F,
-   HttpFetcher as H,
-   LibraryNotFoundError as L,
-   MarkdownPipeline as M,
-   PipelineJobStatus as P,
-   SearchTool as S,
-   ToolError as T,
-   VersionNotFoundError as V,
-   PipelineManager as a,
-   DEFAULT_MAX_DEPTH$1 as b,
-   DEFAULT_MAX_PAGES$1 as c,
-   LogLevel as d,
-   ScrapeTool as e,
-   ListLibrariesTool as f,
-   DEFAULT_PROTOCOL as g,
-   DEFAULT_HTTP_PORT as h,
-   DEFAULT_MAX_CONCURRENCY as i,
-   ScrapeMode as j,
-   HtmlPipeline as k,
-   logger as l,
-   ScraperError as m,
-   createJSDOM as n,
-   getProjectRoot as o,
-   DEFAULT_WEB_PORT as p,
-   DimensionError as q,
-   VECTOR_DIMENSION as r,
-   setLogLevel as s
- };
- //# sourceMappingURL=DocumentManagementService-BGW9iWNn.js.map