@arabold/docs-mcp-server 1.9.0 → 1.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -100,11 +100,6 @@ var require_extend = __commonJS({
100
100
  }
101
101
  });
102
102
 
103
- // src/config.ts
104
- var DEFAULT_MAX_PAGES = 1e3;
105
- var DEFAULT_MAX_DEPTH = 3;
106
- var DEFAULT_MAX_CONCURRENCY = 3;
107
-
108
103
  // src/utils/logger.ts
109
104
  var currentLogLevel = 2 /* INFO */;
110
105
  function setLogLevel(level) {
@@ -292,215 +287,13 @@ var FileFetcher = class {
292
287
  }
293
288
  };
294
289
 
295
- // src/scraper/processor/HtmlProcessor.ts
296
- import createDOMPurify from "dompurify";
297
- import { JSDOM } from "jsdom";
298
- import TurndownService from "turndown";
299
- var HtmlProcessor = class {
300
- turndownService;
301
- options;
302
- selectorsToRemove = [
303
- "nav",
304
- "footer",
305
- "script",
306
- "style",
307
- "noscript",
308
- "svg",
309
- "link",
310
- "meta",
311
- "iframe",
312
- "header",
313
- "button",
314
- "input",
315
- "textarea",
316
- "select",
317
- // "form", // Known issue: Some pages use alerts for important content
318
- ".ads",
319
- ".advertisement",
320
- ".banner",
321
- ".cookie-banner",
322
- ".cookie-consent",
323
- ".hidden",
324
- ".hide",
325
- ".modal",
326
- ".nav-bar",
327
- ".overlay",
328
- ".popup",
329
- ".promo",
330
- ".mw-editsection",
331
- ".side-bar",
332
- ".social-share",
333
- ".sticky",
334
- "#ads",
335
- "#banner",
336
- "#cookieBanner",
337
- "#modal",
338
- "#nav",
339
- "#overlay",
340
- "#popup",
341
- "#sidebar",
342
- "#socialMediaBox",
343
- "#stickyHeader",
344
- "#ad-container",
345
- ".ad-container",
346
- ".login-form",
347
- ".signup-form",
348
- ".tooltip",
349
- ".dropdown-menu",
350
- // ".alert", // Known issue: Some pages use alerts for important content
351
- ".breadcrumb",
352
- ".pagination",
353
- // '[role="alert"]', // Known issue: Some pages use alerts for important content
354
- '[role="banner"]',
355
- '[role="dialog"]',
356
- '[role="alertdialog"]',
357
- '[role="region"][aria-label*="skip" i]',
358
- '[aria-modal="true"]',
359
- ".noprint"
360
- ];
361
- constructor(options) {
362
- this.turndownService = new TurndownService({
363
- headingStyle: "atx",
364
- hr: "---",
365
- bulletListMarker: "-",
366
- codeBlockStyle: "fenced",
367
- emDelimiter: "_",
368
- strongDelimiter: "**",
369
- linkStyle: "inlined"
370
- });
371
- this.turndownService.addRule("pre", {
372
- filter: ["pre"],
373
- replacement: (content3, node2) => {
374
- const element = node2;
375
- let language = element.getAttribute("data-language") || "";
376
- if (!language) {
377
- const highlightElement = element.closest(
378
- '[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]'
379
- );
380
- if (highlightElement) {
381
- const className = highlightElement.className;
382
- const match = className.match(
383
- /(?:highlight-source-|highlight-|language-)(\w+)/
384
- );
385
- if (match) {
386
- language = match[1];
387
- }
388
- }
389
- }
390
- const text3 = (() => {
391
- const clone = element.cloneNode(true);
392
- const brElements = Array.from(clone.querySelectorAll("br"));
393
- for (const br of brElements) {
394
- br.replaceWith("\n");
395
- }
396
- return clone.textContent;
397
- })();
398
- return `
399
- \`\`\`${language}
400
- ${text3}
401
- \`\`\`
402
- `;
403
- }
404
- });
405
- this.turndownService.addRule("table", {
406
- filter: ["table"],
407
- replacement: (content3) => {
408
- const cleanedContent = content3.replace(/\n+/g, "\n");
409
- return `
410
-
411
- ${cleanedContent}
412
-
413
- `;
414
- }
415
- });
416
- this.options = options || {};
417
- }
418
- canProcess(content3) {
419
- return content3.mimeType.startsWith("text/html");
420
- }
421
- async process(content3) {
422
- if (!this.canProcess(content3)) {
423
- throw new ScraperError(
424
- `HtmlProcessor cannot process content of type ${content3.mimeType}`,
425
- false
426
- );
427
- }
428
- const htmlContent = typeof content3.content === "string" ? content3.content : content3.content.toString(content3.encoding || "utf-8");
429
- const window = new JSDOM(htmlContent, { url: content3.source }).window;
430
- const title = window.document.title || "Untitled";
431
- const purify = createDOMPurify(window);
432
- const purifiedContent = purify.sanitize(htmlContent, {
433
- WHOLE_DOCUMENT: true,
434
- RETURN_DOM: true
435
- });
436
- const linkElements = purifiedContent.querySelectorAll("a[href]");
437
- let links = [];
438
- if (this.options.extractLinks !== false) {
439
- links = Array.from(linkElements).map((el) => el.getAttribute("href")).filter((href) => href !== null).map((href) => {
440
- try {
441
- return new URL(href, content3.source).href;
442
- } catch {
443
- return null;
444
- }
445
- }).filter((url) => url !== null);
446
- }
447
- const selectorsToRemove = [
448
- ...this.options.excludeSelectors || [],
449
- ...this.selectorsToRemove
450
- ];
451
- for (const selector of selectorsToRemove) {
452
- const elements = purifiedContent.querySelectorAll(selector);
453
- for (const el of elements) {
454
- el.remove();
455
- }
456
- }
457
- const cleanedContent = purifiedContent.innerHTML;
458
- const markdown = this.turndownService.turndown(cleanedContent || "").trim();
459
- if (!markdown) {
460
- throw new ScraperError("No valid content found", false);
461
- }
462
- return {
463
- content: markdown,
464
- title,
465
- source: content3.source,
466
- links,
467
- metadata: {}
468
- };
469
- }
470
- };
471
-
472
- // src/scraper/processor/MarkdownProcessor.ts
473
- var MarkdownProcessor = class {
474
- canProcess(content3) {
475
- return content3.mimeType === "text/markdown" || content3.mimeType === "text/plain" || // Treat plain text as markdown
476
- content3.source.endsWith(".md");
477
- }
478
- async process(content3) {
479
- if (!this.canProcess(content3)) {
480
- throw new ScraperError(
481
- `MarkdownProcessor cannot process content of type ${content3.mimeType}`,
482
- false
483
- );
484
- }
485
- const markdownContent = typeof content3.content === "string" ? content3.content : content3.content.toString(content3.encoding || "utf-8");
486
- if (!markdownContent.trim()) {
487
- throw new ScraperError("Empty Markdown content", false);
488
- }
489
- const title = this.extractTitle(markdownContent) || "Untitled";
490
- return {
491
- content: markdownContent,
492
- title,
493
- source: content3.source,
494
- links: [],
495
- // TODO: Extract links from Markdown
496
- metadata: {}
497
- };
498
- }
499
- extractTitle(markdown) {
500
- const match = markdown.match(/^#\s+(.*)$/m);
501
- return match ? match[1].trim() : null;
502
- }
503
- };
290
+ // src/scraper/types.ts
291
+ var ScrapeMode = /* @__PURE__ */ ((ScrapeMode2) => {
292
+ ScrapeMode2["Fetch"] = "fetch";
293
+ ScrapeMode2["Playwright"] = "playwright";
294
+ ScrapeMode2["Auto"] = "auto";
295
+ return ScrapeMode2;
296
+ })(ScrapeMode || {});
504
297
 
505
298
  // node_modules/uuid/dist/esm-node/stringify.js
506
299
  var byteToHex = [];
@@ -609,6 +402,552 @@ function isSubpath(baseUrl, targetUrl) {
609
402
  return targetUrl.pathname.startsWith(basePath);
610
403
  }
611
404
 
405
+ // src/scraper/middleware/ContentProcessorPipeline.ts
406
+ var ContentProcessingPipeline = class {
407
+ middleware;
408
+ /**
409
+ * Creates an instance of ContentProcessingPipeline.
410
+ * @param middleware An array of middleware instances to execute in order.
411
+ */
412
+ constructor(middleware) {
413
+ this.middleware = middleware;
414
+ }
415
+ /**
416
+ * Executes the middleware pipeline with the given initial context.
417
+ * @param initialContext The starting context for the pipeline.
418
+ * @returns A promise that resolves with the final context after all middleware have executed.
419
+ */
420
+ async run(initialContext) {
421
+ let index2 = -1;
422
+ const dispatch = async (i) => {
423
+ if (i <= index2) {
424
+ throw new Error("next() called multiple times");
425
+ }
426
+ index2 = i;
427
+ const mw = this.middleware[i];
428
+ if (!mw) {
429
+ return;
430
+ }
431
+ const next = dispatch.bind(null, i + 1);
432
+ try {
433
+ await mw.process(initialContext, next);
434
+ } catch (error) {
435
+ initialContext.errors.push(
436
+ error instanceof Error ? error : new Error(String(error))
437
+ );
438
+ logger.warn(`Error in middleware pipeline: ${error}`);
439
+ }
440
+ };
441
+ await dispatch(0);
442
+ return initialContext;
443
+ }
444
+ };
445
+
446
+ // src/scraper/middleware/components/HtmlCheerioParserMiddleware.ts
447
+ import * as cheerio from "cheerio";
448
+ var HtmlCheerioParserMiddleware = class {
449
+ async process(context, next) {
450
+ if (!context.contentType.startsWith("text/html")) {
451
+ await next();
452
+ return;
453
+ }
454
+ const htmlString = typeof context.content === "string" ? context.content : Buffer.from(context.content).toString("utf-8");
455
+ try {
456
+ logger.debug(`Parsing HTML content with Cheerio from ${context.source}`);
457
+ const $ = cheerio.load(htmlString);
458
+ context.dom = $;
459
+ await next();
460
+ } catch (error) {
461
+ logger.error(`Failed to parse HTML with Cheerio for ${context.source}: ${error}`);
462
+ context.errors.push(
463
+ error instanceof Error ? error : new Error(`Cheerio HTML parsing failed: ${String(error)}`)
464
+ );
465
+ return;
466
+ }
467
+ }
468
+ };
469
+
470
+ // src/utils/dom.ts
471
+ import { JSDOM, VirtualConsole } from "jsdom";
472
+ function createJSDOM(html, options) {
473
+ const virtualConsole = new VirtualConsole();
474
+ virtualConsole.on("error", () => {
475
+ });
476
+ virtualConsole.on("warn", () => {
477
+ });
478
+ virtualConsole.on("info", () => {
479
+ });
480
+ virtualConsole.on("debug", () => {
481
+ });
482
+ virtualConsole.on("log", () => {
483
+ });
484
+ const defaultOptions = {
485
+ virtualConsole
486
+ };
487
+ const finalOptions = { ...defaultOptions, ...options };
488
+ return new JSDOM(html, finalOptions);
489
+ }
490
+
491
+ // src/scraper/middleware/components/HtmlLinkExtractorMiddleware.ts
492
+ var HtmlLinkExtractorMiddleware = class {
493
+ /**
494
+ * Processes the context to extract links from the sanitized HTML body.
495
+ * @param context The current processing context.
496
+ * @param next Function to call the next middleware.
497
+ */
498
+ async process(context, next) {
499
+ const $ = context.dom;
500
+ if (!$) {
501
+ if (context.contentType.startsWith("text/html")) {
502
+ logger.warn(
503
+ `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware runs before this.`
504
+ );
505
+ }
506
+ await next();
507
+ return;
508
+ }
509
+ try {
510
+ const linkElements = $("a[href]");
511
+ logger.debug(`Found ${linkElements.length} potential links in ${context.source}`);
512
+ const extractedLinks = [];
513
+ linkElements.each((index2, element) => {
514
+ const href = $(element).attr("href");
515
+ if (href && href.trim() !== "") {
516
+ try {
517
+ const urlObj = new URL(href, context.source);
518
+ if (!["http:", "https:", "file:"].includes(urlObj.protocol)) {
519
+ logger.debug(`Ignoring link with invalid protocol: ${href}`);
520
+ return;
521
+ }
522
+ extractedLinks.push(urlObj.href);
523
+ } catch (e) {
524
+ logger.debug(`Ignoring invalid URL syntax: ${href}`);
525
+ }
526
+ }
527
+ });
528
+ context.links = [...new Set(extractedLinks)];
529
+ logger.debug(
530
+ `Extracted ${context.links.length} unique, valid links from ${context.source}`
531
+ );
532
+ } catch (error) {
533
+ logger.error(`Error extracting links from ${context.source}: ${error}`);
534
+ context.errors.push(
535
+ new Error(
536
+ `Failed to extract links from HTML: ${error instanceof Error ? error.message : String(error)}`
537
+ )
538
+ );
539
+ }
540
+ await next();
541
+ }
542
+ };
543
+
544
+ // src/scraper/middleware/components/HtmlMetadataExtractorMiddleware.ts
545
+ var HtmlMetadataExtractorMiddleware = class {
546
+ /**
547
+ * Processes the context to extract the HTML title.
548
+ * @param context The current processing context.
549
+ * @param next Function to call the next middleware.
550
+ */
551
+ async process(context, next) {
552
+ const $ = context.dom;
553
+ if (!$) {
554
+ if (context.contentType.startsWith("text/html")) {
555
+ logger.warn(
556
+ `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware runs before this.`
557
+ );
558
+ }
559
+ await next();
560
+ return;
561
+ }
562
+ try {
563
+ let title = $("title").first().text().trim();
564
+ if (!title) {
565
+ title = $("h1").first().text().trim();
566
+ }
567
+ title = title || "Untitled";
568
+ title = title.replace(/\s+/g, " ").trim();
569
+ context.metadata.title = title;
570
+ logger.debug(`Extracted title: "${title}" from ${context.source}`);
571
+ } catch (error) {
572
+ logger.error(`Error extracting metadata from ${context.source}: ${error}`);
573
+ context.errors.push(
574
+ new Error(
575
+ `Failed to extract metadata from HTML: ${error instanceof Error ? error.message : String(error)}`
576
+ )
577
+ );
578
+ }
579
+ await next();
580
+ }
581
+ };
582
+
583
+ // src/scraper/middleware/components/HtmlPlaywrightMiddleware.ts
584
+ import { chromium } from "playwright";
585
+ var HtmlPlaywrightMiddleware = class {
586
+ browser = null;
587
+ /**
588
+ * Initializes the Playwright browser instance.
589
+ * Consider making this more robust (e.g., lazy initialization, singleton).
590
+ */
591
+ async ensureBrowser() {
592
+ if (!this.browser || !this.browser.isConnected()) {
593
+ const launchArgs = process.env.PLAYWRIGHT_LAUNCH_ARGS?.split(" ") ?? [];
594
+ logger.debug(
595
+ `Launching new Playwright browser instance (Chromium) with args: ${launchArgs.join(" ") || "none"}...`
596
+ );
597
+ this.browser = await chromium.launch({ channel: "chromium", args: launchArgs });
598
+ this.browser.on("disconnected", () => {
599
+ logger.debug("Playwright browser instance disconnected.");
600
+ this.browser = null;
601
+ });
602
+ }
603
+ return this.browser;
604
+ }
605
+ /**
606
+ * Closes the Playwright browser instance if it exists.
607
+ * Should be called during application shutdown.
608
+ */
609
+ async closeBrowser() {
610
+ if (this.browser?.isConnected()) {
611
+ logger.debug("Closing Playwright browser instance...");
612
+ await this.browser.close();
613
+ this.browser = null;
614
+ }
615
+ }
616
+ async process(context, next) {
617
+ if (!context.contentType.startsWith("text/html")) {
618
+ await next();
619
+ return;
620
+ }
621
+ const scrapeMode = context.options?.scrapeMode ?? "auto" /* Auto */;
622
+ const shouldRunPlaywright = scrapeMode === "playwright" /* Playwright */ || scrapeMode === "auto" /* Auto */;
623
+ if (!shouldRunPlaywright) {
624
+ logger.debug(
625
+ `Skipping Playwright rendering for ${context.source} as scrapeMode is '${scrapeMode}'.`
626
+ );
627
+ await next();
628
+ return;
629
+ }
630
+ logger.debug(
631
+ `Running Playwright rendering for ${context.source} (scrapeMode: '${scrapeMode}')`
632
+ );
633
+ let page = null;
634
+ let renderedHtml = null;
635
+ try {
636
+ const browser = await this.ensureBrowser();
637
+ page = await browser.newPage();
638
+ logger.debug(`Playwright: Processing ${context.source}`);
639
+ await page.route("**/*", (route) => {
640
+ if (route.request().url() === context.source) {
641
+ return route.fulfill({
642
+ status: 200,
643
+ contentType: context.contentType,
644
+ body: context.content
645
+ });
646
+ }
647
+ const resourceType = route.request().resourceType();
648
+ if (["image", "stylesheet", "font", "media"].includes(resourceType)) {
649
+ return route.abort();
650
+ }
651
+ return route.continue();
652
+ });
653
+ await page.goto(context.source, {
654
+ waitUntil: "load"
655
+ });
656
+ renderedHtml = await page.content();
657
+ logger.debug(`Playwright: Successfully rendered content for ${context.source}`);
658
+ } catch (error) {
659
+ logger.error(`Playwright failed to render ${context.source}: ${error}`);
660
+ context.errors.push(
661
+ error instanceof Error ? error : new Error(`Playwright rendering failed: ${String(error)}`)
662
+ );
663
+ } finally {
664
+ if (page) {
665
+ await page.unroute("**/*");
666
+ await page.close();
667
+ }
668
+ }
669
+ if (renderedHtml !== null) {
670
+ context.content = renderedHtml;
671
+ logger.debug(
672
+ `Playwright middleware updated content for ${context.source}. Proceeding.`
673
+ );
674
+ } else {
675
+ logger.warn(
676
+ `Playwright rendering resulted in null content for ${context.source}. Proceeding without content update.`
677
+ );
678
+ }
679
+ await next();
680
+ }
681
+ };
682
+
683
+ // src/scraper/middleware/components/HtmlSanitizerMiddleware.ts
684
+ var HtmlSanitizerMiddleware = class {
685
+ // Default selectors to remove
686
+ defaultSelectorsToRemove = [
687
+ "nav",
688
+ "footer",
689
+ "script",
690
+ "style",
691
+ "noscript",
692
+ "svg",
693
+ "link",
694
+ "meta",
695
+ "iframe",
696
+ "header",
697
+ "button",
698
+ "input",
699
+ "textarea",
700
+ "select",
701
+ // "form", // Keep commented
702
+ ".ads",
703
+ ".advertisement",
704
+ ".banner",
705
+ ".cookie-banner",
706
+ ".cookie-consent",
707
+ ".hidden",
708
+ ".hide",
709
+ ".modal",
710
+ ".nav-bar",
711
+ ".overlay",
712
+ ".popup",
713
+ ".promo",
714
+ ".mw-editsection",
715
+ ".side-bar",
716
+ ".social-share",
717
+ ".sticky",
718
+ "#ads",
719
+ "#banner",
720
+ "#cookieBanner",
721
+ "#modal",
722
+ "#nav",
723
+ "#overlay",
724
+ "#popup",
725
+ "#sidebar",
726
+ "#socialMediaBox",
727
+ "#stickyHeader",
728
+ "#ad-container",
729
+ ".ad-container",
730
+ ".login-form",
731
+ ".signup-form",
732
+ ".tooltip",
733
+ ".dropdown-menu",
734
+ // ".alert", // Keep commented
735
+ ".breadcrumb",
736
+ ".pagination",
737
+ // '[role="alert"]', // Keep commented
738
+ '[role="banner"]',
739
+ '[role="dialog"]',
740
+ '[role="alertdialog"]',
741
+ '[role="region"][aria-label*="skip" i]',
742
+ '[aria-modal="true"]',
743
+ ".noprint"
744
+ ];
745
+ async process(context, next) {
746
+ const $ = context.dom;
747
+ if (!$) {
748
+ if (context.contentType.startsWith("text/html")) {
749
+ logger.warn(
750
+ `Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`
751
+ );
752
+ }
753
+ await next();
754
+ return;
755
+ }
756
+ try {
757
+ const selectorsToRemove = [
758
+ ...context.options.excludeSelectors || [],
759
+ // Use options from the context
760
+ ...this.defaultSelectorsToRemove
761
+ ];
762
+ logger.debug(
763
+ `Removing elements matching ${selectorsToRemove.length} selectors for ${context.source}`
764
+ );
765
+ let removedCount = 0;
766
+ for (const selector of selectorsToRemove) {
767
+ try {
768
+ const elements = $(selector);
769
+ const count = elements.length;
770
+ if (count > 0) {
771
+ elements.remove();
772
+ removedCount += count;
773
+ }
774
+ } catch (selectorError) {
775
+ logger.warn(
776
+ `Potentially invalid selector "${selector}" during element removal: ${selectorError}`
777
+ );
778
+ context.errors.push(
779
+ new Error(`Invalid selector "${selector}": ${selectorError}`)
780
+ );
781
+ }
782
+ }
783
+ logger.debug(`Removed ${removedCount} elements for ${context.source}`);
784
+ } catch (error) {
785
+ logger.error(`Error during HTML element removal for ${context.source}: ${error}`);
786
+ context.errors.push(
787
+ error instanceof Error ? error : new Error(`HTML element removal failed: ${String(error)}`)
788
+ );
789
+ }
790
+ await next();
791
+ }
792
+ };
793
+
794
+ // src/scraper/middleware/components/HtmlToMarkdownMiddleware.ts
795
+ import { gfm } from "@joplin/turndown-plugin-gfm";
796
+ import TurndownService from "turndown";
797
+ var HtmlToMarkdownMiddleware = class {
798
+ turndownService;
799
+ constructor() {
800
+ this.turndownService = new TurndownService({
801
+ headingStyle: "atx",
802
+ hr: "---",
803
+ bulletListMarker: "-",
804
+ codeBlockStyle: "fenced",
805
+ emDelimiter: "_",
806
+ strongDelimiter: "**",
807
+ linkStyle: "inlined"
808
+ });
809
+ this.turndownService.use(gfm);
810
+ this.addCustomRules();
811
+ }
812
+ addCustomRules() {
813
+ this.turndownService.addRule("pre", {
814
+ filter: ["pre"],
815
+ replacement: (content3, node2) => {
816
+ const element = node2;
817
+ let language = element.getAttribute("data-language") || "";
818
+ if (!language) {
819
+ const highlightElement = element.closest(
820
+ '[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]'
821
+ ) || element.querySelector(
822
+ '[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]'
823
+ );
824
+ if (highlightElement) {
825
+ const className = highlightElement.className;
826
+ const match = className.match(
827
+ /(?:highlight-source-|highlight-|language-)(\w+)/
828
+ );
829
+ if (match) language = match[1];
830
+ }
831
+ }
832
+ const brElements = Array.from(element.querySelectorAll("br"));
833
+ for (const br of brElements) {
834
+ br.replaceWith("\n");
835
+ }
836
+ const text3 = element.textContent || "";
837
+ return `
838
+ \`\`\`${language}
839
+ ${text3.replace(/^\n+|\n+$/g, "")}
840
+ \`\`\`
841
+ `;
842
+ }
843
+ });
844
+ this.turndownService.addRule("anchor", {
845
+ filter: ["a"],
846
+ replacement: (content3, node2) => {
847
+ const href = node2.getAttribute("href");
848
+ if (!content3 || content3 === "#") {
849
+ return "";
850
+ }
851
+ if (!href) {
852
+ return content3;
853
+ }
854
+ return `[${content3}](${href})`;
855
+ }
856
+ });
857
+ }
858
+ /**
859
+ * Processes the context to convert the sanitized HTML body node to Markdown.
860
+ * @param context The current processing context.
861
+ * @param next Function to call the next middleware.
862
+ */
863
+ async process(context, next) {
864
+ const $ = context.dom;
865
+ if (!$) {
866
+ if (context.contentType.startsWith("text/html")) {
867
+ logger.warn(
868
+ `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware ran correctly.`
869
+ );
870
+ }
871
+ await next();
872
+ return;
873
+ }
874
+ try {
875
+ logger.debug(`Converting HTML content to Markdown for ${context.source}`);
876
+ const htmlToConvert = $("body").html() || $.html();
877
+ const markdown = this.turndownService.turndown(htmlToConvert).trim();
878
+ if (!markdown) {
879
+ const warnMsg = `HTML to Markdown conversion resulted in empty content for ${context.source}.`;
880
+ logger.warn(warnMsg);
881
+ context.content = "";
882
+ context.contentType = "text/markdown";
883
+ } else {
884
+ context.content = markdown;
885
+ context.contentType = "text/markdown";
886
+ logger.debug(`Successfully converted HTML to Markdown for ${context.source}`);
887
+ }
888
+ } catch (error) {
889
+ logger.error(`Error converting HTML to Markdown for ${context.source}: ${error}`);
890
+ context.errors.push(
891
+ new Error(
892
+ `Failed to convert HTML to Markdown: ${error instanceof Error ? error.message : String(error)}`
893
+ )
894
+ );
895
+ }
896
+ await next();
897
+ }
898
+ };
899
+
900
+ // src/scraper/middleware/components/MarkdownLinkExtractorMiddleware.ts
901
+ var MarkdownLinkExtractorMiddleware = class {
902
+ /**
903
+ * Processes the context. Currently a no-op regarding link extraction.
904
+ * @param context The current processing context.
905
+ * @param next Function to call the next middleware.
906
+ */
907
+ async process(context, next) {
908
+ if (context.contentType === "text/markdown") {
909
+ if (!Array.isArray(context.links)) {
910
+ context.links = [];
911
+ }
912
+ }
913
+ await next();
914
+ }
915
+ };
916
+
917
+ // src/scraper/middleware/components/MarkdownMetadataExtractorMiddleware.ts
918
+ var MarkdownMetadataExtractorMiddleware = class {
919
+ /**
920
+ * Processes the context to extract the title from Markdown.
921
+ * @param context The current processing context.
922
+ * @param next Function to call the next middleware.
923
+ */
924
+ async process(context, next) {
925
+ if (context.contentType === "text/markdown" || context.contentType === "text/plain") {
926
+ try {
927
+ const textContent = typeof context.content === "string" ? context.content : Buffer.from(context.content).toString("utf-8");
928
+ if (typeof context.content !== "string") {
929
+ context.content = textContent;
930
+ }
931
+ let title = "Untitled";
932
+ if (context.contentType === "text/markdown") {
933
+ const match = textContent.match(/^#\s+(.*)$/m);
934
+ if (match?.[1]) {
935
+ title = match[1].trim();
936
+ }
937
+ }
938
+ context.metadata.title = title;
939
+ } catch (error) {
940
+ context.errors.push(
941
+ new Error(
942
+ `Failed to extract metadata from Markdown: ${error instanceof Error ? error.message : String(error)}`
943
+ )
944
+ );
945
+ }
946
+ }
947
+ await next();
948
+ }
949
+ };
950
+
612
951
  // src/scraper/strategies/BaseScraperStrategy.ts
613
952
  import { URL as URL2 } from "node:url";
614
953
 
@@ -633,8 +972,8 @@ var CancellationError = class extends PipelineError {
633
972
  };
634
973
 
635
974
  // src/scraper/strategies/BaseScraperStrategy.ts
636
- var DEFAULT_MAX_PAGES2 = 100;
637
- var DEFAULT_MAX_DEPTH2 = 3;
975
+ var DEFAULT_MAX_PAGES = 100;
976
+ var DEFAULT_MAX_DEPTH = 3;
638
977
  var DEFAULT_CONCURRENCY = 3;
639
978
  var BaseScraperStrategy = class {
640
979
  visited = /* @__PURE__ */ new Set();
@@ -643,19 +982,14 @@ var BaseScraperStrategy = class {
643
982
  constructor(options = {}) {
644
983
  this.options = options;
645
984
  }
646
- getProcessor(mimeType) {
647
- if (mimeType.startsWith("text/html")) {
648
- return new HtmlProcessor();
649
- }
650
- return new MarkdownProcessor();
651
- }
985
+ // Removed getProcessor method as processing is now handled by strategies using middleware pipelines
652
986
  async processBatch(batch, baseUrl, options, progressCallback, signal) {
653
987
  const results = await Promise.all(
654
988
  batch.map(async (item) => {
655
989
  if (signal?.aborted) {
656
990
  throw new CancellationError("Scraping cancelled during batch processing");
657
991
  }
658
- const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH2;
992
+ const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH;
659
993
  if (item.depth > maxDepth) {
660
994
  return [];
661
995
  }
@@ -663,7 +997,7 @@ var BaseScraperStrategy = class {
663
997
  const result = await this.processItem(item, options, void 0, signal);
664
998
  if (result.document) {
665
999
  this.pageCount++;
666
- const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES2;
1000
+ const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
667
1001
  logger.info(
668
1002
  `\u{1F310} Scraping page ${this.pageCount}/${maxPages} (depth ${item.depth}/${maxDepth}): ${item.url}`
669
1003
  );
@@ -715,7 +1049,7 @@ var BaseScraperStrategy = class {
715
1049
  const baseUrl = new URL2(options.url);
716
1050
  const queue = [{ url: options.url, depth: 0 }];
717
1051
  this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions));
718
- const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES2;
1052
+ const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
719
1053
  const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY;
720
1054
  while (queue.length > 0 && this.pageCount < maxPages) {
721
1055
  if (signal?.aborted) {
@@ -749,9 +1083,12 @@ var BaseScraperStrategy = class {
749
1083
  var WebScraperStrategy = class extends BaseScraperStrategy {
750
1084
  httpFetcher = new HttpFetcher();
751
1085
  shouldFollowLinkFn;
1086
+ playwrightMiddleware;
1087
+ // Add member
752
1088
  constructor(options = {}) {
753
1089
  super({ urlNormalizerOptions: options.urlNormalizerOptions });
754
1090
  this.shouldFollowLinkFn = options.shouldFollowLink;
1091
+ this.playwrightMiddleware = new HtmlPlaywrightMiddleware();
755
1092
  }
756
1093
  canHandle(url) {
757
1094
  try {
@@ -785,12 +1122,56 @@ var WebScraperStrategy = class extends BaseScraperStrategy {
785
1122
  followRedirects: options.followRedirects
786
1123
  };
787
1124
  const rawContent = await this.httpFetcher.fetch(url, fetchOptions);
788
- const processor = this.getProcessor(rawContent.mimeType);
789
- const result = await processor.process(rawContent);
1125
+ const initialContext = {
1126
+ content: rawContent.content,
1127
+ contentType: rawContent.mimeType,
1128
+ source: rawContent.source,
1129
+ // Use the final source URL after redirects
1130
+ metadata: {},
1131
+ links: [],
1132
+ errors: [],
1133
+ options,
1134
+ fetcher: this.httpFetcher
1135
+ };
1136
+ let pipeline;
1137
+ if (initialContext.contentType.startsWith("text/html")) {
1138
+ const htmlPipelineSteps = [
1139
+ this.playwrightMiddleware,
1140
+ // Use the instance member
1141
+ // TODO: Add HtmlJsExecutorMiddleware here if needed based on options
1142
+ new HtmlCheerioParserMiddleware(),
1143
+ // Always runs after content is finalized
1144
+ new HtmlMetadataExtractorMiddleware(),
1145
+ new HtmlLinkExtractorMiddleware(),
1146
+ new HtmlSanitizerMiddleware(),
1147
+ // Element remover
1148
+ new HtmlToMarkdownMiddleware()
1149
+ ];
1150
+ pipeline = new ContentProcessingPipeline(htmlPipelineSteps);
1151
+ } else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain") {
1152
+ pipeline = new ContentProcessingPipeline([
1153
+ new MarkdownMetadataExtractorMiddleware(),
1154
+ new MarkdownLinkExtractorMiddleware()
1155
+ // Placeholder for now
1156
+ ]);
1157
+ } else {
1158
+ logger.warn(
1159
+ `Unsupported content type "${initialContext.contentType}" for URL ${url}. Skipping processing.`
1160
+ );
1161
+ return { document: void 0, links: [] };
1162
+ }
1163
+ const finalContext = await pipeline.run(initialContext);
1164
+ for (const err of finalContext.errors) {
1165
+ logger.warn(`Processing error for ${url}: ${err.message}`);
1166
+ }
1167
+ if (typeof finalContext.content !== "string" || !finalContext.content.trim()) {
1168
+ logger.warn(`No processable content found for ${url} after pipeline execution.`);
1169
+ return { document: void 0, links: finalContext.links };
1170
+ }
790
1171
  const baseUrl = new URL(options.url);
791
- const links = result.links.filter((link) => {
1172
+ const filteredLinks = finalContext.links.filter((link) => {
792
1173
  try {
793
- const targetUrl = new URL(link, baseUrl);
1174
+ const targetUrl = new URL(link);
794
1175
  const scope = options.scope || "subpages";
795
1176
  return this.isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
796
1177
  } catch {
@@ -799,21 +1180,37 @@ var WebScraperStrategy = class extends BaseScraperStrategy {
799
1180
  });
800
1181
  return {
801
1182
  document: {
802
- content: result.content,
1183
+ content: finalContext.content,
1184
+ // Final processed content (Markdown)
803
1185
  metadata: {
804
- url: result.source,
805
- title: result.title,
1186
+ url: finalContext.source,
1187
+ // URL after redirects
1188
+ // Ensure title is a string, default to "Untitled"
1189
+ title: typeof finalContext.metadata.title === "string" ? finalContext.metadata.title : "Untitled",
806
1190
  library: options.library,
807
1191
  version: options.version
1192
+ // Add other metadata from context if needed
808
1193
  }
809
1194
  },
810
- links
1195
+ links: filteredLinks
1196
+ // Use the filtered links
811
1197
  };
812
1198
  } catch (error) {
813
- logger.error(`Failed to scrape page ${url}: ${error}`);
1199
+ logger.error(`Failed processing page ${url}: ${error}`);
814
1200
  throw error;
815
1201
  }
816
1202
  }
1203
+ /**
1204
+ * Overrides the base scrape method to ensure the Playwright browser is closed
1205
+ * after the scraping process completes or errors out.
1206
+ */
1207
+ async scrape(options, progressCallback, signal) {
1208
+ try {
1209
+ await super.scrape(options, progressCallback, signal);
1210
+ } finally {
1211
+ await this.playwrightMiddleware.closeBrowser();
1212
+ }
1213
+ }
817
1214
  };
818
1215
 
819
1216
  // src/scraper/strategies/GitHubScraperStrategy.ts
@@ -883,18 +1280,58 @@ var LocalFileStrategy = class extends BaseScraperStrategy {
883
1280
  }
884
1281
  logger.info(`\u{1F4C4} Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`);
885
1282
  const rawContent = await this.fileFetcher.fetch(item.url);
886
- const processor = this.getProcessor(rawContent.mimeType);
887
- const result = await processor.process(rawContent);
1283
+ const initialContext = {
1284
+ content: rawContent.content,
1285
+ contentType: rawContent.mimeType,
1286
+ source: rawContent.source,
1287
+ // file:// URL
1288
+ metadata: {},
1289
+ links: [],
1290
+ // LocalFileStrategy doesn't extract links from file content itself
1291
+ errors: [],
1292
+ options
1293
+ // Pass the full options object
1294
+ };
1295
+ let pipeline;
1296
+ if (initialContext.contentType.startsWith("text/html")) {
1297
+ pipeline = new ContentProcessingPipeline([
1298
+ new HtmlCheerioParserMiddleware(),
1299
+ new HtmlMetadataExtractorMiddleware(),
1300
+ // No HtmlLinkExtractorMiddleware needed for local files
1301
+ new HtmlSanitizerMiddleware(),
1302
+ new HtmlToMarkdownMiddleware()
1303
+ ]);
1304
+ } else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain" || // Treat plain text as markdown
1305
+ initialContext.contentType.startsWith("text/")) {
1306
+ pipeline = new ContentProcessingPipeline([
1307
+ new MarkdownMetadataExtractorMiddleware()
1308
+ // No MarkdownLinkExtractorMiddleware needed for local files
1309
+ ]);
1310
+ } else {
1311
+ logger.warn(
1312
+ `Unsupported content type "${initialContext.contentType}" for file ${filePath}. Skipping processing.`
1313
+ );
1314
+ return { document: void 0, links: [] };
1315
+ }
1316
+ const finalContext = await pipeline.run(initialContext);
1317
+ for (const err of finalContext.errors) {
1318
+ logger.warn(`Processing error for ${filePath}: ${err.message}`);
1319
+ }
1320
+ const finalContentString = typeof finalContext.content === "string" ? finalContext.content : Buffer.from(finalContext.content).toString("utf-8");
888
1321
  return {
889
1322
  document: {
890
- content: result.content,
1323
+ // Use the potentially empty string content
1324
+ content: finalContentString,
891
1325
  metadata: {
892
- url: item.url,
893
- title: result.title,
1326
+ url: finalContext.source,
1327
+ // Use context source (file:// URL)
1328
+ // Ensure title is a string, default to "Untitled"
1329
+ title: typeof finalContext.metadata.title === "string" ? finalContext.metadata.title : "Untitled",
894
1330
  library: options.library,
895
1331
  version: options.version
896
1332
  }
897
1333
  }
1334
+ // No links returned from file content processing
898
1335
  };
899
1336
  }
900
1337
  async scrape(options, progressCallback, signal) {
@@ -1007,7 +1444,7 @@ var PipelineWorker = class {
1007
1444
  async executeJob(job, callbacks) {
1008
1445
  const { id: jobId, library, version, options, abortController } = job;
1009
1446
  const signal = abortController.signal;
1010
- logger.info(`[${jobId}] Worker starting job for ${library}@${version}`);
1447
+ logger.debug(`[${jobId}] Worker starting job for ${library}@${version}`);
1011
1448
  try {
1012
1449
  await this.scraperService.scrape(
1013
1450
  options,
@@ -1327,14 +1764,13 @@ var LibraryNotFoundError = class extends ToolError {
1327
1764
 
1328
1765
  // src/tools/FetchUrlTool.ts
1329
1766
  var FetchUrlTool = class {
1330
- constructor(httpFetcher, fileFetcher, processor) {
1331
- this.processor = processor;
1332
- this.fetchers = [httpFetcher, fileFetcher];
1333
- }
1334
1767
  /**
1335
1768
  * Collection of fetchers that will be tried in order for a given URL.
1336
1769
  */
1337
1770
  fetchers;
1771
+ constructor(httpFetcher, fileFetcher) {
1772
+ this.fetchers = [httpFetcher, fileFetcher];
1773
+ }
1338
1774
  /**
1339
1775
  * Fetches content from a URL and converts it to Markdown.
1340
1776
  * Supports both HTTP/HTTPS URLs and local file URLs (file://).
@@ -1342,7 +1778,7 @@ var FetchUrlTool = class {
1342
1778
  * @throws {ToolError} If fetching or processing fails
1343
1779
  */
1344
1780
  async execute(options) {
1345
- const { url } = options;
1781
+ const { url, scrapeMode = "auto" /* Auto */ } = options;
1346
1782
  const canFetchResults = this.fetchers.map((f) => f.canFetch(url));
1347
1783
  const fetcherIndex = canFetchResults.findIndex((result) => result === true);
1348
1784
  if (fetcherIndex === -1) {
@@ -1352,18 +1788,88 @@ var FetchUrlTool = class {
1352
1788
  );
1353
1789
  }
1354
1790
  const fetcher = this.fetchers[fetcherIndex];
1791
+ const playwrightMiddleware = new HtmlPlaywrightMiddleware();
1355
1792
  try {
1356
1793
  logger.info(`\u{1F4E1} Fetching ${url}...`);
1357
1794
  const rawContent = await fetcher.fetch(url, {
1358
1795
  followRedirects: options.followRedirects ?? true,
1359
1796
  maxRetries: 3
1797
+ // Keep retries for fetching
1360
1798
  });
1361
- logger.info("\u{1F504} Converting to Markdown...");
1362
- const processed = await this.processor.process(rawContent);
1363
- logger.info(`\u2705 Successfully converted ${url} to Markdown`);
1364
- return processed.content;
1799
+ logger.info("\u{1F504} Processing content...");
1800
+ const initialContext = {
1801
+ content: rawContent.content,
1802
+ contentType: rawContent.mimeType,
1803
+ source: rawContent.source,
1804
+ metadata: {},
1805
+ links: [],
1806
+ // Links not needed for this tool's output
1807
+ errors: [],
1808
+ fetcher,
1809
+ // Create a minimal ScraperOptions object for the context
1810
+ options: {
1811
+ url,
1812
+ // Use the input URL
1813
+ library: "",
1814
+ // Not applicable for this tool
1815
+ version: "",
1816
+ // Use empty string instead of undefined
1817
+ // Default other options as needed by middleware
1818
+ maxDepth: 0,
1819
+ maxPages: 1,
1820
+ maxConcurrency: 1,
1821
+ scope: "subpages",
1822
+ // Default, though not used for single page fetch
1823
+ followRedirects: options.followRedirects ?? true,
1824
+ excludeSelectors: void 0,
1825
+ // Not currently configurable via this tool
1826
+ ignoreErrors: false,
1827
+ scrapeMode
1828
+ // Pass the scrapeMode
1829
+ }
1830
+ };
1831
+ let pipeline;
1832
+ if (initialContext.contentType.startsWith("text/html")) {
1833
+ const htmlPipelineSteps = [
1834
+ playwrightMiddleware,
1835
+ // Use the instantiated middleware
1836
+ new HtmlCheerioParserMiddleware(),
1837
+ // Always runs after content is finalized
1838
+ new HtmlMetadataExtractorMiddleware(),
1839
+ // Keep for potential future use
1840
+ // No Link Extractor needed for this tool
1841
+ new HtmlSanitizerMiddleware(),
1842
+ // Element remover
1843
+ new HtmlToMarkdownMiddleware()
1844
+ ];
1845
+ pipeline = new ContentProcessingPipeline(htmlPipelineSteps);
1846
+ } else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain") {
1847
+ pipeline = new ContentProcessingPipeline([
1848
+ new MarkdownMetadataExtractorMiddleware()
1849
+ // Extract title (though not used)
1850
+ // No further processing needed for Markdown/Plain text for this tool
1851
+ ]);
1852
+ } else {
1853
+ logger.warn(
1854
+ `Unsupported content type "${initialContext.contentType}" for ${url}. Returning raw content.`
1855
+ );
1856
+ const contentString = typeof rawContent.content === "string" ? rawContent.content : Buffer.from(rawContent.content).toString("utf-8");
1857
+ return contentString;
1858
+ }
1859
+ const finalContext = await pipeline.run(initialContext);
1860
+ for (const err of finalContext.errors) {
1861
+ logger.warn(`Processing error for ${url}: ${err.message}`);
1862
+ }
1863
+ if (typeof finalContext.content !== "string" || !finalContext.content.trim()) {
1864
+ throw new ToolError(
1865
+ `Processing resulted in empty content for ${url}`,
1866
+ this.constructor.name
1867
+ );
1868
+ }
1869
+ logger.info(`\u2705 Successfully processed ${url}`);
1870
+ return finalContext.content;
1365
1871
  } catch (error) {
1366
- if (error instanceof ScraperError) {
1872
+ if (error instanceof ScraperError || error instanceof ToolError) {
1367
1873
  throw new ToolError(
1368
1874
  `Failed to fetch or process URL: ${error.message}`,
1369
1875
  this.constructor.name
@@ -1373,6 +1879,8 @@ var FetchUrlTool = class {
1373
1879
  `Failed to fetch or process URL: ${error instanceof Error ? error.message : String(error)}`,
1374
1880
  this.constructor.name
1375
1881
  );
1882
+ } finally {
1883
+ await playwrightMiddleware.closeBrowser();
1376
1884
  }
1377
1885
  }
1378
1886
  };
@@ -1439,6 +1947,13 @@ var ListLibrariesTool = class {
1439
1947
  }
1440
1948
  };
1441
1949
 
1950
+ // src/utils/config.ts
1951
+ var DEFAULT_MAX_PAGES2 = 1e3;
1952
+ var DEFAULT_MAX_DEPTH2 = 3;
1953
+ var DEFAULT_MAX_CONCURRENCY = 3;
1954
+ var DEFAULT_PROTOCOL = "stdio";
1955
+ var DEFAULT_HTTP_PORT = 8e3;
1956
+
1442
1957
  // src/tools/ScrapeTool.ts
1443
1958
  import * as semver2 from "semver";
1444
1959
  var ScrapeTool = class {
@@ -1493,10 +2008,12 @@ var ScrapeTool = class {
1493
2008
  version: internalVersion,
1494
2009
  scope: scraperOptions?.scope ?? "subpages",
1495
2010
  followRedirects: scraperOptions?.followRedirects ?? true,
1496
- maxPages: scraperOptions?.maxPages ?? DEFAULT_MAX_PAGES,
1497
- maxDepth: scraperOptions?.maxDepth ?? DEFAULT_MAX_DEPTH,
2011
+ maxPages: scraperOptions?.maxPages ?? DEFAULT_MAX_PAGES2,
2012
+ maxDepth: scraperOptions?.maxDepth ?? DEFAULT_MAX_DEPTH2,
1498
2013
  maxConcurrency: scraperOptions?.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY,
1499
- ignoreErrors: scraperOptions?.ignoreErrors ?? true
2014
+ ignoreErrors: scraperOptions?.ignoreErrors ?? true,
2015
+ scrapeMode: scraperOptions?.scrapeMode ?? "auto" /* Auto */
2016
+ // Pass scrapeMode enum
1500
2017
  });
1501
2018
  logger.info(`\u{1F680} Job ${jobId} enqueued for scraping.`);
1502
2019
  options.onProgress?.({
@@ -1576,26 +2093,6 @@ var SearchTool = class {
1576
2093
  logger.info(`\u2705 Found ${results.length} matching results`);
1577
2094
  return { results };
1578
2095
  } catch (error) {
1579
- if (error instanceof LibraryNotFoundError) {
1580
- logger.info(`\u2139\uFE0F Library not found: ${error.message}`);
1581
- return {
1582
- results: [],
1583
- error: {
1584
- message: error.message,
1585
- suggestions: error.suggestions
1586
- }
1587
- };
1588
- }
1589
- if (error instanceof VersionNotFoundError) {
1590
- logger.info(`\u2139\uFE0F Version not found: ${error.message}`);
1591
- return {
1592
- results: [],
1593
- error: {
1594
- message: error.message,
1595
- availableVersions: error.availableVersions
1596
- }
1597
- };
1598
- }
1599
2096
  logger.error(
1600
2097
  `\u274C Search failed: ${error instanceof Error ? error.message : "Unknown error"}`
1601
2098
  );
@@ -1784,7 +2281,6 @@ import Fuse from "fuse.js";
1784
2281
  import semver3 from "semver";
1785
2282
 
1786
2283
  // src/splitter/SemanticMarkdownSplitter.ts
1787
- import { JSDOM as JSDOM2 } from "jsdom";
1788
2284
  import { RecursiveCharacterTextSplitter as RecursiveCharacterTextSplitter2 } from "langchain/text_splitter";
1789
2285
  import remarkGfm from "remark-gfm";
1790
2286
  import remarkHtml from "remark-html";
@@ -10601,7 +11097,7 @@ ${"```"}`;
10601
11097
  * Parse HTML
10602
11098
  */
10603
11099
  async parseHtml(html) {
10604
- const { window } = new JSDOM2(html);
11100
+ const { window } = createJSDOM(html);
10605
11101
  return window.document;
10606
11102
  }
10607
11103
  };
@@ -11570,26 +12066,29 @@ var DocumentManagementService = class {
11570
12066
  };
11571
12067
 
11572
12068
  export {
11573
- DEFAULT_MAX_PAGES,
11574
- DEFAULT_MAX_DEPTH,
11575
- DEFAULT_MAX_CONCURRENCY,
11576
12069
  setLogLevel,
11577
12070
  logger,
11578
12071
  HttpFetcher,
11579
12072
  FileFetcher,
11580
- HtmlProcessor,
12073
+ ScrapeMode,
11581
12074
  PipelineJobStatus,
11582
12075
  PipelineManager,
11583
12076
  CancelJobTool,
11584
12077
  VersionNotFoundError,
12078
+ LibraryNotFoundError,
11585
12079
  FetchUrlTool,
11586
12080
  FindVersionTool,
11587
12081
  GetJobInfoTool,
11588
12082
  ListJobsTool,
11589
12083
  ListLibrariesTool,
11590
12084
  RemoveTool,
12085
+ DEFAULT_MAX_PAGES2 as DEFAULT_MAX_PAGES,
12086
+ DEFAULT_MAX_DEPTH2 as DEFAULT_MAX_DEPTH,
12087
+ DEFAULT_MAX_CONCURRENCY,
12088
+ DEFAULT_PROTOCOL,
12089
+ DEFAULT_HTTP_PORT,
11591
12090
  ScrapeTool,
11592
12091
  SearchTool,
11593
12092
  DocumentManagementService
11594
12093
  };
11595
- //# sourceMappingURL=chunk-A5FW7XVC.js.map
12094
+ //# sourceMappingURL=chunk-VF2RUEVV.js.map