@arabold/docs-mcp-server 1.8.0 → 1.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -100,6 +100,11 @@ var require_extend = __commonJS({
100
100
  }
101
101
  });
102
102
 
103
+ // src/config.ts
104
// Default limits from src/config.ts: page count, link depth and concurrency.
// The consumers of these values are outside this excerpt.
var DEFAULT_MAX_PAGES = 1000;
var DEFAULT_MAX_DEPTH = 3;
var DEFAULT_MAX_CONCURRENCY = 3;
107
+
103
108
  // src/utils/logger.ts
104
109
  var currentLogLevel = 2 /* INFO */;
105
110
  function setLogLevel(level) {
@@ -287,216 +292,13 @@ var FileFetcher = class {
287
292
  }
288
293
  };
289
294
 
290
- // src/scraper/processor/HtmlProcessor.ts
291
- import createDOMPurify from "dompurify";
292
- import { JSDOM } from "jsdom";
293
- import TurndownService from "turndown";
294
- var HtmlProcessor = class {
295
- turndownService;
296
- options;
297
- selectorsToRemove = [
298
- "nav",
299
- "footer",
300
- "script",
301
- "style",
302
- "noscript",
303
- "svg",
304
- "link",
305
- "meta",
306
- "iframe",
307
- "header",
308
- "button",
309
- "input",
310
- "textarea",
311
- "select",
312
- // "form", // Known issue: Some pages use alerts for important content
313
- ".ads",
314
- ".advertisement",
315
- ".banner",
316
- ".cookie-banner",
317
- ".cookie-consent",
318
- ".hidden",
319
- ".hide",
320
- ".modal",
321
- ".nav-bar",
322
- ".overlay",
323
- ".popup",
324
- ".promo",
325
- ".mw-editsection",
326
- ".side-bar",
327
- ".social-share",
328
- ".sticky",
329
- "#ads",
330
- "#banner",
331
- "#cookieBanner",
332
- "#modal",
333
- "#nav",
334
- "#overlay",
335
- "#popup",
336
- "#sidebar",
337
- "#socialMediaBox",
338
- "#stickyHeader",
339
- "#ad-container",
340
- ".ad-container",
341
- ".login-form",
342
- ".signup-form",
343
- ".tooltip",
344
- ".dropdown-menu",
345
- // ".alert", // Known issue: Some pages use alerts for important content
346
- ".breadcrumb",
347
- ".pagination",
348
- // '[role="alert"]', // Known issue: Some pages use alerts for important content
349
- '[role="banner"]',
350
- '[role="dialog"]',
351
- '[role="alertdialog"]',
352
- '[role="region"][aria-label*="skip" i]',
353
- '[aria-modal="true"]',
354
- ".noprint"
355
- ];
356
- constructor(options) {
357
- this.turndownService = new TurndownService({
358
- headingStyle: "atx",
359
- hr: "---",
360
- bulletListMarker: "-",
361
- codeBlockStyle: "fenced",
362
- emDelimiter: "_",
363
- strongDelimiter: "**",
364
- linkStyle: "inlined"
365
- });
366
- this.turndownService.addRule("pre", {
367
- filter: ["pre"],
368
- replacement: (content3, node2) => {
369
- const element = node2;
370
- let language = element.getAttribute("data-language") || "";
371
- if (!language) {
372
- const highlightElement = element.closest(
373
- '[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]'
374
- );
375
- if (highlightElement) {
376
- const className = highlightElement.className;
377
- const match = className.match(
378
- /(?:highlight-source-|highlight-|language-)(\w+)/
379
- );
380
- if (match) {
381
- language = match[1];
382
- }
383
- }
384
- }
385
- const text3 = (() => {
386
- const clone = element.cloneNode(true);
387
- const brElements = Array.from(clone.querySelectorAll("br"));
388
- for (const br of brElements) {
389
- br.replaceWith("\n");
390
- }
391
- return clone.textContent;
392
- })();
393
- return `
394
- \`\`\`${language}
395
- ${text3}
396
- \`\`\`
397
- `;
398
- }
399
- });
400
- this.turndownService.addRule("table", {
401
- filter: ["table"],
402
- replacement: (content3) => {
403
- const cleanedContent = content3.replace(/\n+/g, "\n");
404
- return `
405
-
406
- ${cleanedContent}
407
-
408
- `;
409
- }
410
- });
411
- this.options = options || {};
412
- }
413
- canProcess(content3) {
414
- return content3.mimeType.startsWith("text/html");
415
- }
416
- async process(content3) {
417
- if (!this.canProcess(content3)) {
418
- throw new ScraperError(
419
- `HtmlProcessor cannot process content of type ${content3.mimeType}`,
420
- false
421
- );
422
- }
423
- const htmlContent = typeof content3.content === "string" ? content3.content : content3.content.toString(content3.encoding || "utf-8");
424
- const titleMatch = htmlContent.match(/<title>([^<]+)<\/title>/i);
425
- const title = titleMatch?.[1] || "Untitled";
426
- const window = new JSDOM(content3.content, { url: content3.source }).window;
427
- const purify = createDOMPurify(window);
428
- const purifiedContent = purify.sanitize(htmlContent, {
429
- WHOLE_DOCUMENT: true,
430
- RETURN_DOM: true
431
- });
432
- const linkElements = purifiedContent.querySelectorAll("a[href]");
433
- let links = [];
434
- if (this.options.extractLinks !== false) {
435
- links = Array.from(linkElements).map((el) => el.getAttribute("href")).filter((href) => href !== null).map((href) => {
436
- try {
437
- return new URL(href, content3.source).href;
438
- } catch {
439
- return null;
440
- }
441
- }).filter((url) => url !== null);
442
- }
443
- const selectorsToRemove = [
444
- ...this.options.excludeSelectors || [],
445
- ...this.selectorsToRemove
446
- ];
447
- for (const selector of selectorsToRemove) {
448
- const elements = purifiedContent.querySelectorAll(selector);
449
- for (const el of elements) {
450
- el.remove();
451
- }
452
- }
453
- const cleanedContent = purifiedContent.innerHTML;
454
- const markdown = this.turndownService.turndown(cleanedContent || "").trim();
455
- if (!markdown) {
456
- throw new ScraperError("No valid content found", false);
457
- }
458
- return {
459
- content: markdown,
460
- title,
461
- source: content3.source,
462
- links,
463
- metadata: {}
464
- };
465
- }
466
- };
467
-
468
- // src/scraper/processor/MarkdownProcessor.ts
469
- var MarkdownProcessor = class {
470
- canProcess(content3) {
471
- return content3.mimeType === "text/markdown" || content3.mimeType === "text/plain" || // Treat plain text as markdown
472
- content3.source.endsWith(".md");
473
- }
474
- async process(content3) {
475
- if (!this.canProcess(content3)) {
476
- throw new ScraperError(
477
- `MarkdownProcessor cannot process content of type ${content3.mimeType}`,
478
- false
479
- );
480
- }
481
- const markdownContent = typeof content3.content === "string" ? content3.content : content3.content.toString(content3.encoding || "utf-8");
482
- if (!markdownContent.trim()) {
483
- throw new ScraperError("Empty Markdown content", false);
484
- }
485
- const title = this.extractTitle(markdownContent) || "Untitled";
486
- return {
487
- content: markdownContent,
488
- title,
489
- source: content3.source,
490
- links: [],
491
- // TODO: Extract links from Markdown
492
- metadata: {}
493
- };
494
- }
495
- extractTitle(markdown) {
496
- const match = markdown.match(/^#\s+(.*)$/m);
497
- return match ? match[1].trim() : null;
498
- }
499
- };
295
+ // src/scraper/types.ts
296
/**
 * String-valued scrape modes. Each member maps a PascalCase name to the
 * lowercase string that is compared against `options.scrapeMode`.
 */
var ScrapeMode = {
  Fetch: "fetch",
  Playwright: "playwright",
  Auto: "auto"
};
500
302
 
501
303
  // node_modules/uuid/dist/esm-node/stringify.js
502
304
  var byteToHex = [];
@@ -605,6 +407,541 @@ function isSubpath(baseUrl, targetUrl) {
605
407
  return targetUrl.pathname.startsWith(basePath);
606
408
  }
607
409
 
410
+ // src/scraper/middleware/ContentProcessorPipeline.ts
411
var ContentProcessingPipeline = class {
  /** Ordered middleware chain; every entry exposes `process(context, next)`. */
  middleware;
  /**
   * @param middleware Middleware instances, executed first to last.
   */
  constructor(middleware) {
    this.middleware = middleware;
  }
  /**
   * Runs the chain against a single context object that every middleware
   * mutates in place.
   *
   * An exception raised by a middleware is converted to an `Error` if needed,
   * appended to `initialContext.errors` and logged as a warning; it is not
   * rethrown, so `run` itself resolves normally.
   *
   * @param initialContext Context shared by all middleware; must carry an `errors` array.
   * @returns The same context object, after the chain has finished.
   */
  async run(initialContext) {
    // Highest chain position dispatched so far; used to detect a middleware
    // invoking its `next` callback more than once.
    let lastDispatched = -1;
    const runFrom = async (position) => {
      if (position <= lastDispatched) {
        throw new Error("next() called multiple times");
      }
      lastDispatched = position;
      const current = this.middleware[position];
      if (current) {
        try {
          await current.process(initialContext, () => runFrom(position + 1));
        } catch (error) {
          const recorded = error instanceof Error ? error : new Error(String(error));
          initialContext.errors.push(recorded);
          logger.warn(`Error in middleware pipeline: ${error}`);
        }
      }
    };
    await runFrom(0);
    return initialContext;
  }
};
450
+
451
+ // src/scraper/middleware/components/HtmlCheerioParserMiddleware.ts
452
+ import * as cheerio from "cheerio";
453
var HtmlCheerioParserMiddleware = class {
  /**
   * Parses HTML content with Cheerio and stores the loaded document on
   * `context.dom`.
   *
   * Content whose type does not start with "text/html" is passed straight to
   * the next middleware. Non-string content is decoded as UTF-8 before parsing.
   * If parsing throws, the error is logged and recorded on `context.errors`
   * and the chain stops here (`next` is not called).
   *
   * Only the parse step is guarded: an exception coming out of `next()` is
   * left to propagate so it is not misreported as a Cheerio parsing failure.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    if (!context.contentType.startsWith("text/html")) {
      await next();
      return;
    }
    const htmlString = typeof context.content === "string" ? context.content : Buffer.from(context.content).toString("utf-8");
    try {
      logger.debug(`Parsing HTML content with Cheerio from ${context.source}`);
      context.dom = cheerio.load(htmlString);
    } catch (error) {
      logger.error(`Failed to parse HTML with Cheerio for ${context.source}: ${error}`);
      context.errors.push(
        error instanceof Error ? error : new Error(`Cheerio HTML parsing failed: ${String(error)}`)
      );
      return;
    }
    await next();
  }
};
474
+
475
+ // src/utils/dom.ts
476
+ import { JSDOM, VirtualConsole } from "jsdom";
477
/**
 * Builds a JSDOM instance whose virtual console has no-op listeners attached
 * for the "error", "warn", "info", "debug" and "log" events.
 * NOTE(review): presumably this keeps page console output out of the host
 * process logs — confirm against JSDOM's VirtualConsole behaviour.
 *
 * @param html Markup handed to the JSDOM constructor.
 * @param options Optional JSDOM constructor options; any key given here
 *   (including `virtualConsole`) overrides the default.
 * @returns The constructed JSDOM instance.
 */
function createJSDOM(html, options) {
  const silentConsole = new VirtualConsole();
  const discard = () => {
  };
  for (const channel of ["error", "warn", "info", "debug", "log"]) {
    silentConsole.on(channel, discard);
  }
  return new JSDOM(html, { virtualConsole: silentConsole, ...options });
}
495
+
496
+ // src/scraper/middleware/components/HtmlLinkExtractorMiddleware.ts
497
var HtmlLinkExtractorMiddleware = class {
  /**
   * Collects the targets of every `a[href]` element in `context.dom`.
   *
   * Each href is resolved against `context.source`; blank hrefs, hrefs that do
   * not parse as URLs, and URLs whose protocol is not http:, https: or file:
   * are ignored. The de-duplicated absolute URLs replace `context.links`.
   * When `context.dom` is missing the step is skipped (with a warning if the
   * content is HTML). The next middleware is always invoked.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const $ = context.dom;
    if ($) {
      try {
        const anchors = $("a[href]");
        logger.debug(`Found ${anchors.length} potential links in ${context.source}`);
        const allowedProtocols = ["http:", "https:", "file:"];
        const resolved = [];
        anchors.each((_position, anchor) => {
          const rawHref = $(anchor).attr("href");
          if (!rawHref || rawHref.trim() === "") {
            return;
          }
          let absolute;
          try {
            absolute = new URL(rawHref, context.source);
          } catch (e) {
            logger.debug(`Ignoring invalid URL syntax: ${rawHref}`);
            return;
          }
          if (!allowedProtocols.includes(absolute.protocol)) {
            logger.debug(`Ignoring link with invalid protocol: ${rawHref}`);
            return;
          }
          resolved.push(absolute.href);
        });
        // A Set keeps first-seen order while dropping duplicates.
        context.links = Array.from(new Set(resolved));
        logger.debug(
          `Extracted ${context.links.length} unique, valid links from ${context.source}`
        );
      } catch (error) {
        logger.error(`Error extracting links from ${context.source}: ${error}`);
        const reason = error instanceof Error ? error.message : String(error);
        context.errors.push(new Error(`Failed to extract links from HTML: ${reason}`));
      }
    } else if (context.contentType.startsWith("text/html")) {
      logger.warn(
        `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware runs before this.`
      );
    }
    await next();
  }
};
548
+
549
+ // src/scraper/middleware/components/HtmlMetadataExtractorMiddleware.ts
550
var HtmlMetadataExtractorMiddleware = class {
  /**
   * Derives a page title from `context.dom` and stores it in
   * `context.metadata.title`.
   *
   * The first `<title>` text is preferred, then the first `<h1>` text, then the
   * literal "Untitled". Runs of whitespace in the result collapse to a single
   * space. When `context.dom` is missing the step is skipped (with a warning if
   * the content is HTML). The next middleware is always invoked.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const $ = context.dom;
    if ($) {
      try {
        const firstText = (selector) => $(selector).first().text().trim();
        let title = firstText("title");
        if (!title) {
          title = firstText("h1");
        }
        title = (title || "Untitled").replace(/\s+/g, " ").trim();
        context.metadata.title = title;
        logger.debug(`Extracted title: "${title}" from ${context.source}`);
      } catch (error) {
        logger.error(`Error extracting metadata from ${context.source}: ${error}`);
        const reason = error instanceof Error ? error.message : String(error);
        context.errors.push(new Error(`Failed to extract metadata from HTML: ${reason}`));
      }
    } else if (context.contentType.startsWith("text/html")) {
      logger.warn(
        `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware runs before this.`
      );
    }
    await next();
  }
};
587
+
588
+ // src/scraper/middleware/components/HtmlPlaywrightMiddleware.ts
589
+ import { chromium } from "playwright";
590
var HtmlPlaywrightMiddleware = class {
  /** Currently connected browser, or null when none is running. */
  browser = null;
  /** In-flight launch promise, shared so concurrent callers reuse one launch. */
  browserLaunch = null;
  /**
   * Splits a launch-argument string on whitespace, dropping empty entries.
   * An unset, empty or whitespace-only value yields an empty list, so no blank
   * arguments are ever handed to the browser.
   * @param raw Raw value of PLAYWRIGHT_LAUNCH_ARGS (may be undefined).
   * @returns The individual launch arguments.
   */
  parseLaunchArgs(raw) {
    return (raw ?? "").split(/\s+/).filter((arg) => arg.length > 0);
  }
  /**
   * Returns a connected browser, launching Chromium when needed.
   * Callers that arrive while a launch is already in progress wait for that
   * launch instead of starting (and leaking) additional browser processes.
   */
  async ensureBrowser() {
    if (this.browser?.isConnected()) {
      return this.browser;
    }
    if (!this.browserLaunch) {
      const pending = this.launchBrowser();
      this.browserLaunch = pending;
      // Forget the launch once it settles so a failed launch can be retried.
      const forget = () => {
        if (this.browserLaunch === pending) {
          this.browserLaunch = null;
        }
      };
      pending.then(forget, forget);
    }
    return this.browserLaunch;
  }
  /**
   * Launches Chromium with the arguments from PLAYWRIGHT_LAUNCH_ARGS and
   * records it as the current browser.
   */
  async launchBrowser() {
    const launchArgs = this.parseLaunchArgs(process.env.PLAYWRIGHT_LAUNCH_ARGS);
    logger.debug(
      `Launching new Playwright browser instance (Chromium) with args: ${launchArgs.join(" ") || "none"}...`
    );
    const browser = await chromium.launch({ channel: "chromium", args: launchArgs });
    browser.on("disconnected", () => {
      logger.debug("Playwright browser instance disconnected.");
      // Only clear the slot if it still refers to the browser that went away.
      if (this.browser === browser) {
        this.browser = null;
      }
    });
    this.browser = browser;
    return browser;
  }
  /**
   * Closes the Playwright browser instance if it is connected.
   * Should be called during application shutdown.
   */
  async closeBrowser() {
    const browser = this.browser;
    if (browser?.isConnected()) {
      logger.debug("Closing Playwright browser instance...");
      await browser.close();
      if (this.browser === browser) {
        this.browser = null;
      }
    }
  }
  /**
   * Removes the route handler and closes the page. Each step is attempted
   * independently so a failure in one cannot skip the other or abort the
   * caller.
   */
  async disposePage(page, source) {
    try {
      await page.unroute("**/*");
    } catch (error) {
      logger.debug(`Playwright: Failed to remove routes for ${source}: ${error}`);
    }
    try {
      await page.close();
    } catch (error) {
      logger.debug(`Playwright: Failed to close page for ${source}: ${error}`);
    }
  }
  /**
   * Renders HTML content in a browser page and replaces `context.content` with
   * the rendered markup.
   *
   * Non-HTML content, and any scrape mode other than "playwright" or "auto"
   * (the default), is passed straight to the next middleware. The request for
   * `context.source` is answered from the already-fetched content; image,
   * stylesheet, font and media requests are aborted. A rendering failure is
   * logged and recorded on `context.errors`, the content is left untouched, and
   * the next middleware still runs.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    if (!context.contentType.startsWith("text/html")) {
      await next();
      return;
    }
    const scrapeMode = context.options?.scrapeMode ?? "auto" /* Auto */;
    const shouldRunPlaywright = scrapeMode === "playwright" /* Playwright */ || scrapeMode === "auto" /* Auto */;
    if (!shouldRunPlaywright) {
      logger.debug(
        `Skipping Playwright rendering for ${context.source} as scrapeMode is '${scrapeMode}'.`
      );
      await next();
      return;
    }
    logger.debug(
      `Running Playwright rendering for ${context.source} (scrapeMode: '${scrapeMode}')`
    );
    let page = null;
    let renderedHtml = null;
    try {
      const browser = await this.ensureBrowser();
      page = await browser.newPage();
      logger.debug(`Playwright: Processing ${context.source}`);
      await page.route("**/*", (route) => {
        if (route.request().url() === context.source) {
          // Serve the document we already fetched instead of hitting the network again.
          return route.fulfill({
            status: 200,
            contentType: context.contentType,
            body: context.content
          });
        }
        const resourceType = route.request().resourceType();
        if (["image", "stylesheet", "font", "media"].includes(resourceType)) {
          return route.abort();
        }
        return route.continue();
      });
      await page.goto(context.source, {
        waitUntil: "load"
      });
      renderedHtml = await page.content();
      logger.debug(`Playwright: Successfully rendered content for ${context.source}`);
    } catch (error) {
      logger.error(`Playwright failed to render ${context.source}: ${error}`);
      context.errors.push(
        error instanceof Error ? error : new Error(`Playwright rendering failed: ${String(error)}`)
      );
    } finally {
      if (page) {
        await this.disposePage(page, context.source);
      }
    }
    if (renderedHtml !== null) {
      context.content = renderedHtml;
      logger.debug(
        `Playwright middleware updated content for ${context.source}. Proceeding.`
      );
    } else {
      logger.warn(
        `Playwright rendering resulted in null content for ${context.source}. Proceeding without content update.`
      );
    }
    await next();
  }
};
687
+
688
+ // src/scraper/middleware/components/HtmlSanitizerMiddleware.ts
689
var HtmlSanitizerMiddleware = class {
  // Selectors removed from every document, in addition to any
  // caller-supplied `excludeSelectors`.
  defaultSelectorsToRemove = [
    "nav",
    "footer",
    "script",
    "style",
    "noscript",
    "svg",
    "link",
    "meta",
    "iframe",
    "header",
    "button",
    "input",
    "textarea",
    "select",
    // "form", // intentionally not removed
    ".ads",
    ".advertisement",
    ".banner",
    ".cookie-banner",
    ".cookie-consent",
    ".hidden",
    ".hide",
    ".modal",
    ".nav-bar",
    ".overlay",
    ".popup",
    ".promo",
    ".mw-editsection",
    ".side-bar",
    ".social-share",
    ".sticky",
    "#ads",
    "#banner",
    "#cookieBanner",
    "#modal",
    "#nav",
    "#overlay",
    "#popup",
    "#sidebar",
    "#socialMediaBox",
    "#stickyHeader",
    "#ad-container",
    ".ad-container",
    ".login-form",
    ".signup-form",
    ".tooltip",
    ".dropdown-menu",
    // ".alert", // intentionally not removed
    ".breadcrumb",
    ".pagination",
    // '[role="alert"]', // intentionally not removed
    '[role="banner"]',
    '[role="dialog"]',
    '[role="alertdialog"]',
    '[role="region"][aria-label*="skip" i]',
    '[aria-modal="true"]',
    ".noprint"
  ];
  /**
   * Removes unwanted elements from `context.dom` in place.
   *
   * The selectors applied are `context.options.excludeSelectors` (when
   * present) followed by the defaults. `context.options` may be absent; the
   * defaults are still applied in that case. A selector that throws is logged,
   * recorded on `context.errors` and skipped; the remaining selectors still
   * run. When `context.dom` is missing the step is skipped (with a warning if
   * the content is HTML). The next middleware is always invoked.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const $ = context.dom;
    if (!$) {
      if (context.contentType.startsWith("text/html")) {
        logger.warn(
          `Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`
        );
      }
      await next();
      return;
    }
    try {
      const selectorsToRemove = [
        // Optional chaining: a context without options must not disable the defaults.
        ...context.options?.excludeSelectors || [],
        ...this.defaultSelectorsToRemove
      ];
      logger.debug(
        `Removing elements matching ${selectorsToRemove.length} selectors for ${context.source}`
      );
      let removedCount = 0;
      for (const selector of selectorsToRemove) {
        try {
          const elements = $(selector);
          const count = elements.length;
          if (count > 0) {
            elements.remove();
            removedCount += count;
          }
        } catch (selectorError) {
          logger.warn(
            `Potentially invalid selector "${selector}" during element removal: ${selectorError}`
          );
          context.errors.push(
            new Error(`Invalid selector "${selector}": ${selectorError}`)
          );
        }
      }
      logger.debug(`Removed ${removedCount} elements for ${context.source}`);
    } catch (error) {
      logger.error(`Error during HTML element removal for ${context.source}: ${error}`);
      context.errors.push(
        error instanceof Error ? error : new Error(`HTML element removal failed: ${String(error)}`)
      );
    }
    await next();
  }
};
798
+
799
+ // src/scraper/middleware/components/HtmlToMarkdownMiddleware.ts
800
+ import { gfm } from "@joplin/turndown-plugin-gfm";
801
+ import TurndownService from "turndown";
802
var HtmlToMarkdownMiddleware = class {
  /** Turndown converter configured in the constructor. */
  turndownService;
  /**
   * Configures Turndown (ATX headings, fenced code, "-" bullets, inlined
   * links), enables the GFM plugin and installs the custom `pre` rule.
   */
  constructor() {
    this.turndownService = new TurndownService({
      headingStyle: "atx",
      hr: "---",
      bulletListMarker: "-",
      codeBlockStyle: "fenced",
      emDelimiter: "_",
      strongDelimiter: "**",
      linkStyle: "inlined"
    });
    this.turndownService.use(gfm);
    this.addCustomRules();
  }
  /**
   * Registers the rule that turns `<pre>` elements into fenced code blocks.
   *
   * The language tag comes from the element's `data-language` attribute or,
   * failing that, from a `highlight-source-*`, `highlight-*` or `language-*`
   * class on the nearest ancestor (or the element itself) or on a descendant.
   * `<br>` elements become newlines and leading/trailing newlines are trimmed.
   * The fence is at least three backticks and always longer than the longest
   * backtick run in the code, so embedded backtick runs cannot end the block
   * early.
   */
  addCustomRules() {
    const languageHintSelector = '[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]';
    this.turndownService.addRule("pre", {
      filter: ["pre"],
      replacement: (content3, node2) => {
        const element = node2;
        let language = element.getAttribute("data-language") || "";
        if (!language) {
          const highlightElement = element.closest(languageHintSelector) || element.querySelector(languageHintSelector);
          if (highlightElement) {
            const className = highlightElement.className;
            const match = className.match(
              /(?:highlight-source-|highlight-|language-)(\w+)/
            );
            if (match) language = match[1];
          }
        }
        // Snapshot the list first: the elements are replaced while iterating.
        for (const br of Array.from(element.querySelectorAll("br"))) {
          br.replaceWith("\n");
        }
        const code = (element.textContent || "").replace(/^\n+|\n+$/g, "");
        const longestBacktickRun = (code.match(/`+/g) || []).reduce(
          (longest, run) => Math.max(longest, run.length),
          0
        );
        const fence = "`".repeat(Math.max(3, longestBacktickRun + 1));
        return `\n${fence}${language}\n${code}\n${fence}\n`;
      }
    });
  }
  /**
   * Converts the document in `context.dom` to Markdown.
   *
   * The `<body>` markup is used when it is non-empty, otherwise the whole
   * document. On success `context.content` becomes the trimmed Markdown (or an
   * empty string, with a warning, when the conversion produced nothing) and
   * `context.contentType` becomes "text/markdown". A conversion failure is
   * logged and recorded on `context.errors`. When `context.dom` is missing the
   * step is skipped. The next middleware is always invoked.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const $ = context.dom;
    if (!$) {
      if (context.contentType.startsWith("text/html")) {
        logger.warn(
          `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware ran correctly.`
        );
      }
      await next();
      return;
    }
    try {
      logger.debug(`Converting HTML content to Markdown for ${context.source}`);
      const htmlToConvert = $("body").html() || $.html();
      const markdown = this.turndownService.turndown(htmlToConvert).trim();
      if (!markdown) {
        const warnMsg = `HTML to Markdown conversion resulted in empty content for ${context.source}.`;
        logger.warn(warnMsg);
        context.content = "";
        context.contentType = "text/markdown";
      } else {
        context.content = markdown;
        context.contentType = "text/markdown";
        logger.debug(`Successfully converted HTML to Markdown for ${context.source}`);
      }
    } catch (error) {
      logger.error(`Error converting HTML to Markdown for ${context.source}: ${error}`);
      context.errors.push(
        new Error(
          `Failed to convert HTML to Markdown: ${error instanceof Error ? error.message : String(error)}`
        )
      );
    }
    await next();
  }
};
893
+
894
+ // src/scraper/middleware/components/MarkdownLinkExtractorMiddleware.ts
895
var MarkdownLinkExtractorMiddleware = class {
  /**
   * Placeholder step: no links are extracted from Markdown yet. For
   * "text/markdown" content it only guarantees that `context.links` is an
   * array; every other content type is left untouched. The next middleware is
   * always invoked.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    const isMarkdown = context.contentType === "text/markdown";
    if (isMarkdown && !Array.isArray(context.links)) {
      context.links = [];
    }
    await next();
  }
};
910
+
911
+ // src/scraper/middleware/components/MarkdownMetadataExtractorMiddleware.ts
912
var MarkdownMetadataExtractorMiddleware = class {
  /**
   * Extracts a title for Markdown or plain-text content and stores it in
   * `context.metadata.title`.
   *
   * Non-string content is decoded as UTF-8 and written back to
   * `context.content`. For "text/markdown" the title is the text of the first
   * level-one ATX heading (`# Title`); the heading text must be on the same
   * line as the `#`, so an empty heading never picks up the following line.
   * Plain text, or Markdown without such a heading, gets "Untitled". Other
   * content types are left untouched. The next middleware is always invoked.
   *
   * @param context The current processing context.
   * @param next Function to call the next middleware.
   */
  async process(context, next) {
    if (context.contentType === "text/markdown" || context.contentType === "text/plain") {
      try {
        const textContent = typeof context.content === "string" ? context.content : Buffer.from(context.content).toString("utf-8");
        if (typeof context.content !== "string") {
          context.content = textContent;
        }
        let title = "Untitled";
        if (context.contentType === "text/markdown") {
          // [ \t]+ rather than \s+: \s also matches newlines, which would let
          // the capture group swallow text from a later line.
          const match = textContent.match(/^#[ \t]+(.*)$/m);
          const heading = match?.[1]?.trim();
          if (heading) {
            title = heading;
          }
        }
        context.metadata.title = title;
      } catch (error) {
        context.errors.push(
          new Error(
            `Failed to extract metadata from Markdown: ${error instanceof Error ? error.message : String(error)}`
          )
        );
      }
    }
    await next();
  }
};
944
+
608
945
  // src/scraper/strategies/BaseScraperStrategy.ts
609
946
  import { URL as URL2 } from "node:url";
610
947
 
@@ -629,8 +966,8 @@ var CancellationError = class extends PipelineError {
629
966
  };
630
967
 
631
968
  // src/scraper/strategies/BaseScraperStrategy.ts
632
- var DEFAULT_MAX_PAGES = 100;
633
- var DEFAULT_MAX_DEPTH = 3;
969
+ var DEFAULT_MAX_PAGES2 = 100;
970
+ var DEFAULT_MAX_DEPTH2 = 3;
634
971
  var DEFAULT_CONCURRENCY = 3;
635
972
  var BaseScraperStrategy = class {
636
973
  visited = /* @__PURE__ */ new Set();
@@ -639,19 +976,14 @@ var BaseScraperStrategy = class {
639
976
  constructor(options = {}) {
640
977
  this.options = options;
641
978
  }
642
- getProcessor(mimeType) {
643
- if (mimeType.startsWith("text/html")) {
644
- return new HtmlProcessor();
645
- }
646
- return new MarkdownProcessor();
647
- }
979
+ // Removed getProcessor method as processing is now handled by strategies using middleware pipelines
648
980
  async processBatch(batch, baseUrl, options, progressCallback, signal) {
649
981
  const results = await Promise.all(
650
982
  batch.map(async (item) => {
651
983
  if (signal?.aborted) {
652
984
  throw new CancellationError("Scraping cancelled during batch processing");
653
985
  }
654
- const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH;
986
+ const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH2;
655
987
  if (item.depth > maxDepth) {
656
988
  return [];
657
989
  }
@@ -659,7 +991,7 @@ var BaseScraperStrategy = class {
659
991
  const result = await this.processItem(item, options, void 0, signal);
660
992
  if (result.document) {
661
993
  this.pageCount++;
662
- const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
994
+ const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES2;
663
995
  logger.info(
664
996
  `\u{1F310} Scraping page ${this.pageCount}/${maxPages} (depth ${item.depth}/${maxDepth}): ${item.url}`
665
997
  );
@@ -711,7 +1043,7 @@ var BaseScraperStrategy = class {
711
1043
  const baseUrl = new URL2(options.url);
712
1044
  const queue = [{ url: options.url, depth: 0 }];
713
1045
  this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions));
714
- const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
1046
+ const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES2;
715
1047
  const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY;
716
1048
  while (queue.length > 0 && this.pageCount < maxPages) {
717
1049
  if (signal?.aborted) {
@@ -745,9 +1077,12 @@ var BaseScraperStrategy = class {
745
1077
  var WebScraperStrategy = class extends BaseScraperStrategy {
746
1078
  httpFetcher = new HttpFetcher();
747
1079
  shouldFollowLinkFn;
1080
+ playwrightMiddleware;
1081
+ // Add member
748
1082
  constructor(options = {}) {
749
1083
  super({ urlNormalizerOptions: options.urlNormalizerOptions });
750
1084
  this.shouldFollowLinkFn = options.shouldFollowLink;
1085
+ this.playwrightMiddleware = new HtmlPlaywrightMiddleware();
751
1086
  }
752
1087
  canHandle(url) {
753
1088
  try {
@@ -781,12 +1116,56 @@ var WebScraperStrategy = class extends BaseScraperStrategy {
781
1116
  followRedirects: options.followRedirects
782
1117
  };
783
1118
  const rawContent = await this.httpFetcher.fetch(url, fetchOptions);
784
- const processor = this.getProcessor(rawContent.mimeType);
785
- const result = await processor.process(rawContent);
1119
+ const initialContext = {
1120
+ content: rawContent.content,
1121
+ contentType: rawContent.mimeType,
1122
+ source: rawContent.source,
1123
+ // Use the final source URL after redirects
1124
+ metadata: {},
1125
+ links: [],
1126
+ errors: [],
1127
+ options,
1128
+ fetcher: this.httpFetcher
1129
+ };
1130
+ let pipeline;
1131
+ if (initialContext.contentType.startsWith("text/html")) {
1132
+ const htmlPipelineSteps = [
1133
+ this.playwrightMiddleware,
1134
+ // Use the instance member
1135
+ // TODO: Add HtmlJsExecutorMiddleware here if needed based on options
1136
+ new HtmlCheerioParserMiddleware(),
1137
+ // Always runs after content is finalized
1138
+ new HtmlMetadataExtractorMiddleware(),
1139
+ new HtmlLinkExtractorMiddleware(),
1140
+ new HtmlSanitizerMiddleware(),
1141
+ // Element remover
1142
+ new HtmlToMarkdownMiddleware()
1143
+ ];
1144
+ pipeline = new ContentProcessingPipeline(htmlPipelineSteps);
1145
+ } else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain") {
1146
+ pipeline = new ContentProcessingPipeline([
1147
+ new MarkdownMetadataExtractorMiddleware(),
1148
+ new MarkdownLinkExtractorMiddleware()
1149
+ // Placeholder for now
1150
+ ]);
1151
+ } else {
1152
+ logger.warn(
1153
+ `Unsupported content type "${initialContext.contentType}" for URL ${url}. Skipping processing.`
1154
+ );
1155
+ return { document: void 0, links: [] };
1156
+ }
1157
+ const finalContext = await pipeline.run(initialContext);
1158
+ for (const err of finalContext.errors) {
1159
+ logger.warn(`Processing error for ${url}: ${err.message}`);
1160
+ }
1161
+ if (typeof finalContext.content !== "string" || !finalContext.content.trim()) {
1162
+ logger.warn(`No processable content found for ${url} after pipeline execution.`);
1163
+ return { document: void 0, links: finalContext.links };
1164
+ }
786
1165
  const baseUrl = new URL(options.url);
787
- const links = result.links.filter((link) => {
1166
+ const filteredLinks = finalContext.links.filter((link) => {
788
1167
  try {
789
- const targetUrl = new URL(link, baseUrl);
1168
+ const targetUrl = new URL(link);
790
1169
  const scope = options.scope || "subpages";
791
1170
  return this.isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
792
1171
  } catch {
@@ -795,21 +1174,37 @@ var WebScraperStrategy = class extends BaseScraperStrategy {
795
1174
  });
796
1175
  return {
797
1176
  document: {
798
- content: result.content,
1177
+ content: finalContext.content,
1178
+ // Final processed content (Markdown)
799
1179
  metadata: {
800
- url: result.source,
801
- title: result.title,
1180
+ url: finalContext.source,
1181
+ // URL after redirects
1182
+ // Ensure title is a string, default to "Untitled"
1183
+ title: typeof finalContext.metadata.title === "string" ? finalContext.metadata.title : "Untitled",
802
1184
  library: options.library,
803
1185
  version: options.version
1186
+ // Add other metadata from context if needed
804
1187
  }
805
1188
  },
806
- links
1189
+ links: filteredLinks
1190
+ // Use the filtered links
807
1191
  };
808
1192
  } catch (error) {
809
- logger.error(`Failed to scrape page ${url}: ${error}`);
1193
+ logger.error(`Failed processing page ${url}: ${error}`);
810
1194
  throw error;
811
1195
  }
812
1196
  }
1197
+ /**
1198
+ * Overrides the base scrape method to ensure the Playwright browser is closed
1199
+ * after the scraping process completes or errors out.
1200
+ */
1201
+ async scrape(options, progressCallback, signal) {
1202
+ try {
1203
+ await super.scrape(options, progressCallback, signal);
1204
+ } finally {
1205
+ await this.playwrightMiddleware.closeBrowser();
1206
+ }
1207
+ }
813
1208
  };
814
1209
 
815
1210
  // src/scraper/strategies/GitHubScraperStrategy.ts
@@ -879,18 +1274,58 @@ var LocalFileStrategy = class extends BaseScraperStrategy {
879
1274
  }
880
1275
  logger.info(`\u{1F4C4} Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`);
881
1276
  const rawContent = await this.fileFetcher.fetch(item.url);
882
- const processor = this.getProcessor(rawContent.mimeType);
883
- const result = await processor.process(rawContent);
1277
+ const initialContext = {
1278
+ content: rawContent.content,
1279
+ contentType: rawContent.mimeType,
1280
+ source: rawContent.source,
1281
+ // file:// URL
1282
+ metadata: {},
1283
+ links: [],
1284
+ // LocalFileStrategy doesn't extract links from file content itself
1285
+ errors: [],
1286
+ options
1287
+ // Pass the full options object
1288
+ };
1289
+ let pipeline;
1290
+ if (initialContext.contentType.startsWith("text/html")) {
1291
+ pipeline = new ContentProcessingPipeline([
1292
+ new HtmlCheerioParserMiddleware(),
1293
+ new HtmlMetadataExtractorMiddleware(),
1294
+ // No HtmlLinkExtractorMiddleware needed for local files
1295
+ new HtmlSanitizerMiddleware(),
1296
+ new HtmlToMarkdownMiddleware()
1297
+ ]);
1298
+ } else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain" || // Treat plain text as markdown
1299
+ initialContext.contentType.startsWith("text/")) {
1300
+ pipeline = new ContentProcessingPipeline([
1301
+ new MarkdownMetadataExtractorMiddleware()
1302
+ // No MarkdownLinkExtractorMiddleware needed for local files
1303
+ ]);
1304
+ } else {
1305
+ logger.warn(
1306
+ `Unsupported content type "${initialContext.contentType}" for file ${filePath}. Skipping processing.`
1307
+ );
1308
+ return { document: void 0, links: [] };
1309
+ }
1310
+ const finalContext = await pipeline.run(initialContext);
1311
+ for (const err of finalContext.errors) {
1312
+ logger.warn(`Processing error for ${filePath}: ${err.message}`);
1313
+ }
1314
+ const finalContentString = typeof finalContext.content === "string" ? finalContext.content : Buffer.from(finalContext.content).toString("utf-8");
884
1315
  return {
885
1316
  document: {
886
- content: result.content,
1317
+ // Use the potentially empty string content
1318
+ content: finalContentString,
887
1319
  metadata: {
888
- url: item.url,
889
- title: result.title,
1320
+ url: finalContext.source,
1321
+ // Use context source (file:// URL)
1322
+ // Ensure title is a string, default to "Untitled"
1323
+ title: typeof finalContext.metadata.title === "string" ? finalContext.metadata.title : "Untitled",
890
1324
  library: options.library,
891
1325
  version: options.version
892
1326
  }
893
1327
  }
1328
+ // No links returned from file content processing
894
1329
  };
895
1330
  }
896
1331
  async scrape(options, progressCallback, signal) {
@@ -1003,7 +1438,7 @@ var PipelineWorker = class {
1003
1438
  async executeJob(job, callbacks) {
1004
1439
  const { id: jobId, library, version, options, abortController } = job;
1005
1440
  const signal = abortController.signal;
1006
- logger.info(`[${jobId}] Worker starting job for ${library}@${version}`);
1441
+ logger.debug(`[${jobId}] Worker starting job for ${library}@${version}`);
1007
1442
  try {
1008
1443
  await this.scraperService.scrape(
1009
1444
  options,
@@ -1323,14 +1758,13 @@ var LibraryNotFoundError = class extends ToolError {
1323
1758
 
1324
1759
  // src/tools/FetchUrlTool.ts
1325
1760
  var FetchUrlTool = class {
1326
- constructor(httpFetcher, fileFetcher, processor) {
1327
- this.processor = processor;
1328
- this.fetchers = [httpFetcher, fileFetcher];
1329
- }
1330
1761
  /**
1331
1762
  * Collection of fetchers that will be tried in order for a given URL.
1332
1763
  */
1333
1764
  fetchers;
1765
+ constructor(httpFetcher, fileFetcher) {
1766
+ this.fetchers = [httpFetcher, fileFetcher];
1767
+ }
1334
1768
  /**
1335
1769
  * Fetches content from a URL and converts it to Markdown.
1336
1770
  * Supports both HTTP/HTTPS URLs and local file URLs (file://).
@@ -1338,7 +1772,7 @@ var FetchUrlTool = class {
1338
1772
  * @throws {ToolError} If fetching or processing fails
1339
1773
  */
1340
1774
  async execute(options) {
1341
- const { url } = options;
1775
+ const { url, scrapeMode = "auto" /* Auto */ } = options;
1342
1776
  const canFetchResults = this.fetchers.map((f) => f.canFetch(url));
1343
1777
  const fetcherIndex = canFetchResults.findIndex((result) => result === true);
1344
1778
  if (fetcherIndex === -1) {
@@ -1348,18 +1782,88 @@ var FetchUrlTool = class {
1348
1782
  );
1349
1783
  }
1350
1784
  const fetcher = this.fetchers[fetcherIndex];
1785
+ const playwrightMiddleware = new HtmlPlaywrightMiddleware();
1351
1786
  try {
1352
1787
  logger.info(`\u{1F4E1} Fetching ${url}...`);
1353
1788
  const rawContent = await fetcher.fetch(url, {
1354
1789
  followRedirects: options.followRedirects ?? true,
1355
1790
  maxRetries: 3
1791
+ // Keep retries for fetching
1356
1792
  });
1357
- logger.info("\u{1F504} Converting to Markdown...");
1358
- const processed = await this.processor.process(rawContent);
1359
- logger.info(`\u2705 Successfully converted ${url} to Markdown`);
1360
- return processed.content;
1793
+ logger.info("\u{1F504} Processing content...");
1794
+ const initialContext = {
1795
+ content: rawContent.content,
1796
+ contentType: rawContent.mimeType,
1797
+ source: rawContent.source,
1798
+ metadata: {},
1799
+ links: [],
1800
+ // Links not needed for this tool's output
1801
+ errors: [],
1802
+ fetcher,
1803
+ // Create a minimal ScraperOptions object for the context
1804
+ options: {
1805
+ url,
1806
+ // Use the input URL
1807
+ library: "",
1808
+ // Not applicable for this tool
1809
+ version: "",
1810
+ // Use empty string instead of undefined
1811
+ // Default other options as needed by middleware
1812
+ maxDepth: 0,
1813
+ maxPages: 1,
1814
+ maxConcurrency: 1,
1815
+ scope: "subpages",
1816
+ // Default, though not used for single page fetch
1817
+ followRedirects: options.followRedirects ?? true,
1818
+ excludeSelectors: void 0,
1819
+ // Not currently configurable via this tool
1820
+ ignoreErrors: false,
1821
+ scrapeMode
1822
+ // Pass the scrapeMode
1823
+ }
1824
+ };
1825
+ let pipeline;
1826
+ if (initialContext.contentType.startsWith("text/html")) {
1827
+ const htmlPipelineSteps = [
1828
+ playwrightMiddleware,
1829
+ // Use the instantiated middleware
1830
+ new HtmlCheerioParserMiddleware(),
1831
+ // Always runs after content is finalized
1832
+ new HtmlMetadataExtractorMiddleware(),
1833
+ // Keep for potential future use
1834
+ // No Link Extractor needed for this tool
1835
+ new HtmlSanitizerMiddleware(),
1836
+ // Element remover
1837
+ new HtmlToMarkdownMiddleware()
1838
+ ];
1839
+ pipeline = new ContentProcessingPipeline(htmlPipelineSteps);
1840
+ } else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain") {
1841
+ pipeline = new ContentProcessingPipeline([
1842
+ new MarkdownMetadataExtractorMiddleware()
1843
+ // Extract title (though not used)
1844
+ // No further processing needed for Markdown/Plain text for this tool
1845
+ ]);
1846
+ } else {
1847
+ logger.warn(
1848
+ `Unsupported content type "${initialContext.contentType}" for ${url}. Returning raw content.`
1849
+ );
1850
+ const contentString = typeof rawContent.content === "string" ? rawContent.content : Buffer.from(rawContent.content).toString("utf-8");
1851
+ return contentString;
1852
+ }
1853
+ const finalContext = await pipeline.run(initialContext);
1854
+ for (const err of finalContext.errors) {
1855
+ logger.warn(`Processing error for ${url}: ${err.message}`);
1856
+ }
1857
+ if (typeof finalContext.content !== "string" || !finalContext.content.trim()) {
1858
+ throw new ToolError(
1859
+ `Processing resulted in empty content for ${url}`,
1860
+ this.constructor.name
1861
+ );
1862
+ }
1863
+ logger.info(`\u2705 Successfully processed ${url}`);
1864
+ return finalContext.content;
1361
1865
  } catch (error) {
1362
- if (error instanceof ScraperError) {
1866
+ if (error instanceof ScraperError || error instanceof ToolError) {
1363
1867
  throw new ToolError(
1364
1868
  `Failed to fetch or process URL: ${error.message}`,
1365
1869
  this.constructor.name
@@ -1369,6 +1873,8 @@ var FetchUrlTool = class {
1369
1873
  `Failed to fetch or process URL: ${error instanceof Error ? error.message : String(error)}`,
1370
1874
  this.constructor.name
1371
1875
  );
1876
+ } finally {
1877
+ await playwrightMiddleware.closeBrowser();
1372
1878
  }
1373
1879
  }
1374
1880
  };
@@ -1489,10 +1995,12 @@ var ScrapeTool = class {
1489
1995
  version: internalVersion,
1490
1996
  scope: scraperOptions?.scope ?? "subpages",
1491
1997
  followRedirects: scraperOptions?.followRedirects ?? true,
1492
- maxPages: scraperOptions?.maxPages ?? 100,
1493
- maxDepth: scraperOptions?.maxDepth ?? 3,
1494
- // maxConcurrency is handled by the manager itself now
1495
- ignoreErrors: scraperOptions?.ignoreErrors ?? true
1998
+ maxPages: scraperOptions?.maxPages ?? DEFAULT_MAX_PAGES,
1999
+ maxDepth: scraperOptions?.maxDepth ?? DEFAULT_MAX_DEPTH,
2000
+ maxConcurrency: scraperOptions?.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY,
2001
+ ignoreErrors: scraperOptions?.ignoreErrors ?? true,
2002
+ scrapeMode: scraperOptions?.scrapeMode ?? "auto" /* Auto */
2003
+ // Pass scrapeMode enum
1496
2004
  });
1497
2005
  logger.info(`\u{1F680} Job ${jobId} enqueued for scraping.`);
1498
2006
  options.onProgress?.({
@@ -1780,7 +2288,6 @@ import Fuse from "fuse.js";
1780
2288
  import semver3 from "semver";
1781
2289
 
1782
2290
  // src/splitter/SemanticMarkdownSplitter.ts
1783
- import { JSDOM as JSDOM2 } from "jsdom";
1784
2291
  import { RecursiveCharacterTextSplitter as RecursiveCharacterTextSplitter2 } from "langchain/text_splitter";
1785
2292
  import remarkGfm from "remark-gfm";
1786
2293
  import remarkHtml from "remark-html";
@@ -10597,7 +11104,7 @@ ${"```"}`;
10597
11104
  * Parse HTML
10598
11105
  */
10599
11106
  async parseHtml(html) {
10600
- const { window } = new JSDOM2(html);
11107
+ const { window } = createJSDOM(html);
10601
11108
  return window.document;
10602
11109
  }
10603
11110
  };
@@ -11566,11 +12073,14 @@ var DocumentManagementService = class {
11566
12073
  };
11567
12074
 
11568
12075
  export {
12076
+ DEFAULT_MAX_PAGES,
12077
+ DEFAULT_MAX_DEPTH,
12078
+ DEFAULT_MAX_CONCURRENCY,
11569
12079
  setLogLevel,
11570
12080
  logger,
11571
12081
  HttpFetcher,
11572
12082
  FileFetcher,
11573
- HtmlProcessor,
12083
+ ScrapeMode,
11574
12084
  PipelineJobStatus,
11575
12085
  PipelineManager,
11576
12086
  CancelJobTool,
@@ -11585,4 +12095,4 @@ export {
11585
12095
  SearchTool,
11586
12096
  DocumentManagementService
11587
12097
  };
11588
- //# sourceMappingURL=chunk-ADZQJG2M.js.map
12098
+ //# sourceMappingURL=chunk-VTO2ED43.js.map