@arabold/docs-mcp-server 1.9.0 → 1.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -292,215 +292,13 @@ var FileFetcher = class {
292
292
  }
293
293
  };
294
294
 
295
- // src/scraper/processor/HtmlProcessor.ts
296
- import createDOMPurify from "dompurify";
297
- import { JSDOM } from "jsdom";
298
- import TurndownService from "turndown";
299
- var HtmlProcessor = class {
300
- turndownService;
301
- options;
302
- selectorsToRemove = [
303
- "nav",
304
- "footer",
305
- "script",
306
- "style",
307
- "noscript",
308
- "svg",
309
- "link",
310
- "meta",
311
- "iframe",
312
- "header",
313
- "button",
314
- "input",
315
- "textarea",
316
- "select",
317
- // "form", // Known issue: Some pages use alerts for important content
318
- ".ads",
319
- ".advertisement",
320
- ".banner",
321
- ".cookie-banner",
322
- ".cookie-consent",
323
- ".hidden",
324
- ".hide",
325
- ".modal",
326
- ".nav-bar",
327
- ".overlay",
328
- ".popup",
329
- ".promo",
330
- ".mw-editsection",
331
- ".side-bar",
332
- ".social-share",
333
- ".sticky",
334
- "#ads",
335
- "#banner",
336
- "#cookieBanner",
337
- "#modal",
338
- "#nav",
339
- "#overlay",
340
- "#popup",
341
- "#sidebar",
342
- "#socialMediaBox",
343
- "#stickyHeader",
344
- "#ad-container",
345
- ".ad-container",
346
- ".login-form",
347
- ".signup-form",
348
- ".tooltip",
349
- ".dropdown-menu",
350
- // ".alert", // Known issue: Some pages use alerts for important content
351
- ".breadcrumb",
352
- ".pagination",
353
- // '[role="alert"]', // Known issue: Some pages use alerts for important content
354
- '[role="banner"]',
355
- '[role="dialog"]',
356
- '[role="alertdialog"]',
357
- '[role="region"][aria-label*="skip" i]',
358
- '[aria-modal="true"]',
359
- ".noprint"
360
- ];
361
- constructor(options) {
362
- this.turndownService = new TurndownService({
363
- headingStyle: "atx",
364
- hr: "---",
365
- bulletListMarker: "-",
366
- codeBlockStyle: "fenced",
367
- emDelimiter: "_",
368
- strongDelimiter: "**",
369
- linkStyle: "inlined"
370
- });
371
- this.turndownService.addRule("pre", {
372
- filter: ["pre"],
373
- replacement: (content3, node2) => {
374
- const element = node2;
375
- let language = element.getAttribute("data-language") || "";
376
- if (!language) {
377
- const highlightElement = element.closest(
378
- '[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]'
379
- );
380
- if (highlightElement) {
381
- const className = highlightElement.className;
382
- const match = className.match(
383
- /(?:highlight-source-|highlight-|language-)(\w+)/
384
- );
385
- if (match) {
386
- language = match[1];
387
- }
388
- }
389
- }
390
- const text3 = (() => {
391
- const clone = element.cloneNode(true);
392
- const brElements = Array.from(clone.querySelectorAll("br"));
393
- for (const br of brElements) {
394
- br.replaceWith("\n");
395
- }
396
- return clone.textContent;
397
- })();
398
- return `
399
- \`\`\`${language}
400
- ${text3}
401
- \`\`\`
402
- `;
403
- }
404
- });
405
- this.turndownService.addRule("table", {
406
- filter: ["table"],
407
- replacement: (content3) => {
408
- const cleanedContent = content3.replace(/\n+/g, "\n");
409
- return `
410
-
411
- ${cleanedContent}
412
-
413
- `;
414
- }
415
- });
416
- this.options = options || {};
417
- }
418
- canProcess(content3) {
419
- return content3.mimeType.startsWith("text/html");
420
- }
421
- async process(content3) {
422
- if (!this.canProcess(content3)) {
423
- throw new ScraperError(
424
- `HtmlProcessor cannot process content of type ${content3.mimeType}`,
425
- false
426
- );
427
- }
428
- const htmlContent = typeof content3.content === "string" ? content3.content : content3.content.toString(content3.encoding || "utf-8");
429
- const window = new JSDOM(htmlContent, { url: content3.source }).window;
430
- const title = window.document.title || "Untitled";
431
- const purify = createDOMPurify(window);
432
- const purifiedContent = purify.sanitize(htmlContent, {
433
- WHOLE_DOCUMENT: true,
434
- RETURN_DOM: true
435
- });
436
- const linkElements = purifiedContent.querySelectorAll("a[href]");
437
- let links = [];
438
- if (this.options.extractLinks !== false) {
439
- links = Array.from(linkElements).map((el) => el.getAttribute("href")).filter((href) => href !== null).map((href) => {
440
- try {
441
- return new URL(href, content3.source).href;
442
- } catch {
443
- return null;
444
- }
445
- }).filter((url) => url !== null);
446
- }
447
- const selectorsToRemove = [
448
- ...this.options.excludeSelectors || [],
449
- ...this.selectorsToRemove
450
- ];
451
- for (const selector of selectorsToRemove) {
452
- const elements = purifiedContent.querySelectorAll(selector);
453
- for (const el of elements) {
454
- el.remove();
455
- }
456
- }
457
- const cleanedContent = purifiedContent.innerHTML;
458
- const markdown = this.turndownService.turndown(cleanedContent || "").trim();
459
- if (!markdown) {
460
- throw new ScraperError("No valid content found", false);
461
- }
462
- return {
463
- content: markdown,
464
- title,
465
- source: content3.source,
466
- links,
467
- metadata: {}
468
- };
469
- }
470
- };
471
-
472
- // src/scraper/processor/MarkdownProcessor.ts
473
- var MarkdownProcessor = class {
474
- canProcess(content3) {
475
- return content3.mimeType === "text/markdown" || content3.mimeType === "text/plain" || // Treat plain text as markdown
476
- content3.source.endsWith(".md");
477
- }
478
- async process(content3) {
479
- if (!this.canProcess(content3)) {
480
- throw new ScraperError(
481
- `MarkdownProcessor cannot process content of type ${content3.mimeType}`,
482
- false
483
- );
484
- }
485
- const markdownContent = typeof content3.content === "string" ? content3.content : content3.content.toString(content3.encoding || "utf-8");
486
- if (!markdownContent.trim()) {
487
- throw new ScraperError("Empty Markdown content", false);
488
- }
489
- const title = this.extractTitle(markdownContent) || "Untitled";
490
- return {
491
- content: markdownContent,
492
- title,
493
- source: content3.source,
494
- links: [],
495
- // TODO: Extract links from Markdown
496
- metadata: {}
497
- };
498
- }
499
- extractTitle(markdown) {
500
- const match = markdown.match(/^#\s+(.*)$/m);
501
- return match ? match[1].trim() : null;
502
- }
503
- };
295
+ // src/scraper/types.ts
296
+ var ScrapeMode = /* @__PURE__ */ ((ScrapeMode2) => {
297
+ ScrapeMode2["Fetch"] = "fetch";
298
+ ScrapeMode2["Playwright"] = "playwright";
299
+ ScrapeMode2["Auto"] = "auto";
300
+ return ScrapeMode2;
301
+ })(ScrapeMode || {});
504
302
 
505
303
  // node_modules/uuid/dist/esm-node/stringify.js
506
304
  var byteToHex = [];
@@ -609,6 +407,541 @@ function isSubpath(baseUrl, targetUrl) {
609
407
  return targetUrl.pathname.startsWith(basePath);
610
408
  }
611
409
 
410
+ // src/scraper/middleware/ContentProcessorPipeline.ts
411
+ var ContentProcessingPipeline = class {
412
+ middleware;
413
+ /**
414
+ * Creates an instance of ContentProcessingPipeline.
415
+ * @param middleware An array of middleware instances to execute in order.
416
+ */
417
+ constructor(middleware) {
418
+ this.middleware = middleware;
419
+ }
420
+ /**
421
+ * Executes the middleware pipeline with the given initial context.
422
+ * @param initialContext The starting context for the pipeline.
423
+ * @returns A promise that resolves with the final context after all middleware have executed.
424
+ */
425
+ async run(initialContext) {
426
+ let index2 = -1;
427
+ const dispatch = async (i) => {
428
+ if (i <= index2) {
429
+ throw new Error("next() called multiple times");
430
+ }
431
+ index2 = i;
432
+ const mw = this.middleware[i];
433
+ if (!mw) {
434
+ return;
435
+ }
436
+ const next = dispatch.bind(null, i + 1);
437
+ try {
438
+ await mw.process(initialContext, next);
439
+ } catch (error) {
440
+ initialContext.errors.push(
441
+ error instanceof Error ? error : new Error(String(error))
442
+ );
443
+ logger.warn(`Error in middleware pipeline: ${error}`);
444
+ }
445
+ };
446
+ await dispatch(0);
447
+ return initialContext;
448
+ }
449
+ };
450
+
451
+ // src/scraper/middleware/components/HtmlCheerioParserMiddleware.ts
452
+ import * as cheerio from "cheerio";
453
+ var HtmlCheerioParserMiddleware = class {
454
+ async process(context, next) {
455
+ if (!context.contentType.startsWith("text/html")) {
456
+ await next();
457
+ return;
458
+ }
459
+ const htmlString = typeof context.content === "string" ? context.content : Buffer.from(context.content).toString("utf-8");
460
+ try {
461
+ logger.debug(`Parsing HTML content with Cheerio from ${context.source}`);
462
+ const $ = cheerio.load(htmlString);
463
+ context.dom = $;
464
+ await next();
465
+ } catch (error) {
466
+ logger.error(`Failed to parse HTML with Cheerio for ${context.source}: ${error}`);
467
+ context.errors.push(
468
+ error instanceof Error ? error : new Error(`Cheerio HTML parsing failed: ${String(error)}`)
469
+ );
470
+ return;
471
+ }
472
+ }
473
+ };
474
+
475
+ // src/utils/dom.ts
476
+ import { JSDOM, VirtualConsole } from "jsdom";
477
+ function createJSDOM(html, options) {
478
+ const virtualConsole = new VirtualConsole();
479
+ virtualConsole.on("error", () => {
480
+ });
481
+ virtualConsole.on("warn", () => {
482
+ });
483
+ virtualConsole.on("info", () => {
484
+ });
485
+ virtualConsole.on("debug", () => {
486
+ });
487
+ virtualConsole.on("log", () => {
488
+ });
489
+ const defaultOptions = {
490
+ virtualConsole
491
+ };
492
+ const finalOptions = { ...defaultOptions, ...options };
493
+ return new JSDOM(html, finalOptions);
494
+ }
495
+
496
+ // src/scraper/middleware/components/HtmlLinkExtractorMiddleware.ts
497
+ var HtmlLinkExtractorMiddleware = class {
498
+ /**
499
+ * Processes the context to extract links from the sanitized HTML body.
500
+ * @param context The current processing context.
501
+ * @param next Function to call the next middleware.
502
+ */
503
+ async process(context, next) {
504
+ const $ = context.dom;
505
+ if (!$) {
506
+ if (context.contentType.startsWith("text/html")) {
507
+ logger.warn(
508
+ `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware runs before this.`
509
+ );
510
+ }
511
+ await next();
512
+ return;
513
+ }
514
+ try {
515
+ const linkElements = $("a[href]");
516
+ logger.debug(`Found ${linkElements.length} potential links in ${context.source}`);
517
+ const extractedLinks = [];
518
+ linkElements.each((index2, element) => {
519
+ const href = $(element).attr("href");
520
+ if (href && href.trim() !== "") {
521
+ try {
522
+ const urlObj = new URL(href, context.source);
523
+ if (!["http:", "https:", "file:"].includes(urlObj.protocol)) {
524
+ logger.debug(`Ignoring link with invalid protocol: ${href}`);
525
+ return;
526
+ }
527
+ extractedLinks.push(urlObj.href);
528
+ } catch (e) {
529
+ logger.debug(`Ignoring invalid URL syntax: ${href}`);
530
+ }
531
+ }
532
+ });
533
+ context.links = [...new Set(extractedLinks)];
534
+ logger.debug(
535
+ `Extracted ${context.links.length} unique, valid links from ${context.source}`
536
+ );
537
+ } catch (error) {
538
+ logger.error(`Error extracting links from ${context.source}: ${error}`);
539
+ context.errors.push(
540
+ new Error(
541
+ `Failed to extract links from HTML: ${error instanceof Error ? error.message : String(error)}`
542
+ )
543
+ );
544
+ }
545
+ await next();
546
+ }
547
+ };
548
+
549
+ // src/scraper/middleware/components/HtmlMetadataExtractorMiddleware.ts
550
+ var HtmlMetadataExtractorMiddleware = class {
551
+ /**
552
+ * Processes the context to extract the HTML title.
553
+ * @param context The current processing context.
554
+ * @param next Function to call the next middleware.
555
+ */
556
+ async process(context, next) {
557
+ const $ = context.dom;
558
+ if (!$) {
559
+ if (context.contentType.startsWith("text/html")) {
560
+ logger.warn(
561
+ `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware runs before this.`
562
+ );
563
+ }
564
+ await next();
565
+ return;
566
+ }
567
+ try {
568
+ let title = $("title").first().text().trim();
569
+ if (!title) {
570
+ title = $("h1").first().text().trim();
571
+ }
572
+ title = title || "Untitled";
573
+ title = title.replace(/\s+/g, " ").trim();
574
+ context.metadata.title = title;
575
+ logger.debug(`Extracted title: "${title}" from ${context.source}`);
576
+ } catch (error) {
577
+ logger.error(`Error extracting metadata from ${context.source}: ${error}`);
578
+ context.errors.push(
579
+ new Error(
580
+ `Failed to extract metadata from HTML: ${error instanceof Error ? error.message : String(error)}`
581
+ )
582
+ );
583
+ }
584
+ await next();
585
+ }
586
+ };
587
+
588
+ // src/scraper/middleware/components/HtmlPlaywrightMiddleware.ts
589
+ import { chromium } from "playwright";
590
+ var HtmlPlaywrightMiddleware = class {
591
+ browser = null;
592
+ /**
593
+ * Initializes the Playwright browser instance.
594
+ * Consider making this more robust (e.g., lazy initialization, singleton).
595
+ */
596
+ async ensureBrowser() {
597
+ if (!this.browser || !this.browser.isConnected()) {
598
+ const launchArgs = process.env.PLAYWRIGHT_LAUNCH_ARGS?.split(" ") ?? [];
599
+ logger.debug(
600
+ `Launching new Playwright browser instance (Chromium) with args: ${launchArgs.join(" ") || "none"}...`
601
+ );
602
+ this.browser = await chromium.launch({ channel: "chromium", args: launchArgs });
603
+ this.browser.on("disconnected", () => {
604
+ logger.debug("Playwright browser instance disconnected.");
605
+ this.browser = null;
606
+ });
607
+ }
608
+ return this.browser;
609
+ }
610
+ /**
611
+ * Closes the Playwright browser instance if it exists.
612
+ * Should be called during application shutdown.
613
+ */
614
+ async closeBrowser() {
615
+ if (this.browser?.isConnected()) {
616
+ logger.debug("Closing Playwright browser instance...");
617
+ await this.browser.close();
618
+ this.browser = null;
619
+ }
620
+ }
621
+ async process(context, next) {
622
+ if (!context.contentType.startsWith("text/html")) {
623
+ await next();
624
+ return;
625
+ }
626
+ const scrapeMode = context.options?.scrapeMode ?? "auto" /* Auto */;
627
+ const shouldRunPlaywright = scrapeMode === "playwright" /* Playwright */ || scrapeMode === "auto" /* Auto */;
628
+ if (!shouldRunPlaywright) {
629
+ logger.debug(
630
+ `Skipping Playwright rendering for ${context.source} as scrapeMode is '${scrapeMode}'.`
631
+ );
632
+ await next();
633
+ return;
634
+ }
635
+ logger.debug(
636
+ `Running Playwright rendering for ${context.source} (scrapeMode: '${scrapeMode}')`
637
+ );
638
+ let page = null;
639
+ let renderedHtml = null;
640
+ try {
641
+ const browser = await this.ensureBrowser();
642
+ page = await browser.newPage();
643
+ logger.debug(`Playwright: Processing ${context.source}`);
644
+ await page.route("**/*", (route) => {
645
+ if (route.request().url() === context.source) {
646
+ return route.fulfill({
647
+ status: 200,
648
+ contentType: context.contentType,
649
+ body: context.content
650
+ });
651
+ }
652
+ const resourceType = route.request().resourceType();
653
+ if (["image", "stylesheet", "font", "media"].includes(resourceType)) {
654
+ return route.abort();
655
+ }
656
+ return route.continue();
657
+ });
658
+ await page.goto(context.source, {
659
+ waitUntil: "load"
660
+ });
661
+ renderedHtml = await page.content();
662
+ logger.debug(`Playwright: Successfully rendered content for ${context.source}`);
663
+ } catch (error) {
664
+ logger.error(`Playwright failed to render ${context.source}: ${error}`);
665
+ context.errors.push(
666
+ error instanceof Error ? error : new Error(`Playwright rendering failed: ${String(error)}`)
667
+ );
668
+ } finally {
669
+ if (page) {
670
+ await page.unroute("**/*");
671
+ await page.close();
672
+ }
673
+ }
674
+ if (renderedHtml !== null) {
675
+ context.content = renderedHtml;
676
+ logger.debug(
677
+ `Playwright middleware updated content for ${context.source}. Proceeding.`
678
+ );
679
+ } else {
680
+ logger.warn(
681
+ `Playwright rendering resulted in null content for ${context.source}. Proceeding without content update.`
682
+ );
683
+ }
684
+ await next();
685
+ }
686
+ };
687
+
688
+ // src/scraper/middleware/components/HtmlSanitizerMiddleware.ts
689
+ var HtmlSanitizerMiddleware = class {
690
+ // Default selectors to remove
691
+ defaultSelectorsToRemove = [
692
+ "nav",
693
+ "footer",
694
+ "script",
695
+ "style",
696
+ "noscript",
697
+ "svg",
698
+ "link",
699
+ "meta",
700
+ "iframe",
701
+ "header",
702
+ "button",
703
+ "input",
704
+ "textarea",
705
+ "select",
706
+ // "form", // Keep commented
707
+ ".ads",
708
+ ".advertisement",
709
+ ".banner",
710
+ ".cookie-banner",
711
+ ".cookie-consent",
712
+ ".hidden",
713
+ ".hide",
714
+ ".modal",
715
+ ".nav-bar",
716
+ ".overlay",
717
+ ".popup",
718
+ ".promo",
719
+ ".mw-editsection",
720
+ ".side-bar",
721
+ ".social-share",
722
+ ".sticky",
723
+ "#ads",
724
+ "#banner",
725
+ "#cookieBanner",
726
+ "#modal",
727
+ "#nav",
728
+ "#overlay",
729
+ "#popup",
730
+ "#sidebar",
731
+ "#socialMediaBox",
732
+ "#stickyHeader",
733
+ "#ad-container",
734
+ ".ad-container",
735
+ ".login-form",
736
+ ".signup-form",
737
+ ".tooltip",
738
+ ".dropdown-menu",
739
+ // ".alert", // Keep commented
740
+ ".breadcrumb",
741
+ ".pagination",
742
+ // '[role="alert"]', // Keep commented
743
+ '[role="banner"]',
744
+ '[role="dialog"]',
745
+ '[role="alertdialog"]',
746
+ '[role="region"][aria-label*="skip" i]',
747
+ '[aria-modal="true"]',
748
+ ".noprint"
749
+ ];
750
+ async process(context, next) {
751
+ const $ = context.dom;
752
+ if (!$) {
753
+ if (context.contentType.startsWith("text/html")) {
754
+ logger.warn(
755
+ `Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`
756
+ );
757
+ }
758
+ await next();
759
+ return;
760
+ }
761
+ try {
762
+ const selectorsToRemove = [
763
+ ...context.options.excludeSelectors || [],
764
+ // Use options from the context
765
+ ...this.defaultSelectorsToRemove
766
+ ];
767
+ logger.debug(
768
+ `Removing elements matching ${selectorsToRemove.length} selectors for ${context.source}`
769
+ );
770
+ let removedCount = 0;
771
+ for (const selector of selectorsToRemove) {
772
+ try {
773
+ const elements = $(selector);
774
+ const count = elements.length;
775
+ if (count > 0) {
776
+ elements.remove();
777
+ removedCount += count;
778
+ }
779
+ } catch (selectorError) {
780
+ logger.warn(
781
+ `Potentially invalid selector "${selector}" during element removal: ${selectorError}`
782
+ );
783
+ context.errors.push(
784
+ new Error(`Invalid selector "${selector}": ${selectorError}`)
785
+ );
786
+ }
787
+ }
788
+ logger.debug(`Removed ${removedCount} elements for ${context.source}`);
789
+ } catch (error) {
790
+ logger.error(`Error during HTML element removal for ${context.source}: ${error}`);
791
+ context.errors.push(
792
+ error instanceof Error ? error : new Error(`HTML element removal failed: ${String(error)}`)
793
+ );
794
+ }
795
+ await next();
796
+ }
797
+ };
798
+
799
+ // src/scraper/middleware/components/HtmlToMarkdownMiddleware.ts
800
+ import { gfm } from "@joplin/turndown-plugin-gfm";
801
+ import TurndownService from "turndown";
802
+ var HtmlToMarkdownMiddleware = class {
803
+ turndownService;
804
+ constructor() {
805
+ this.turndownService = new TurndownService({
806
+ headingStyle: "atx",
807
+ hr: "---",
808
+ bulletListMarker: "-",
809
+ codeBlockStyle: "fenced",
810
+ emDelimiter: "_",
811
+ strongDelimiter: "**",
812
+ linkStyle: "inlined"
813
+ });
814
+ this.turndownService.use(gfm);
815
+ this.addCustomRules();
816
+ }
817
+ addCustomRules() {
818
+ this.turndownService.addRule("pre", {
819
+ filter: ["pre"],
820
+ replacement: (content3, node2) => {
821
+ const element = node2;
822
+ let language = element.getAttribute("data-language") || "";
823
+ if (!language) {
824
+ const highlightElement = element.closest(
825
+ '[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]'
826
+ ) || element.querySelector(
827
+ '[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]'
828
+ );
829
+ if (highlightElement) {
830
+ const className = highlightElement.className;
831
+ const match = className.match(
832
+ /(?:highlight-source-|highlight-|language-)(\w+)/
833
+ );
834
+ if (match) language = match[1];
835
+ }
836
+ }
837
+ const brElements = element.querySelectorAll("br");
838
+ if (brElements.length > 0) {
839
+ for (const br of brElements) {
840
+ br.replaceWith("\n");
841
+ }
842
+ }
843
+ const text3 = element.textContent || "";
844
+ return `
845
+ \`\`\`${language}
846
+ ${text3.replace(/^\n+|\n+$/g, "")}
847
+ \`\`\`
848
+ `;
849
+ }
850
+ });
851
+ }
852
+ /**
853
+ * Processes the context to convert the sanitized HTML body node to Markdown.
854
+ * @param context The current processing context.
855
+ * @param next Function to call the next middleware.
856
+ */
857
+ async process(context, next) {
858
+ const $ = context.dom;
859
+ if (!$) {
860
+ if (context.contentType.startsWith("text/html")) {
861
+ logger.warn(
862
+ `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware ran correctly.`
863
+ );
864
+ }
865
+ await next();
866
+ return;
867
+ }
868
+ try {
869
+ logger.debug(`Converting HTML content to Markdown for ${context.source}`);
870
+ const htmlToConvert = $("body").html() || $.html();
871
+ const markdown = this.turndownService.turndown(htmlToConvert).trim();
872
+ if (!markdown) {
873
+ const warnMsg = `HTML to Markdown conversion resulted in empty content for ${context.source}.`;
874
+ logger.warn(warnMsg);
875
+ context.content = "";
876
+ context.contentType = "text/markdown";
877
+ } else {
878
+ context.content = markdown;
879
+ context.contentType = "text/markdown";
880
+ logger.debug(`Successfully converted HTML to Markdown for ${context.source}`);
881
+ }
882
+ } catch (error) {
883
+ logger.error(`Error converting HTML to Markdown for ${context.source}: ${error}`);
884
+ context.errors.push(
885
+ new Error(
886
+ `Failed to convert HTML to Markdown: ${error instanceof Error ? error.message : String(error)}`
887
+ )
888
+ );
889
+ }
890
+ await next();
891
+ }
892
+ };
893
+
894
+ // src/scraper/middleware/components/MarkdownLinkExtractorMiddleware.ts
895
+ var MarkdownLinkExtractorMiddleware = class {
896
+ /**
897
+ * Processes the context. Currently a no-op regarding link extraction.
898
+ * @param context The current processing context.
899
+ * @param next Function to call the next middleware.
900
+ */
901
+ async process(context, next) {
902
+ if (context.contentType === "text/markdown") {
903
+ if (!Array.isArray(context.links)) {
904
+ context.links = [];
905
+ }
906
+ }
907
+ await next();
908
+ }
909
+ };
910
+
911
+ // src/scraper/middleware/components/MarkdownMetadataExtractorMiddleware.ts
912
+ var MarkdownMetadataExtractorMiddleware = class {
913
+ /**
914
+ * Processes the context to extract the title from Markdown.
915
+ * @param context The current processing context.
916
+ * @param next Function to call the next middleware.
917
+ */
918
+ async process(context, next) {
919
+ if (context.contentType === "text/markdown" || context.contentType === "text/plain") {
920
+ try {
921
+ const textContent = typeof context.content === "string" ? context.content : Buffer.from(context.content).toString("utf-8");
922
+ if (typeof context.content !== "string") {
923
+ context.content = textContent;
924
+ }
925
+ let title = "Untitled";
926
+ if (context.contentType === "text/markdown") {
927
+ const match = textContent.match(/^#\s+(.*)$/m);
928
+ if (match?.[1]) {
929
+ title = match[1].trim();
930
+ }
931
+ }
932
+ context.metadata.title = title;
933
+ } catch (error) {
934
+ context.errors.push(
935
+ new Error(
936
+ `Failed to extract metadata from Markdown: ${error instanceof Error ? error.message : String(error)}`
937
+ )
938
+ );
939
+ }
940
+ }
941
+ await next();
942
+ }
943
+ };
944
+
612
945
  // src/scraper/strategies/BaseScraperStrategy.ts
613
946
  import { URL as URL2 } from "node:url";
614
947
 
@@ -643,12 +976,7 @@ var BaseScraperStrategy = class {
643
976
  constructor(options = {}) {
644
977
  this.options = options;
645
978
  }
646
- getProcessor(mimeType) {
647
- if (mimeType.startsWith("text/html")) {
648
- return new HtmlProcessor();
649
- }
650
- return new MarkdownProcessor();
651
- }
979
+ // Removed getProcessor method as processing is now handled by strategies using middleware pipelines
652
980
  async processBatch(batch, baseUrl, options, progressCallback, signal) {
653
981
  const results = await Promise.all(
654
982
  batch.map(async (item) => {
@@ -749,9 +1077,12 @@ var BaseScraperStrategy = class {
749
1077
  var WebScraperStrategy = class extends BaseScraperStrategy {
750
1078
  httpFetcher = new HttpFetcher();
751
1079
  shouldFollowLinkFn;
1080
+ playwrightMiddleware;
1081
+ // Add member
752
1082
  constructor(options = {}) {
753
1083
  super({ urlNormalizerOptions: options.urlNormalizerOptions });
754
1084
  this.shouldFollowLinkFn = options.shouldFollowLink;
1085
+ this.playwrightMiddleware = new HtmlPlaywrightMiddleware();
755
1086
  }
756
1087
  canHandle(url) {
757
1088
  try {
@@ -785,12 +1116,56 @@ var WebScraperStrategy = class extends BaseScraperStrategy {
785
1116
  followRedirects: options.followRedirects
786
1117
  };
787
1118
  const rawContent = await this.httpFetcher.fetch(url, fetchOptions);
788
- const processor = this.getProcessor(rawContent.mimeType);
789
- const result = await processor.process(rawContent);
1119
+ const initialContext = {
1120
+ content: rawContent.content,
1121
+ contentType: rawContent.mimeType,
1122
+ source: rawContent.source,
1123
+ // Use the final source URL after redirects
1124
+ metadata: {},
1125
+ links: [],
1126
+ errors: [],
1127
+ options,
1128
+ fetcher: this.httpFetcher
1129
+ };
1130
+ let pipeline;
1131
+ if (initialContext.contentType.startsWith("text/html")) {
1132
+ const htmlPipelineSteps = [
1133
+ this.playwrightMiddleware,
1134
+ // Use the instance member
1135
+ // TODO: Add HtmlJsExecutorMiddleware here if needed based on options
1136
+ new HtmlCheerioParserMiddleware(),
1137
+ // Always runs after content is finalized
1138
+ new HtmlMetadataExtractorMiddleware(),
1139
+ new HtmlLinkExtractorMiddleware(),
1140
+ new HtmlSanitizerMiddleware(),
1141
+ // Element remover
1142
+ new HtmlToMarkdownMiddleware()
1143
+ ];
1144
+ pipeline = new ContentProcessingPipeline(htmlPipelineSteps);
1145
+ } else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain") {
1146
+ pipeline = new ContentProcessingPipeline([
1147
+ new MarkdownMetadataExtractorMiddleware(),
1148
+ new MarkdownLinkExtractorMiddleware()
1149
+ // Placeholder for now
1150
+ ]);
1151
+ } else {
1152
+ logger.warn(
1153
+ `Unsupported content type "${initialContext.contentType}" for URL ${url}. Skipping processing.`
1154
+ );
1155
+ return { document: void 0, links: [] };
1156
+ }
1157
+ const finalContext = await pipeline.run(initialContext);
1158
+ for (const err of finalContext.errors) {
1159
+ logger.warn(`Processing error for ${url}: ${err.message}`);
1160
+ }
1161
+ if (typeof finalContext.content !== "string" || !finalContext.content.trim()) {
1162
+ logger.warn(`No processable content found for ${url} after pipeline execution.`);
1163
+ return { document: void 0, links: finalContext.links };
1164
+ }
790
1165
  const baseUrl = new URL(options.url);
791
- const links = result.links.filter((link) => {
1166
+ const filteredLinks = finalContext.links.filter((link) => {
792
1167
  try {
793
- const targetUrl = new URL(link, baseUrl);
1168
+ const targetUrl = new URL(link);
794
1169
  const scope = options.scope || "subpages";
795
1170
  return this.isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
796
1171
  } catch {
@@ -799,21 +1174,37 @@ var WebScraperStrategy = class extends BaseScraperStrategy {
799
1174
  });
800
1175
  return {
801
1176
  document: {
802
- content: result.content,
1177
+ content: finalContext.content,
1178
+ // Final processed content (Markdown)
803
1179
  metadata: {
804
- url: result.source,
805
- title: result.title,
1180
+ url: finalContext.source,
1181
+ // URL after redirects
1182
+ // Ensure title is a string, default to "Untitled"
1183
+ title: typeof finalContext.metadata.title === "string" ? finalContext.metadata.title : "Untitled",
806
1184
  library: options.library,
807
1185
  version: options.version
1186
+ // Add other metadata from context if needed
808
1187
  }
809
1188
  },
810
- links
1189
+ links: filteredLinks
1190
+ // Use the filtered links
811
1191
  };
812
1192
  } catch (error) {
813
- logger.error(`Failed to scrape page ${url}: ${error}`);
1193
+ logger.error(`Failed processing page ${url}: ${error}`);
814
1194
  throw error;
815
1195
  }
816
1196
  }
1197
+ /**
1198
+ * Overrides the base scrape method to ensure the Playwright browser is closed
1199
+ * after the scraping process completes or errors out.
1200
+ */
1201
+ async scrape(options, progressCallback, signal) {
1202
+ try {
1203
+ await super.scrape(options, progressCallback, signal);
1204
+ } finally {
1205
+ await this.playwrightMiddleware.closeBrowser();
1206
+ }
1207
+ }
817
1208
  };
818
1209
 
819
1210
  // src/scraper/strategies/GitHubScraperStrategy.ts
@@ -883,18 +1274,58 @@ var LocalFileStrategy = class extends BaseScraperStrategy {
883
1274
  }
884
1275
  logger.info(`\u{1F4C4} Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`);
885
1276
  const rawContent = await this.fileFetcher.fetch(item.url);
886
- const processor = this.getProcessor(rawContent.mimeType);
887
- const result = await processor.process(rawContent);
1277
+ const initialContext = {
1278
+ content: rawContent.content,
1279
+ contentType: rawContent.mimeType,
1280
+ source: rawContent.source,
1281
+ // file:// URL
1282
+ metadata: {},
1283
+ links: [],
1284
+ // LocalFileStrategy doesn't extract links from file content itself
1285
+ errors: [],
1286
+ options
1287
+ // Pass the full options object
1288
+ };
1289
+ let pipeline;
1290
+ if (initialContext.contentType.startsWith("text/html")) {
1291
+ pipeline = new ContentProcessingPipeline([
1292
+ new HtmlCheerioParserMiddleware(),
1293
+ new HtmlMetadataExtractorMiddleware(),
1294
+ // No HtmlLinkExtractorMiddleware needed for local files
1295
+ new HtmlSanitizerMiddleware(),
1296
+ new HtmlToMarkdownMiddleware()
1297
+ ]);
1298
+ } else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain" || // Treat plain text as markdown
1299
+ initialContext.contentType.startsWith("text/")) {
1300
+ pipeline = new ContentProcessingPipeline([
1301
+ new MarkdownMetadataExtractorMiddleware()
1302
+ // No MarkdownLinkExtractorMiddleware needed for local files
1303
+ ]);
1304
+ } else {
1305
+ logger.warn(
1306
+ `Unsupported content type "${initialContext.contentType}" for file ${filePath}. Skipping processing.`
1307
+ );
1308
+ return { document: void 0, links: [] };
1309
+ }
1310
+ const finalContext = await pipeline.run(initialContext);
1311
+ for (const err of finalContext.errors) {
1312
+ logger.warn(`Processing error for ${filePath}: ${err.message}`);
1313
+ }
1314
+ const finalContentString = typeof finalContext.content === "string" ? finalContext.content : Buffer.from(finalContext.content).toString("utf-8");
888
1315
  return {
889
1316
  document: {
890
- content: result.content,
1317
+ // Use the potentially empty string content
1318
+ content: finalContentString,
891
1319
  metadata: {
892
- url: item.url,
893
- title: result.title,
1320
+ url: finalContext.source,
1321
+ // Use context source (file:// URL)
1322
+ // Ensure title is a string, default to "Untitled"
1323
+ title: typeof finalContext.metadata.title === "string" ? finalContext.metadata.title : "Untitled",
894
1324
  library: options.library,
895
1325
  version: options.version
896
1326
  }
897
1327
  }
1328
+ // No links returned from file content processing
898
1329
  };
899
1330
  }
900
1331
  async scrape(options, progressCallback, signal) {
@@ -1007,7 +1438,7 @@ var PipelineWorker = class {
1007
1438
  async executeJob(job, callbacks) {
1008
1439
  const { id: jobId, library, version, options, abortController } = job;
1009
1440
  const signal = abortController.signal;
1010
- logger.info(`[${jobId}] Worker starting job for ${library}@${version}`);
1441
+ logger.debug(`[${jobId}] Worker starting job for ${library}@${version}`);
1011
1442
  try {
1012
1443
  await this.scraperService.scrape(
1013
1444
  options,
@@ -1327,14 +1758,13 @@ var LibraryNotFoundError = class extends ToolError {
1327
1758
 
1328
1759
  // src/tools/FetchUrlTool.ts
1329
1760
  var FetchUrlTool = class {
1330
- constructor(httpFetcher, fileFetcher, processor) {
1331
- this.processor = processor;
1332
- this.fetchers = [httpFetcher, fileFetcher];
1333
- }
1334
1761
  /**
1335
1762
  * Collection of fetchers that will be tried in order for a given URL.
1336
1763
  */
1337
1764
  fetchers;
1765
+ constructor(httpFetcher, fileFetcher) {
1766
+ this.fetchers = [httpFetcher, fileFetcher];
1767
+ }
1338
1768
  /**
1339
1769
  * Fetches content from a URL and converts it to Markdown.
1340
1770
  * Supports both HTTP/HTTPS URLs and local file URLs (file://).
@@ -1342,7 +1772,7 @@ var FetchUrlTool = class {
1342
1772
  * @throws {ToolError} If fetching or processing fails
1343
1773
  */
1344
1774
  async execute(options) {
1345
- const { url } = options;
1775
+ const { url, scrapeMode = "auto" /* Auto */ } = options;
1346
1776
  const canFetchResults = this.fetchers.map((f) => f.canFetch(url));
1347
1777
  const fetcherIndex = canFetchResults.findIndex((result) => result === true);
1348
1778
  if (fetcherIndex === -1) {
@@ -1352,18 +1782,88 @@ var FetchUrlTool = class {
1352
1782
  );
1353
1783
  }
1354
1784
  const fetcher = this.fetchers[fetcherIndex];
1785
+ const playwrightMiddleware = new HtmlPlaywrightMiddleware();
1355
1786
  try {
1356
1787
  logger.info(`\u{1F4E1} Fetching ${url}...`);
1357
1788
  const rawContent = await fetcher.fetch(url, {
1358
1789
  followRedirects: options.followRedirects ?? true,
1359
1790
  maxRetries: 3
1791
+ // Keep retries for fetching
1360
1792
  });
1361
- logger.info("\u{1F504} Converting to Markdown...");
1362
- const processed = await this.processor.process(rawContent);
1363
- logger.info(`\u2705 Successfully converted ${url} to Markdown`);
1364
- return processed.content;
1793
+ logger.info("\u{1F504} Processing content...");
1794
+ const initialContext = {
1795
+ content: rawContent.content,
1796
+ contentType: rawContent.mimeType,
1797
+ source: rawContent.source,
1798
+ metadata: {},
1799
+ links: [],
1800
+ // Links not needed for this tool's output
1801
+ errors: [],
1802
+ fetcher,
1803
+ // Create a minimal ScraperOptions object for the context
1804
+ options: {
1805
+ url,
1806
+ // Use the input URL
1807
+ library: "",
1808
+ // Not applicable for this tool
1809
+ version: "",
1810
+ // Use empty string instead of undefined
1811
+ // Default other options as needed by middleware
1812
+ maxDepth: 0,
1813
+ maxPages: 1,
1814
+ maxConcurrency: 1,
1815
+ scope: "subpages",
1816
+ // Default, though not used for single page fetch
1817
+ followRedirects: options.followRedirects ?? true,
1818
+ excludeSelectors: void 0,
1819
+ // Not currently configurable via this tool
1820
+ ignoreErrors: false,
1821
+ scrapeMode
1822
+ // Pass the scrapeMode
1823
+ }
1824
+ };
1825
+ let pipeline;
1826
+ if (initialContext.contentType.startsWith("text/html")) {
1827
+ const htmlPipelineSteps = [
1828
+ playwrightMiddleware,
1829
+ // Use the instantiated middleware
1830
+ new HtmlCheerioParserMiddleware(),
1831
+ // Always runs after content is finalized
1832
+ new HtmlMetadataExtractorMiddleware(),
1833
+ // Keep for potential future use
1834
+ // No Link Extractor needed for this tool
1835
+ new HtmlSanitizerMiddleware(),
1836
+ // Element remover
1837
+ new HtmlToMarkdownMiddleware()
1838
+ ];
1839
+ pipeline = new ContentProcessingPipeline(htmlPipelineSteps);
1840
+ } else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain") {
1841
+ pipeline = new ContentProcessingPipeline([
1842
+ new MarkdownMetadataExtractorMiddleware()
1843
+ // Extract title (though not used)
1844
+ // No further processing needed for Markdown/Plain text for this tool
1845
+ ]);
1846
+ } else {
1847
+ logger.warn(
1848
+ `Unsupported content type "${initialContext.contentType}" for ${url}. Returning raw content.`
1849
+ );
1850
+ const contentString = typeof rawContent.content === "string" ? rawContent.content : Buffer.from(rawContent.content).toString("utf-8");
1851
+ return contentString;
1852
+ }
1853
+ const finalContext = await pipeline.run(initialContext);
1854
+ for (const err of finalContext.errors) {
1855
+ logger.warn(`Processing error for ${url}: ${err.message}`);
1856
+ }
1857
+ if (typeof finalContext.content !== "string" || !finalContext.content.trim()) {
1858
+ throw new ToolError(
1859
+ `Processing resulted in empty content for ${url}`,
1860
+ this.constructor.name
1861
+ );
1862
+ }
1863
+ logger.info(`\u2705 Successfully processed ${url}`);
1864
+ return finalContext.content;
1365
1865
  } catch (error) {
1366
- if (error instanceof ScraperError) {
1866
+ if (error instanceof ScraperError || error instanceof ToolError) {
1367
1867
  throw new ToolError(
1368
1868
  `Failed to fetch or process URL: ${error.message}`,
1369
1869
  this.constructor.name
@@ -1373,6 +1873,8 @@ var FetchUrlTool = class {
1373
1873
  `Failed to fetch or process URL: ${error instanceof Error ? error.message : String(error)}`,
1374
1874
  this.constructor.name
1375
1875
  );
1876
+ } finally {
1877
+ await playwrightMiddleware.closeBrowser();
1376
1878
  }
1377
1879
  }
1378
1880
  };
@@ -1496,7 +1998,9 @@ var ScrapeTool = class {
1496
1998
  maxPages: scraperOptions?.maxPages ?? DEFAULT_MAX_PAGES,
1497
1999
  maxDepth: scraperOptions?.maxDepth ?? DEFAULT_MAX_DEPTH,
1498
2000
  maxConcurrency: scraperOptions?.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY,
1499
- ignoreErrors: scraperOptions?.ignoreErrors ?? true
2001
+ ignoreErrors: scraperOptions?.ignoreErrors ?? true,
2002
+ scrapeMode: scraperOptions?.scrapeMode ?? "auto" /* Auto */
2003
+ // Pass scrapeMode enum
1500
2004
  });
1501
2005
  logger.info(`\u{1F680} Job ${jobId} enqueued for scraping.`);
1502
2006
  options.onProgress?.({
@@ -1784,7 +2288,6 @@ import Fuse from "fuse.js";
1784
2288
  import semver3 from "semver";
1785
2289
 
1786
2290
  // src/splitter/SemanticMarkdownSplitter.ts
1787
- import { JSDOM as JSDOM2 } from "jsdom";
1788
2291
  import { RecursiveCharacterTextSplitter as RecursiveCharacterTextSplitter2 } from "langchain/text_splitter";
1789
2292
  import remarkGfm from "remark-gfm";
1790
2293
  import remarkHtml from "remark-html";
@@ -10601,7 +11104,7 @@ ${"```"}`;
10601
11104
  * Parse HTML
10602
11105
  */
10603
11106
  async parseHtml(html) {
10604
- const { window } = new JSDOM2(html);
11107
+ const { window } = createJSDOM(html);
10605
11108
  return window.document;
10606
11109
  }
10607
11110
  };
@@ -11577,7 +12080,7 @@ export {
11577
12080
  logger,
11578
12081
  HttpFetcher,
11579
12082
  FileFetcher,
11580
- HtmlProcessor,
12083
+ ScrapeMode,
11581
12084
  PipelineJobStatus,
11582
12085
  PipelineManager,
11583
12086
  CancelJobTool,
@@ -11592,4 +12095,4 @@ export {
11592
12095
  SearchTool,
11593
12096
  DocumentManagementService
11594
12097
  };
11595
- //# sourceMappingURL=chunk-A5FW7XVC.js.map
12098
+ //# sourceMappingURL=chunk-VTO2ED43.js.map