npm - @arabold/docs-mcp-server - Versions diffs - 1.8.0 → 1.10.0 - Mend

@arabold/docs-mcp-server 1.8.0 → 1.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/README.md +6 -2
package/dist/{chunk-ADZQJG2M.js → chunk-VTO2ED43.js} +764 -254
package/dist/chunk-VTO2ED43.js.map +1 -0
package/dist/cli.js +60 -16
package/dist/cli.js.map +1 -1
package/dist/server.js +7 -9
package/dist/server.js.map +1 -1
package/package.json +5 -6
package/dist/chunk-ADZQJG2M.js.map +0 -1

package/dist/{chunk-ADZQJG2M.js → chunk-VTO2ED43.js} (renamed)

@@ -100,6 +100,11 @@ var require_extend = __commonJS({
   }
 });
+// src/config.ts
+var DEFAULT_MAX_PAGES = 1e3; // 1000 pages; BaseScraperStrategy further down still falls back to its own DEFAULT_MAX_PAGES2 = 100
+var DEFAULT_MAX_DEPTH = 3; // same value as DEFAULT_MAX_DEPTH2 in BaseScraperStrategy
+var DEFAULT_MAX_CONCURRENCY = 3; // same value as DEFAULT_CONCURRENCY in BaseScraperStrategy; users of these three constants are not visible in this excerpt
 // src/utils/logger.ts
 var currentLogLevel = 2 /* INFO */;
 function setLogLevel(level) {
@@ -287,216 +292,13 @@ var FileFetcher = class {
   }
 };
-// src/scraper/processor/HtmlProcessor.ts
-import createDOMPurify from "dompurify";
-import { JSDOM } from "jsdom";
-import TurndownService from "turndown";
-var HtmlProcessor = class {
-  turndownService;
-  options;
-  selectorsToRemove = [
-    "nav",
-    "footer",
-    "script",
-    "style",
-    "noscript",
-    "svg",
-    "link",
-    "meta",
-    "iframe",
-    "header",
-    "button",
-    "input",
-    "textarea",
-    "select",
-    // "form", // Known issue: Some pages use alerts for important content
-    ".ads",
-    ".advertisement",
-    ".banner",
-    ".cookie-banner",
-    ".cookie-consent",
-    ".hidden",
-    ".hide",
-    ".modal",
-    ".nav-bar",
-    ".overlay",
-    ".popup",
-    ".promo",
-    ".mw-editsection",
-    ".side-bar",
-    ".social-share",
-    ".sticky",
-    "#ads",
-    "#banner",
-    "#cookieBanner",
-    "#modal",
-    "#nav",
-    "#overlay",
-    "#popup",
-    "#sidebar",
-    "#socialMediaBox",
-    "#stickyHeader",
-    "#ad-container",
-    ".ad-container",
-    ".login-form",
-    ".signup-form",
-    ".tooltip",
-    ".dropdown-menu",
-    // ".alert", // Known issue: Some pages use alerts for important content
-    ".breadcrumb",
-    ".pagination",
-    // '[role="alert"]', // Known issue: Some pages use alerts for important content
-    '[role="banner"]',
-    '[role="dialog"]',
-    '[role="alertdialog"]',
-    '[role="region"][aria-label*="skip" i]',
-    '[aria-modal="true"]',
-    ".noprint"
-  ];
-  constructor(options) {
-    this.turndownService = new TurndownService({
-      headingStyle: "atx",
-      hr: "---",
-      bulletListMarker: "-",
-      codeBlockStyle: "fenced",
-      emDelimiter: "_",
-      strongDelimiter: "**",
-      linkStyle: "inlined"
-    });
-    this.turndownService.addRule("pre", {
-      filter: ["pre"],
-      replacement: (content3, node2) => {
-        const element = node2;
-        let language = element.getAttribute("data-language") || "";
-        if (!language) {
-          const highlightElement = element.closest(
-            '[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]'
-          );
-          if (highlightElement) {
-            const className = highlightElement.className;
-            const match = className.match(
-              /(?:highlight-source-|highlight-|language-)(\w+)/
-            );
-            if (match) {
-              language = match[1];
-            }
-          }
-        }
-        const text3 = (() => {
-          const clone = element.cloneNode(true);
-          const brElements = Array.from(clone.querySelectorAll("br"));
-          for (const br of brElements) {
-            br.replaceWith("\n");
-          }
-          return clone.textContent;
-        })();
-        return `
-\`\`\`${language}
-${text3}
-\`\`\`
-`;
-      }
-    });
-    this.turndownService.addRule("table", {
-      filter: ["table"],
-      replacement: (content3) => {
-        const cleanedContent = content3.replace(/\n+/g, "\n");
-        return `
-${cleanedContent}
-`;
-      }
-    });
-    this.options = options || {};
-  }
-  canProcess(content3) {
-    return content3.mimeType.startsWith("text/html");
-  }
-  async process(content3) {
-    if (!this.canProcess(content3)) {
-      throw new ScraperError(
-        `HtmlProcessor cannot process content of type ${content3.mimeType}`,
-        false
-      );
-    }
-    const htmlContent = typeof content3.content === "string" ? content3.content : content3.content.toString(content3.encoding || "utf-8");
-    const titleMatch = htmlContent.match(/<title>([^<]+)<\/title>/i);
-    const title = titleMatch?.[1] || "Untitled";
-    const window = new JSDOM(content3.content, { url: content3.source }).window;
-    const purify = createDOMPurify(window);
-    const purifiedContent = purify.sanitize(htmlContent, {
-      WHOLE_DOCUMENT: true,
-      RETURN_DOM: true
-    });
-    const linkElements = purifiedContent.querySelectorAll("a[href]");
-    let links = [];
-    if (this.options.extractLinks !== false) {
-      links = Array.from(linkElements).map((el) => el.getAttribute("href")).filter((href) => href !== null).map((href) => {
-        try {
-          return new URL(href, content3.source).href;
-        } catch {
-          return null;
-        }
-      }).filter((url) => url !== null);
-    }
-    const selectorsToRemove = [
-      ...this.options.excludeSelectors || [],
-      ...this.selectorsToRemove
-    ];
-    for (const selector of selectorsToRemove) {
-      const elements = purifiedContent.querySelectorAll(selector);
-      for (const el of elements) {
-        el.remove();
-      }
-    }
-    const cleanedContent = purifiedContent.innerHTML;
-    const markdown = this.turndownService.turndown(cleanedContent || "").trim();
-    if (!markdown) {
-      throw new ScraperError("No valid content found", false);
-    }
-    return {
-      content: markdown,
-      title,
-      source: content3.source,
-      links,
-      metadata: {}
-    };
-  }
-};
-// src/scraper/processor/MarkdownProcessor.ts
-var MarkdownProcessor = class {
-  canProcess(content3) {
-    return content3.mimeType === "text/markdown" || content3.mimeType === "text/plain" || // Treat plain text as markdown
-    content3.source.endsWith(".md");
-  }
-  async process(content3) {
-    if (!this.canProcess(content3)) {
-      throw new ScraperError(
-        `MarkdownProcessor cannot process content of type ${content3.mimeType}`,
-        false
-      );
-    }
-    const markdownContent = typeof content3.content === "string" ? content3.content : content3.content.toString(content3.encoding || "utf-8");
-    if (!markdownContent.trim()) {
-      throw new ScraperError("Empty Markdown content", false);
-    }
-    const title = this.extractTitle(markdownContent) || "Untitled";
-    return {
-      content: markdownContent,
-      title,
-      source: content3.source,
-      links: [],
-      // TODO: Extract links from Markdown
-      metadata: {}
-    };
-  }
-  extractTitle(markdown) {
-    const match = markdown.match(/^#\s+(.*)$/m);
-    return match ? match[1].trim() : null;
-  }
-};
+// src/scraper/types.ts
+var ScrapeMode = /* @__PURE__ */ ((ScrapeMode2) => { // compiled string enum: each member name maps to a lowercase string value
+  ScrapeMode2["Fetch"] = "fetch"; // HtmlPlaywrightMiddleware skips browser rendering for this value
+  ScrapeMode2["Playwright"] = "playwright"; // HtmlPlaywrightMiddleware renders the page in Chromium
+  ScrapeMode2["Auto"] = "auto"; // used by HtmlPlaywrightMiddleware when options.scrapeMode is unset; currently handled the same as playwright there
+  return ScrapeMode2;
+})(ScrapeMode || {});
 // node_modules/uuid/dist/esm-node/stringify.js
 var byteToHex = [];
@@ -605,6 +407,541 @@ function isSubpath(baseUrl, targetUrl) {
   return targetUrl.pathname.startsWith(basePath);
 }
+// src/scraper/middleware/ContentProcessorPipeline.ts
+var ContentProcessingPipeline = class { // runs an ordered list of middleware against one shared, mutable context object
+  middleware;
+  /**
+   * Creates an instance of ContentProcessingPipeline.
+   * @param middleware An array of middleware instances to execute in order.
+   */
+  constructor(middleware) {
+    this.middleware = middleware;
+  }
+  /**
+   * Executes the middleware pipeline with the given initial context.
+   * @param initialContext The starting context for the pipeline; it is mutated in place and errors are pushed onto its errors array.
+   * @returns A promise that resolves with that same context object. Errors thrown by a middleware are recorded and logged, not rethrown, and middleware after index i only run if middleware i calls next().
+   */
+  async run(initialContext) {
+    let index2 = -1; // highest middleware index dispatched so far
+    const dispatch = async (i) => {
+      if (i <= index2) { // this index was already dispatched, i.e. a middleware invoked next() a second time
+        throw new Error("next() called multiple times");
+      }
+      index2 = i;
+      const mw = this.middleware[i];
+      if (!mw) { // past the end of the list: nothing left to run
+        return;
+      }
+      const next = dispatch.bind(null, i + 1); // continuation handed to the middleware
+      try {
+        await mw.process(initialContext, next);
+      } catch (error) { // a failure in mw.process is recorded at its own level and does not propagate to upstream middleware
+        initialContext.errors.push(
+          error instanceof Error ? error : new Error(String(error))
+        );
+        logger.warn(`Error in middleware pipeline: ${error}`);
+      }
+    };
+    await dispatch(0);
+    return initialContext;
+  }
+};
+// src/scraper/middleware/components/HtmlCheerioParserMiddleware.ts
+import * as cheerio from "cheerio";
+var HtmlCheerioParserMiddleware = class { // parses HTML content with cheerio and publishes the loaded document as context.dom
+  async process(context, next) {
+    if (!context.contentType.startsWith("text/html")) { // non-HTML content passes through untouched
+      await next();
+      return;
+    }
+    const htmlString = typeof context.content === "string" ? context.content : Buffer.from(context.content).toString("utf-8"); // non-string content is decoded as UTF-8
+    try {
+      logger.debug(`Parsing HTML content with Cheerio from ${context.source}`);
+      const $ = cheerio.load(htmlString);
+      context.dom = $; // the cheerio root function; the other HTML middleware read it from here
+      await next(); // inside the try, so anything thrown by next() would also land in the catch below
+    } catch (error) {
+      logger.error(`Failed to parse HTML with Cheerio for ${context.source}: ${error}`);
+      context.errors.push(
+        error instanceof Error ? error : new Error(`Cheerio HTML parsing failed: ${String(error)}`)
+      );
+      return; // when cheerio.load throws, next() is never called, so later middleware do not run
+    }
+  }
+};
+// src/utils/dom.ts
+import { JSDOM, VirtualConsole } from "jsdom";
+function createJSDOM(html, options) { // returns new JSDOM(html, ...) with a default virtualConsole that the caller may override
+  const virtualConsole = new VirtualConsole();
+  virtualConsole.on("error", () => { // no-op listener; presumably registered to silence page console output — confirm against the jsdom VirtualConsole docs
+  });
+  virtualConsole.on("warn", () => { // no-op listener
+  });
+  virtualConsole.on("info", () => { // no-op listener
+  });
+  virtualConsole.on("debug", () => { // no-op listener
+  });
+  virtualConsole.on("log", () => { // no-op listener
+  });
+  const defaultOptions = {
+    virtualConsole
+  };
+  const finalOptions = { ...defaultOptions, ...options }; // caller options are spread last, so they win over the defaults (including virtualConsole)
+  return new JSDOM(html, finalOptions);
+}
+// src/scraper/middleware/components/HtmlLinkExtractorMiddleware.ts
+var HtmlLinkExtractorMiddleware = class { // collects absolute http, https and file links from context.dom into context.links
+  /**
+   * Processes the context to extract links from every a[href] element of the parsed document in context.dom (not only the body, and independent of any sanitizing step).
+   * @param context The current processing context.
+   * @param next Function to call the next middleware.
+   */
+  async process(context, next) {
+    const $ = context.dom;
+    if (!$) { // nothing was parsed: warn only when the content claims to be HTML, then pass through
+      if (context.contentType.startsWith("text/html")) {
+        logger.warn(
+          `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware runs before this.`
+        );
+      }
+      await next();
+      return;
+    }
+    try {
+      const linkElements = $("a[href]");
+      logger.debug(`Found ${linkElements.length} potential links in ${context.source}`);
+      const extractedLinks = [];
+      linkElements.each((index2, element) => {
+        const href = $(element).attr("href");
+        if (href && href.trim() !== "") { // skip empty and whitespace-only href values
+          try {
+            const urlObj = new URL(href, context.source); // relative hrefs are resolved against the page source URL
+            if (!["http:", "https:", "file:"].includes(urlObj.protocol)) { // drops schemes such as mailto or javascript
+              logger.debug(`Ignoring link with invalid protocol: ${href}`);
+              return;
+            }
+            extractedLinks.push(urlObj.href);
+          } catch (e) { // URL constructor rejected the href; skip it
+            logger.debug(`Ignoring invalid URL syntax: ${href}`);
+          }
+        }
+      });
+      context.links = [...new Set(extractedLinks)]; // de-duplicated, first-seen order; replaces any links already on the context
+      logger.debug(
+        `Extracted ${context.links.length} unique, valid links from ${context.source}`
+      );
+    } catch (error) { // failure is recorded on the context; the chain still continues below
+      logger.error(`Error extracting links from ${context.source}: ${error}`);
+      context.errors.push(
+        new Error(
+          `Failed to extract links from HTML: ${error instanceof Error ? error.message : String(error)}`
+        )
+      );
+    }
+    await next();
+  }
+};
+// src/scraper/middleware/components/HtmlMetadataExtractorMiddleware.ts
+var HtmlMetadataExtractorMiddleware = class { // writes context.metadata.title from the parsed HTML document
+  /**
+   * Processes the context to extract the HTML title: first title element, else first h1, else the literal Untitled.
+   * @param context The current processing context.
+   * @param next Function to call the next middleware.
+   */
+  async process(context, next) {
+    const $ = context.dom;
+    if (!$) { // nothing was parsed: warn only when the content claims to be HTML, then pass through
+      if (context.contentType.startsWith("text/html")) {
+        logger.warn(
+          `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware runs before this.`
+        );
+      }
+      await next();
+      return;
+    }
+    try {
+      let title = $("title").first().text().trim();
+      if (!title) { // no usable title element: fall back to the first h1
+        title = $("h1").first().text().trim();
+      }
+      title = title || "Untitled";
+      title = title.replace(/\s+/g, " ").trim(); // collapse internal whitespace runs, including newlines, to single spaces
+      context.metadata.title = title;
+      logger.debug(`Extracted title: "${title}" from ${context.source}`);
+    } catch (error) { // failure is recorded on the context; the chain still continues below
+      logger.error(`Error extracting metadata from ${context.source}: ${error}`);
+      context.errors.push(
+        new Error(
+          `Failed to extract metadata from HTML: ${error instanceof Error ? error.message : String(error)}`
+        )
+      );
+    }
+    await next();
+  }
+};
+// src/scraper/middleware/components/HtmlPlaywrightMiddleware.ts
+import { chromium } from "playwright";
+var HtmlPlaywrightMiddleware = class { // re-renders already-fetched HTML in headless Chromium and replaces context.content with the rendered markup
+  browser = null;
+  /**
+   * Returns the Playwright browser held by this instance, launching Chromium when there is none or the current one is disconnected.
+   * Consider making this more robust (e.g., lazy initialization, singleton).
+   */
+  async ensureBrowser() {
+    if (!this.browser || !this.browser.isConnected()) {
+      const launchArgs = process.env.PLAYWRIGHT_LAUNCH_ARGS?.split(" ") ?? []; // NOTE(review): splitting on a single space turns an empty or double-spaced value into empty-string args — confirm Chromium tolerates that
+      logger.debug(
+        `Launching new Playwright browser instance (Chromium) with args: ${launchArgs.join(" ") || "none"}...`
+      );
+      this.browser = await chromium.launch({ channel: "chromium", args: launchArgs });
+      this.browser.on("disconnected", () => { // drop the reference so the next ensureBrowser call relaunches
+        logger.debug("Playwright browser instance disconnected.");
+        this.browser = null;
+      });
+    }
+    return this.browser;
+  }
+  /**
+   * Closes the Playwright browser instance if it exists and is still connected; otherwise does nothing.
+   * Should be called during application shutdown.
+   */
+  async closeBrowser() {
+    if (this.browser?.isConnected()) {
+      logger.debug("Closing Playwright browser instance...");
+      await this.browser.close();
+      this.browser = null;
+    }
+  }
+  async process(context, next) { // renders context.content for HTML when scrapeMode allows it; always calls next() unless cleanup in the finally block throws
+    if (!context.contentType.startsWith("text/html")) { // non-HTML content passes through untouched
+      await next();
+      return;
+    }
+    const scrapeMode = context.options?.scrapeMode ?? "auto" /* Auto */; // missing options or missing scrapeMode means auto
+    const shouldRunPlaywright = scrapeMode === "playwright" /* Playwright */ || scrapeMode === "auto" /* Auto */; // auto currently always renders, same as playwright
+    if (!shouldRunPlaywright) {
+      logger.debug(
+        `Skipping Playwright rendering for ${context.source} as scrapeMode is '${scrapeMode}'.`
+      );
+      await next();
+      return;
+    }
+    logger.debug(
+      `Running Playwright rendering for ${context.source} (scrapeMode: '${scrapeMode}')`
+    );
+    let page = null;
+    let renderedHtml = null; // stays null when rendering fails
+    try {
+      const browser = await this.ensureBrowser();
+      page = await browser.newPage();
+      logger.debug(`Playwright: Processing ${context.source}`);
+      await page.route("**/*", (route) => { // intercept every request made by the page
+        if (route.request().url() === context.source) { // NOTE(review): exact string comparison; a differently normalized URL would miss this branch and hit the network — confirm
+          return route.fulfill({ // serve the already-fetched content instead of requesting the page again
+            status: 200,
+            contentType: context.contentType,
+            body: context.content
+          });
+        }
+        const resourceType = route.request().resourceType();
+        if (["image", "stylesheet", "font", "media"].includes(resourceType)) { // these resource types are blocked
+          return route.abort();
+        }
+        return route.continue(); // everything else (scripts, XHR and so on) goes to the network
+      });
+      await page.goto(context.source, {
+        waitUntil: "load"
+      });
+      renderedHtml = await page.content(); // serialized HTML of the page after load
+      logger.debug(`Playwright: Successfully rendered content for ${context.source}`);
+    } catch (error) { // rendering failure is recorded; the original content is kept and the chain continues
+      logger.error(`Playwright failed to render ${context.source}: ${error}`);
+      context.errors.push(
+        error instanceof Error ? error : new Error(`Playwright rendering failed: ${String(error)}`)
+      );
+    } finally {
+      if (page) { // NOTE(review): an exception from unroute or close here would propagate out of process and skip next()
+        await page.unroute("**/*");
+        await page.close();
+      }
+    }
+    if (renderedHtml !== null) {
+      context.content = renderedHtml; // content is now a string of rendered HTML; contentType is left unchanged
+      logger.debug(
+        `Playwright middleware updated content for ${context.source}. Proceeding.`
+      );
+    } else {
+      logger.warn(
+        `Playwright rendering resulted in null content for ${context.source}. Proceeding without content update.`
+      );
+    }
+    await next();
+  }
+};
+// src/scraper/middleware/components/HtmlSanitizerMiddleware.ts
+var HtmlSanitizerMiddleware = class { // removes elements matching a selector list from context.dom; no other sanitizing (such as the DOMPurify pass used in 1.8.0) happens in this class
+  // Default selectors to remove; context.options.excludeSelectors are removed in addition to these
+  defaultSelectorsToRemove = [
+    "nav",
+    "footer",
+    "script",
+    "style",
+    "noscript",
+    "svg",
+    "link",
+    "meta",
+    "iframe",
+    "header",
+    "button",
+    "input",
+    "textarea",
+    "select",
+    // "form", // Keep commented (also disabled in 1.8.0)
+    ".ads",
+    ".advertisement",
+    ".banner",
+    ".cookie-banner",
+    ".cookie-consent",
+    ".hidden",
+    ".hide",
+    ".modal",
+    ".nav-bar",
+    ".overlay",
+    ".popup",
+    ".promo",
+    ".mw-editsection",
+    ".side-bar",
+    ".social-share",
+    ".sticky",
+    "#ads",
+    "#banner",
+    "#cookieBanner",
+    "#modal",
+    "#nav",
+    "#overlay",
+    "#popup",
+    "#sidebar",
+    "#socialMediaBox",
+    "#stickyHeader",
+    "#ad-container",
+    ".ad-container",
+    ".login-form",
+    ".signup-form",
+    ".tooltip",
+    ".dropdown-menu",
+    // ".alert", // Keep commented: some pages use alerts for important content
+    ".breadcrumb",
+    ".pagination",
+    // '[role="alert"]', // Keep commented: some pages use alerts for important content
+    '[role="banner"]',
+    '[role="dialog"]',
+    '[role="alertdialog"]',
+    '[role="region"][aria-label*="skip" i]',
+    '[aria-modal="true"]',
+    ".noprint"
+  ];
+  async process(context, next) { // mutates context.dom in place; context.content is not touched here
+    const $ = context.dom;
+    if (!$) { // nothing was parsed: warn only when the content claims to be HTML, then pass through
+      if (context.contentType.startsWith("text/html")) {
+        logger.warn(
+          `Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`
+        );
+      }
+      await next();
+      return;
+    }
+    try {
+      const selectorsToRemove = [
+        ...context.options.excludeSelectors || [], // NOTE(review): context.options is read without optional chaining here; if it were undefined the TypeError lands in the outer catch and nothing is removed
+        // Use options from the context
+        ...this.defaultSelectorsToRemove
+      ];
+      logger.debug(
+        `Removing elements matching ${selectorsToRemove.length} selectors for ${context.source}`
+      );
+      let removedCount = 0; // counts matched elements per selector, used only for the debug log
+      for (const selector of selectorsToRemove) {
+        try {
+          const elements = $(selector);
+          const count = elements.length;
+          if (count > 0) {
+            elements.remove();
+            removedCount += count;
+          }
+        } catch (selectorError) { // a bad selector is recorded and skipped; the remaining selectors still run
+          logger.warn(
+            `Potentially invalid selector "${selector}" during element removal: ${selectorError}`
+          );
+          context.errors.push(
+            new Error(`Invalid selector "${selector}": ${selectorError}`)
+          );
+        }
+      }
+      logger.debug(`Removed ${removedCount} elements for ${context.source}`);
+    } catch (error) { // failure is recorded on the context; the chain still continues below
+      logger.error(`Error during HTML element removal for ${context.source}: ${error}`);
+      context.errors.push(
+        error instanceof Error ? error : new Error(`HTML element removal failed: ${String(error)}`)
+      );
+    }
+    await next();
+  }
+};
+// src/scraper/middleware/components/HtmlToMarkdownMiddleware.ts
+import { gfm } from "@joplin/turndown-plugin-gfm";
+import TurndownService from "turndown";
+var HtmlToMarkdownMiddleware = class { // converts the HTML in context.dom to Markdown with Turndown and stores it as context.content
+  turndownService;
+  constructor() { // builds one Turndown instance per middleware instance
+    this.turndownService = new TurndownService({
+      headingStyle: "atx",
+      hr: "---",
+      bulletListMarker: "-",
+      codeBlockStyle: "fenced",
+      emDelimiter: "_",
+      strongDelimiter: "**",
+      linkStyle: "inlined"
+    });
+    this.turndownService.use(gfm); // plugin imported from @joplin/turndown-plugin-gfm
+    this.addCustomRules();
+  }
+  addCustomRules() { // registers a rule that renders pre elements as fenced code blocks with a detected language tag
+    this.turndownService.addRule("pre", {
+      filter: ["pre"],
+      replacement: (content3, node2) => { // content3 (the already-converted child Markdown) is ignored; the raw text of the node is used instead
+        const element = node2;
+        let language = element.getAttribute("data-language") || "";
+        if (!language) { // no explicit attribute: look for a highlight or language class on the element, an ancestor, or else a descendant
+          const highlightElement = element.closest(
+            '[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]'
+          ) || element.querySelector(
+            '[class*="highlight-source-"], [class*="highlight-"], [class*="language-"]'
+          );
+          if (highlightElement) {
+            const className = highlightElement.className;
+            const match = className.match(
+              /(?:highlight-source-|highlight-|language-)(\w+)/
+            );
+            if (match) language = match[1]; // first run of word characters after the prefix
+          }
+        }
+        const brElements = element.querySelectorAll("br");
+        if (brElements.length > 0) {
+          for (const br of brElements) { // NOTE(review): this replaces br nodes on the live element, whereas the 1.8.0 rule worked on a clone — confirm the in-place mutation is intended
+            br.replaceWith("\n");
+          }
+        }
+        const text3 = element.textContent || ""; // leading and trailing newlines are stripped in the template below
+        return `
+\`\`\`${language}
+${text3.replace(/^\n+|\n+$/g, "")}
+\`\`\`
+`;
+      }
+    });
+  }
+  /**
+   * Processes the context to convert the HTML held in context.dom (body inner HTML, else the whole document) to Markdown.
+   * @param context The current processing context.
+   * @param next Function to call the next middleware.
+   */
+  async process(context, next) {
+    const $ = context.dom;
+    if (!$) { // nothing was parsed: warn only when the content claims to be HTML, then pass through
+      if (context.contentType.startsWith("text/html")) {
+        logger.warn(
+          `Skipping ${this.constructor.name}: context.dom is missing for HTML content. Ensure HtmlCheerioParserMiddleware ran correctly.`
+        );
+      }
+      await next();
+      return;
+    }
+    try {
+      logger.debug(`Converting HTML content to Markdown for ${context.source}`);
+      const htmlToConvert = $("body").html() || $.html(); // falls back to the full document when the body HTML is empty or absent
+      const markdown = this.turndownService.turndown(htmlToConvert).trim();
+      if (!markdown) { // empty result is not an error: content becomes an empty string and callers decide what to do
+        const warnMsg = `HTML to Markdown conversion resulted in empty content for ${context.source}.`;
+        logger.warn(warnMsg);
+        context.content = "";
+        context.contentType = "text/markdown";
+      } else {
+        context.content = markdown;
+        context.contentType = "text/markdown"; // downstream middleware now see Markdown, not HTML
+        logger.debug(`Successfully converted HTML to Markdown for ${context.source}`);
+      }
+    } catch (error) { // on failure content and contentType keep their previous values; the chain still continues below
+      logger.error(`Error converting HTML to Markdown for ${context.source}: ${error}`);
+      context.errors.push(
+        new Error(
+          `Failed to convert HTML to Markdown: ${error instanceof Error ? error.message : String(error)}`
+        )
+      );
+    }
+    await next();
+  }
+};
+// src/scraper/middleware/components/MarkdownLinkExtractorMiddleware.ts
+var MarkdownLinkExtractorMiddleware = class { // placeholder: does not parse Markdown, only guarantees context.links is an array
+  /**
+   * Processes the context. Currently a no-op regarding link extraction; for text/markdown content it only initializes context.links to an empty array when it is not already an array.
+   * @param context The current processing context.
+   * @param next Function to call the next middleware.
+   */
+  async process(context, next) {
+    if (context.contentType === "text/markdown") { // exact match only; text/plain is left untouched here
+      if (!Array.isArray(context.links)) {
+        context.links = [];
+      }
+    }
+    await next();
+  }
+};
+// src/scraper/middleware/components/MarkdownMetadataExtractorMiddleware.ts
+var MarkdownMetadataExtractorMiddleware = class { // sets context.metadata.title for Markdown and plain-text content and normalizes content to a string
+  /**
+   * Processes the context to extract the title from Markdown: the text of the first level-one heading line, else the literal Untitled. Plain text always gets Untitled.
+   * @param context The current processing context.
+   * @param next Function to call the next middleware.
+   */
+  async process(context, next) {
+    if (context.contentType === "text/markdown" || context.contentType === "text/plain") { // exact matches only; other content types pass through untouched
+      try {
+        const textContent = typeof context.content === "string" ? context.content : Buffer.from(context.content).toString("utf-8"); // non-string content is decoded as UTF-8
+        if (typeof context.content !== "string") { // side effect: content is replaced by its decoded string form
+          context.content = textContent;
+        }
+        let title = "Untitled";
+        if (context.contentType === "text/markdown") { // heading lookup is skipped for text/plain
+          const match = textContent.match(/^#\s+(.*)$/m); // first line that starts with a single # followed by whitespace
+          if (match?.[1]) {
+            title = match[1].trim();
+          }
+        }
+        context.metadata.title = title;
+      } catch (error) { // failure is recorded on the context without logging; the chain still continues below
+        context.errors.push(
+          new Error(
+            `Failed to extract metadata from Markdown: ${error instanceof Error ? error.message : String(error)}`
+          )
+        );
+      }
+    }
+    await next();
+  }
+};
 // src/scraper/strategies/BaseScraperStrategy.ts
 import { URL as URL2 } from "node:url";
@@ -629,8 +966,8 @@ var CancellationError = class extends PipelineError {
 };
 // src/scraper/strategies/BaseScraperStrategy.ts
-var DEFAULT_MAX_PAGES = 100;
-var DEFAULT_MAX_DEPTH = 3;
+var DEFAULT_MAX_PAGES2 = 100; // NOTE(review): the numeric suffix looks like a bundler rename to avoid clashing with DEFAULT_MAX_PAGES = 1e3 from src/config.ts; the two defaults differ (100 vs 1000) — confirm which is intended
+var DEFAULT_MAX_DEPTH2 = 3; // same value as DEFAULT_MAX_DEPTH from src/config.ts
 var DEFAULT_CONCURRENCY = 3;
 var BaseScraperStrategy = class {
   visited = /* @__PURE__ */ new Set();
@@ -639,19 +976,14 @@ var BaseScraperStrategy = class {
   constructor(options = {}) { // options are stored as-is; the visible code reads this.options.urlNormalizerOptions when normalizing URLs
     this.options = options;
   }
-  getProcessor(mimeType) {
-    if (mimeType.startsWith("text/html")) {
-      return new HtmlProcessor();
-    }
-    return new MarkdownProcessor();
-  }
+  // Removed getProcessor method as processing is now handled by strategies using middleware pipelines
   async processBatch(batch, baseUrl, options, progressCallback, signal) {
     const results = await Promise.all(
       batch.map(async (item) => {
         if (signal?.aborted) {
           throw new CancellationError("Scraping cancelled during batch processing");
         }
-        const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH;
+        const maxDepth = options.maxDepth ?? DEFAULT_MAX_DEPTH2;
         if (item.depth > maxDepth) {
           return [];
         }
@@ -659,7 +991,7 @@ var BaseScraperStrategy = class {
           const result = await this.processItem(item, options, void 0, signal);
           if (result.document) {
             this.pageCount++;
-            const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
+            const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES2;
             logger.info(
               `\u{1F310} Scraping page ${this.pageCount}/${maxPages} (depth ${item.depth}/${maxDepth}): ${item.url}`
             );
@@ -711,7 +1043,7 @@ var BaseScraperStrategy = class {
     const baseUrl = new URL2(options.url);
     const queue = [{ url: options.url, depth: 0 }];
     this.visited.add(normalizeUrl(options.url, this.options.urlNormalizerOptions));
-    const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES;
+    const maxPages = options.maxPages ?? DEFAULT_MAX_PAGES2;
     const maxConcurrency = options.maxConcurrency ?? DEFAULT_CONCURRENCY;
     while (queue.length > 0 && this.pageCount < maxPages) {
       if (signal?.aborted) {
@@ -745,9 +1077,12 @@ var BaseScraperStrategy = class {
 var WebScraperStrategy = class extends BaseScraperStrategy {
   httpFetcher = new HttpFetcher();
   shouldFollowLinkFn;
+  playwrightMiddleware;
+  // Add member
   constructor(options = {}) { // options.urlNormalizerOptions goes to the base class; options.shouldFollowLink is kept as an optional link filter
     super({ urlNormalizerOptions: options.urlNormalizerOptions });
     this.shouldFollowLinkFn = options.shouldFollowLink;
+    this.playwrightMiddleware = new HtmlPlaywrightMiddleware(); // one middleware instance per strategy; its browser is launched on first use and closed in scrape()
   }
   canHandle(url) {
     try {
@@ -781,12 +1116,56 @@ var WebScraperStrategy = class extends BaseScraperStrategy {
         followRedirects: options.followRedirects
       };
       const rawContent = await this.httpFetcher.fetch(url, fetchOptions);
-      const processor = this.getProcessor(rawContent.mimeType);
-      const result = await processor.process(rawContent);
+      const initialContext = {
+        content: rawContent.content,
+        contentType: rawContent.mimeType,
+        source: rawContent.source,
+        // Use the final source URL after redirects
+        metadata: {},
+        links: [],
+        errors: [],
+        options,
+        fetcher: this.httpFetcher
+      };
+      let pipeline;
+      if (initialContext.contentType.startsWith("text/html")) {
+        const htmlPipelineSteps = [
+          this.playwrightMiddleware,
+          // Use the instance member
+          // TODO: Add HtmlJsExecutorMiddleware here if needed based on options
+          new HtmlCheerioParserMiddleware(),
+          // Always runs after content is finalized
+          new HtmlMetadataExtractorMiddleware(),
+          new HtmlLinkExtractorMiddleware(),
+          new HtmlSanitizerMiddleware(),
+          // Element remover
+          new HtmlToMarkdownMiddleware()
+        ];
+        pipeline = new ContentProcessingPipeline(htmlPipelineSteps);
+      } else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain") {
+        pipeline = new ContentProcessingPipeline([
+          new MarkdownMetadataExtractorMiddleware(),
+          new MarkdownLinkExtractorMiddleware()
+          // Placeholder for now
+        ]);
+      } else {
+        logger.warn(
+          `Unsupported content type "${initialContext.contentType}" for URL ${url}. Skipping processing.`
+        );
+        return { document: void 0, links: [] };
+      }
+      const finalContext = await pipeline.run(initialContext);
+      for (const err of finalContext.errors) {
+        logger.warn(`Processing error for ${url}: ${err.message}`);
+      }
+      if (typeof finalContext.content !== "string" || !finalContext.content.trim()) {
+        logger.warn(`No processable content found for ${url} after pipeline execution.`);
+        return { document: void 0, links: finalContext.links };
+      }
       const baseUrl = new URL(options.url);
-      const links = result.links.filter((link) => {
+      const filteredLinks = finalContext.links.filter((link) => {
         try {
-          const targetUrl = new URL(link, baseUrl);
+          const targetUrl = new URL(link);
           const scope = options.scope || "subpages";
           return this.isInScope(baseUrl, targetUrl, scope) && (!this.shouldFollowLinkFn || this.shouldFollowLinkFn(baseUrl, targetUrl));
         } catch {
@@ -795,21 +1174,37 @@ var WebScraperStrategy = class extends BaseScraperStrategy {
       });
       return {
         document: {
-          content: result.content,
+          content: finalContext.content,
+          // Final processed content (Markdown)
           metadata: {
-            url: result.source,
-            title: result.title,
+            url: finalContext.source,
+            // URL after redirects
+            // Ensure title is a string, default to "Untitled"
+            title: typeof finalContext.metadata.title === "string" ? finalContext.metadata.title : "Untitled",
             library: options.library,
             version: options.version
+            // Add other metadata from context if needed
           }
         },
-        links
+        links: filteredLinks
+        // Use the filtered links
       };
     } catch (error) {
-      logger.error(`Failed to scrape page ${url}: ${error}`);
+      logger.error(`Failed processing page ${url}: ${error}`);
       throw error;
     }
   }
+  /**
+   * Overrides the base scrape method to ensure the Playwright browser is closed
+   * after the scraping process completes or errors out.
+   */
+  async scrape(options, progressCallback, signal) {
+    try {
+      await super.scrape(options, progressCallback, signal); // NOTE(review): the awaited result is discarded — confirm the base method resolves with no value
+    } finally {
+      await this.playwrightMiddleware.closeBrowser(); // runs on success and on error; closeBrowser does nothing when no browser is connected
+    }
+  }
 };
 // src/scraper/strategies/GitHubScraperStrategy.ts
@@ -879,18 +1274,58 @@ var LocalFileStrategy = class extends BaseScraperStrategy {
     }
     logger.info(`\u{1F4C4} Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`);
     const rawContent = await this.fileFetcher.fetch(item.url);
-    const processor = this.getProcessor(rawContent.mimeType);
-    const result = await processor.process(rawContent);
+    const initialContext = {
+      content: rawContent.content,
+      contentType: rawContent.mimeType,
+      source: rawContent.source,
+      // file:// URL
+      metadata: {},
+      links: [],
+      // LocalFileStrategy doesn't extract links from file content itself
+      errors: [],
+      options
+      // Pass the full options object
+    };
+    let pipeline;
+    if (initialContext.contentType.startsWith("text/html")) {
+      pipeline = new ContentProcessingPipeline([
+        new HtmlCheerioParserMiddleware(),
+        new HtmlMetadataExtractorMiddleware(),
+        // No HtmlLinkExtractorMiddleware needed for local files
+        new HtmlSanitizerMiddleware(),
+        new HtmlToMarkdownMiddleware()
+      ]);
+    } else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain" || // Treat plain text as markdown
+    initialContext.contentType.startsWith("text/")) {
+      pipeline = new ContentProcessingPipeline([
+        new MarkdownMetadataExtractorMiddleware()
+        // No MarkdownLinkExtractorMiddleware needed for local files
+      ]);
+    } else {
+      logger.warn(
+        `Unsupported content type "${initialContext.contentType}" for file ${filePath}. Skipping processing.`
+      );
+      return { document: void 0, links: [] };
+    }
+    const finalContext = await pipeline.run(initialContext);
+    for (const err of finalContext.errors) {
+      logger.warn(`Processing error for ${filePath}: ${err.message}`);
+    }
+    const finalContentString = typeof finalContext.content === "string" ? finalContext.content : Buffer.from(finalContext.content).toString("utf-8");
     return {
       document: {
-        content: result.content,
+        // Use the potentially empty string content
+        content: finalContentString,
         metadata: {
-          url: item.url,
-          title: result.title,
+          url: finalContext.source,
+          // Use context source (file:// URL)
+          // Ensure title is a string, default to "Untitled"
+          title: typeof finalContext.metadata.title === "string" ? finalContext.metadata.title : "Untitled",
           library: options.library,
           version: options.version
         }
       }
+      // No links returned from file content processing
     };
   }
   async scrape(options, progressCallback, signal) {
@@ -1003,7 +1438,7 @@ var PipelineWorker = class {
   async executeJob(job, callbacks) {
     const { id: jobId, library, version, options, abortController } = job;
     const signal = abortController.signal;
-    logger.info(`[${jobId}] Worker starting job for ${library}@${version}`);
+    logger.debug(`[${jobId}] Worker starting job for ${library}@${version}`);
     try {
       await this.scraperService.scrape(
         options,
@@ -1323,14 +1758,13 @@ var LibraryNotFoundError = class extends ToolError {
 // src/tools/FetchUrlTool.ts
 var FetchUrlTool = class {
-  constructor(httpFetcher, fileFetcher, processor) {
-    this.processor = processor;
-    this.fetchers = [httpFetcher, fileFetcher];
-  }
   /**
    * Collection of fetchers that will be tried in order for a given URL.
    */
   fetchers;
+  constructor(httpFetcher, fileFetcher) { // Only the two fetchers are injected; nothing else is stored on the instance here.
+    this.fetchers = [httpFetcher, fileFetcher]; // Order is significant: execute() uses the first entry whose canFetch(url) is true.
+  }
   /**
    * Fetches content from a URL and converts it to Markdown.
    * Supports both HTTP/HTTPS URLs and local file URLs (file://).
@@ -1338,7 +1772,7 @@ var FetchUrlTool = class {
    * @throws {ToolError} If fetching or processing fails
    */
   async execute(options) {
-    const { url } = options;
+    const { url, scrapeMode = "auto" /* Auto */ } = options;
     const canFetchResults = this.fetchers.map((f) => f.canFetch(url));
     const fetcherIndex = canFetchResults.findIndex((result) => result === true);
     if (fetcherIndex === -1) {
@@ -1348,18 +1782,88 @@ var FetchUrlTool = class {
       );
     }
     const fetcher = this.fetchers[fetcherIndex];
+    const playwrightMiddleware = new HtmlPlaywrightMiddleware();
     try {
       logger.info(`\u{1F4E1} Fetching ${url}...`);
       const rawContent = await fetcher.fetch(url, {
         followRedirects: options.followRedirects ?? true,
         maxRetries: 3
+        // Keep retries for fetching
       });
-      logger.info("\u{1F504} Converting to Markdown...");
-      const processed = await this.processor.process(rawContent);
-      logger.info(`\u2705 Successfully converted ${url} to Markdown`);
-      return processed.content;
+      logger.info("\u{1F504} Processing content...");
+      const initialContext = {
+        content: rawContent.content,
+        contentType: rawContent.mimeType,
+        source: rawContent.source,
+        metadata: {},
+        links: [],
+        // Links not needed for this tool's output
+        errors: [],
+        fetcher,
+        // Create a minimal ScraperOptions object for the context
+        options: {
+          url,
+          // Use the input URL
+          library: "",
+          // Not applicable for this tool
+          version: "",
+          // Use empty string instead of undefined
+          // Default other options as needed by middleware
+          maxDepth: 0,
+          maxPages: 1,
+          maxConcurrency: 1,
+          scope: "subpages",
+          // Default, though not used for single page fetch
+          followRedirects: options.followRedirects ?? true,
+          excludeSelectors: void 0,
+          // Not currently configurable via this tool
+          ignoreErrors: false,
+          scrapeMode
+          // Pass the scrapeMode
+        }
+      };
+      let pipeline;
+      if (initialContext.contentType.startsWith("text/html")) {
+        const htmlPipelineSteps = [
+          playwrightMiddleware,
+          // Use the instantiated middleware
+          new HtmlCheerioParserMiddleware(),
+          // Always runs after content is finalized
+          new HtmlMetadataExtractorMiddleware(),
+          // Keep for potential future use
+          // No Link Extractor needed for this tool
+          new HtmlSanitizerMiddleware(),
+          // Element remover
+          new HtmlToMarkdownMiddleware()
+        ];
+        pipeline = new ContentProcessingPipeline(htmlPipelineSteps);
+      } else if (initialContext.contentType === "text/markdown" || initialContext.contentType === "text/plain") {
+        pipeline = new ContentProcessingPipeline([
+          new MarkdownMetadataExtractorMiddleware()
+          // Extract title (though not used)
+          // No further processing needed for Markdown/Plain text for this tool
+        ]);
+      } else {
+        logger.warn(
+          `Unsupported content type "${initialContext.contentType}" for ${url}. Returning raw content.`
+        );
+        const contentString = typeof rawContent.content === "string" ? rawContent.content : Buffer.from(rawContent.content).toString("utf-8");
+        return contentString;
+      }
+      const finalContext = await pipeline.run(initialContext);
+      for (const err of finalContext.errors) {
+        logger.warn(`Processing error for ${url}: ${err.message}`);
+      }
+      if (typeof finalContext.content !== "string" || !finalContext.content.trim()) {
+        throw new ToolError(
+          `Processing resulted in empty content for ${url}`,
+          this.constructor.name
+        );
+      }
+      logger.info(`\u2705 Successfully processed ${url}`);
+      return finalContext.content;
     } catch (error) {
-      if (error instanceof ScraperError) {
+      if (error instanceof ScraperError || error instanceof ToolError) {
         throw new ToolError(
           `Failed to fetch or process URL: ${error.message}`,
           this.constructor.name
@@ -1369,6 +1873,8 @@ var FetchUrlTool = class {
         `Failed to fetch or process URL: ${error instanceof Error ? error.message : String(error)}`,
         this.constructor.name
       );
+    } finally {
+      await playwrightMiddleware.closeBrowser();
     }
   }
 };
@@ -1489,10 +1995,12 @@ var ScrapeTool = class {
       version: internalVersion,
       scope: scraperOptions?.scope ?? "subpages",
       followRedirects: scraperOptions?.followRedirects ?? true,
-      maxPages: scraperOptions?.maxPages ?? 100,
-      maxDepth: scraperOptions?.maxDepth ?? 3,
-      // maxConcurrency is handled by the manager itself now
-      ignoreErrors: scraperOptions?.ignoreErrors ?? true
+      maxPages: scraperOptions?.maxPages ?? DEFAULT_MAX_PAGES,
+      maxDepth: scraperOptions?.maxDepth ?? DEFAULT_MAX_DEPTH,
+      maxConcurrency: scraperOptions?.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY,
+      ignoreErrors: scraperOptions?.ignoreErrors ?? true,
+      scrapeMode: scraperOptions?.scrapeMode ?? "auto" /* Auto */
+      // Pass scrapeMode enum
     });
     logger.info(`\u{1F680} Job ${jobId} enqueued for scraping.`);
     options.onProgress?.({
@@ -1780,7 +2288,6 @@ import Fuse from "fuse.js";
 import semver3 from "semver";
 // src/splitter/SemanticMarkdownSplitter.ts
-import { JSDOM as JSDOM2 } from "jsdom";
 import { RecursiveCharacterTextSplitter as RecursiveCharacterTextSplitter2 } from "langchain/text_splitter";
 import remarkGfm from "remark-gfm";
 import remarkHtml from "remark-html";
@@ -10597,7 +11104,7 @@ ${"```"}`;
    * Parse HTML
    */
   async parseHtml(html) { // Resolves to `window.document` of the object built from `html` (presumably a jsdom instance; confirm against createJSDOM).
-    const { window } = new JSDOM2(html);
+    const { window } = createJSDOM(html);
     return window.document;
   }
 };
@@ -11566,11 +12073,14 @@ var DocumentManagementService = class {
 };
 export {
+  DEFAULT_MAX_PAGES,
+  DEFAULT_MAX_DEPTH,
+  DEFAULT_MAX_CONCURRENCY,
   setLogLevel,
   logger,
   HttpFetcher,
   FileFetcher,
-  HtmlProcessor,
+  ScrapeMode,
   PipelineJobStatus,
   PipelineManager,
   CancelJobTool,
@@ -11585,4 +12095,4 @@ export {
   SearchTool,
   DocumentManagementService
 };
-//# sourceMappingURL=chunk-ADZQJG2M.js.map
+//# sourceMappingURL=chunk-VTO2ED43.js.map