npm - @arabold/docs-mcp-server - Versions diffs - 1.34.0 → 1.35.0 - Mend

@arabold/docs-mcp-server 1.34.0 → 1.35.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/index.js CHANGED Viewed

@@ -33,6 +33,7 @@ import mime from "mime";
 import { HeaderGenerator } from "header-generator";
 import fs$1 from "node:fs/promises";
 import axios from "axios";
+import { MarkItDown } from "markitdown-ts";
 import { VirtualConsole, JSDOM } from "jsdom";
 import psl from "psl";
 import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
@@ -945,6 +946,10 @@ const DEFAULT_CONFIG = {
     childLimit: 3,
     precedingSiblingsLimit: 1,
     subsequentSiblingsLimit: 2
+  },
+  document: {
+    maxSize: 10 * 1024 * 1024
+    // 10MB max size for PDF/Office documents
   }
 };
 const AppConfigSchema = z.object({
@@ -1018,7 +1023,10 @@ const AppConfigSchema = z.object({
     childLimit: z.coerce.number().int().default(DEFAULT_CONFIG.assembly.childLimit),
     precedingSiblingsLimit: z.coerce.number().int().default(DEFAULT_CONFIG.assembly.precedingSiblingsLimit),
     subsequentSiblingsLimit: z.coerce.number().int().default(DEFAULT_CONFIG.assembly.subsequentSiblingsLimit)
-  }).default(DEFAULT_CONFIG.assembly)
+  }).default(DEFAULT_CONFIG.assembly),
+  document: z.object({
+    maxSize: z.coerce.number().int().default(DEFAULT_CONFIG.document.maxSize)
+  }).default(DEFAULT_CONFIG.document)
 });
 const defaults = AppConfigSchema.parse({});
 const configMappings = [
@@ -2370,6 +2378,31 @@ class MimeTypeUtils {
   static isJson(mimeType) {
     return mimeType === "application/json" || mimeType === "text/json" || mimeType === "text/x-json";
   }
+  /**
+   * Checks if a MIME type represents PDF content.
+   *
+   * @param mimeType - MIME type string to test
+   * @returns true only when mimeType is exactly "application/pdf". The comparison is
+   *   strict equality, so differently-cased values or values that still carry
+   *   parameters (e.g. "; charset=...") do not match.
+   */
+  static isPdf(mimeType) {
+    return mimeType === "application/pdf";
+  }
+  /**
+   * Checks if a MIME type represents an Office document (DOCX, XLSX, PPTX).
+   *
+   * Only the three OpenXML types listed in the body are recognized; any other value
+   * (including the legacy binary Office MIME types) yields false.
+   *
+   * @param mimeType - MIME type string to test
+   * @returns true when mimeType is strictly equal to one of the three OpenXML types
+   */
+  static isOfficeDocument(mimeType) {
+    return mimeType === "application/vnd.openxmlformats-officedocument.wordprocessingml.document" || mimeType === "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" || mimeType === "application/vnd.openxmlformats-officedocument.presentationml.presentation";
+  }
+  /**
+   * Checks if a MIME type represents a Jupyter Notebook.
+   *
+   * @param mimeType - MIME type string to test
+   * @returns true only when mimeType is exactly "application/x-ipynb+json"
+   */
+  static isJupyterNotebook(mimeType) {
+    return mimeType === "application/x-ipynb+json";
+  }
+  /**
+   * Checks if a MIME type represents a document that can be processed
+   * by the DocumentPipeline (PDF, Office docs, Jupyter notebooks).
+   *
+   * @param mimeType - MIME type string to test
+   * @returns true when isPdf, isOfficeDocument or isJupyterNotebook accepts the value
+   */
+  static isSupportedDocument(mimeType) {
+    return MimeTypeUtils.isPdf(mimeType) || MimeTypeUtils.isOfficeDocument(mimeType) || MimeTypeUtils.isJupyterNotebook(mimeType);
+  }
   /**
    * Checks if a MIME type represents source code that should be wrapped in code blocks.
    */
@@ -3098,7 +3131,9 @@ function normalizeUrl(url, options = defaultNormalizerOptions) {
   try {
     const parsedUrl = new URL(url);
     const finalOptions = { ...defaultNormalizerOptions, ...options };
-    const normalized = new URL(parsedUrl.origin + parsedUrl.pathname);
+    const normalized = new URL(url);
+    normalized.search = "";
+    normalized.hash = "";
     if (finalOptions.removeIndex) {
       normalized.pathname = normalized.pathname.replace(
         /\/index\.(html|htm|asp|php|jsp)$/i,
@@ -3110,13 +3145,13 @@ function normalizeUrl(url, options = defaultNormalizerOptions) {
     }
     const preservedHash = !finalOptions.removeHash ? parsedUrl.hash : "";
     const preservedSearch = !finalOptions.removeQuery ? parsedUrl.search : "";
-    let result = normalized.origin + normalized.pathname;
-    if (preservedSearch) {
-      result += preservedSearch;
+    if (!finalOptions.removeQuery) {
+      normalized.search = preservedSearch;
     }
-    if (preservedHash) {
-      result += preservedHash;
+    if (!finalOptions.removeHash) {
+      normalized.hash = preservedHash;
     }
+    let result = normalized.href;
     if (finalOptions.ignoreCase) {
       result = result.toLowerCase();
     }
@@ -3790,6 +3825,181 @@ ${"```"}`;
     return window2.document;
   }
 }
+class BasePipeline { // Base class for content pipelines; canProcess/process are placeholders that throw.
+  /**
+   * Determines if this pipeline can process content with the given MIME type.
+   * Must be implemented by derived classes.
+   *
+   * @param _mimeType - MIME type of the content (unused here)
+   * @param _content - The content itself (unused here)
+   * @throws Error always, with the message "Method not implemented."
+   */
+  canProcess(_mimeType, _content) {
+    throw new Error("Method not implemented.");
+  }
+  /**
+   * Processes the raw content through the pipeline.
+   * Must be implemented by derived classes.
+   *
+   * Because the method is async, the "Method not implemented." error surfaces
+   * as a rejected promise rather than a synchronous throw.
+   */
+  async process(_rawContent, _options, _fetcher) {
+    throw new Error("Method not implemented.");
+  }
+  /**
+   * Cleanup resources used by this pipeline.
+   * Default implementation does nothing - override in derived classes as needed.
+   */
+  async close() {
+  }
+  /**
+   * Executes a middleware stack on the given context.
+   * This is a utility method used by derived pipeline classes.
+   *
+   * Middleware run in array order. Each one receives the context and a `next`
+   * callback that runs the following entry; running past the end of the array
+   * is a no-op. Anything thrown while the stack runs is caught and appended to
+   * `context.errors` (non-Error values are wrapped with `new Error(String(value))`),
+   * so middleware failures are reported through the context instead of rejecting.
+   * Assumes `context.errors` is an array - confirm against callers.
+   *
+   * @param middleware - The middleware stack to execute
+   * @param context - The context to process
+   */
+  async executeMiddlewareStack(middleware, context) {
+    let index = -1; // highest position dispatched so far
+    const dispatch = async (i) => {
+      if (i <= index) throw new Error("next() called multiple times"); // a position may only be entered once
+      index = i;
+      const mw = middleware[i];
+      if (!mw) return; // end of the stack (or an empty slot): nothing more to run
+      await mw.process(context, dispatch.bind(null, i + 1)); // `next` is dispatch pre-bound to the following position
+    };
+    try {
+      await dispatch(0);
+    } catch (error) {
+      context.errors.push(error instanceof Error ? error : new Error(String(error)));
+    }
+  }
+}
+class DocumentPipeline extends BasePipeline {
+  markitdown;
+  splitter;
+  maxSize;
+  constructor(config) {
+    super();
+    this.markitdown = new MarkItDown();
+    this.maxSize = config.document.maxSize;
+    const semanticSplitter = new SemanticMarkdownSplitter(
+      config.splitter.preferredChunkSize,
+      config.splitter.maxChunkSize
+    );
+    this.splitter = new GreedySplitter(
+      semanticSplitter,
+      config.splitter.minChunkSize,
+      config.splitter.preferredChunkSize,
+      config.splitter.maxChunkSize
+    );
+  }
+  canProcess(mimeType) {
+    return MimeTypeUtils.isSupportedDocument(mimeType);
+  }
+  async process(rawContent, _options) {
+    const buffer = Buffer.isBuffer(rawContent.content) ? rawContent.content : Buffer.from(rawContent.content);
+    if (buffer.length > this.maxSize) {
+      logger.warn(
+        `Document exceeds size limit (${buffer.length} > ${this.maxSize}): ${rawContent.source}`
+      );
+      return {
+        title: null,
+        contentType: rawContent.mimeType,
+        textContent: null,
+        links: [],
+        errors: [new Error(`Document exceeds maximum size of ${this.maxSize} bytes`)],
+        chunks: []
+      };
+    }
+    const extension = this.extractExtension(rawContent.source);
+    if (!extension) {
+      logger.warn(`Could not determine file extension: ${rawContent.source}`);
+      return {
+        title: null,
+        contentType: rawContent.mimeType,
+        textContent: null,
+        links: [],
+        errors: [new Error("Could not determine file extension for document")],
+        chunks: []
+      };
+    }
+    try {
+      const result = await this.markitdown.convertBuffer(buffer, {
+        file_extension: `.${extension}`
+      });
+      if (!result?.markdown) {
+        logger.warn(`No content extracted from document: ${rawContent.source}`);
+        return {
+          title: null,
+          contentType: rawContent.mimeType,
+          textContent: null,
+          links: [],
+          errors: [],
+          chunks: []
+        };
+      }
+      const title = result.title || this.extractFilename(rawContent.source);
+      let markdown = result.markdown;
+      if (extension === "xlsx") {
+        markdown = this.promoteTableHeaders(markdown);
+      }
+      const chunks = await this.splitter.splitText(markdown, "text/markdown");
+      return {
+        title,
+        contentType: "text/markdown",
+        // Output is always markdown
+        textContent: markdown,
+        links: [],
+        // Documents don't have extractable links
+        errors: [],
+        chunks
+      };
+    } catch (error) {
+      const errorName = error instanceof Error ? error.name : "UnknownError";
+      const safeMessage = `Failed to convert document: ${errorName}`;
+      logger.warn(`${safeMessage} for ${rawContent.source}`);
+      return {
+        title: null,
+        contentType: rawContent.mimeType,
+        textContent: null,
+        links: [],
+        errors: [new Error(safeMessage)],
+        chunks: []
+      };
+    }
+  }
+  extractExtension(source) {
+    try {
+      const url = new URL(source);
+      return this.getExtensionFromPath(url.pathname);
+    } catch {
+      return this.getExtensionFromPath(source);
+    }
+  }
+  getExtensionFromPath(pathStr) {
+    const lastSlash = pathStr.lastIndexOf("/");
+    const filename = lastSlash >= 0 ? pathStr.substring(lastSlash + 1) : pathStr;
+    const lastDot = filename.lastIndexOf(".");
+    if (lastDot > 0) {
+      return filename.substring(lastDot + 1).toLowerCase();
+    }
+    return null;
+  }
+  /**
+   * Post-processes Markdown to fix empty table headers generated by sheet-to-html conversions.
+   * Detects tables where the header row is empty and promotes the first data row to be the header.
+   */
+  promoteTableHeaders(markdown) {
+    const emptyHeaderPattern = /^\|(?:\s*\|)+\s*$\r?\n^(\|(?:\s*:?-+:?\s*\|)+)\s*$\r?\n^(\|.*\|)\s*$/gm;
+    return markdown.replace(emptyHeaderPattern, "$2\n$1");
+  }
+  extractFilename(source) {
+    try {
+      const url = new URL(source);
+      const pathname = url.pathname;
+      const lastSlash = pathname.lastIndexOf("/");
+      return pathname.substring(lastSlash + 1) || null;
+    } catch {
+      const lastSlash = source.lastIndexOf("/");
+      return source.substring(lastSlash + 1) || null;
+    }
+  }
+}
 class HtmlCheerioParserMiddleware {
   async process(context, next) {
     try {
@@ -5194,50 +5404,6 @@ function convertToString(content, charset) {
     }
   }
 }
-class BasePipeline {
-  /**
-   * Determines if this pipeline can process content with the given MIME type.
-   * Must be implemented by derived classes.
-   */
-  canProcess(_mimeType, _content) {
-    throw new Error("Method not implemented.");
-  }
-  /**
-   * Processes the raw content through the pipeline.
-   * Must be implemented by derived classes.
-   */
-  async process(_rawContent, _options, _fetcher) {
-    throw new Error("Method not implemented.");
-  }
-  /**
-   * Cleanup resources used by this pipeline.
-   * Default implementation does nothing - override in derived classes as needed.
-   */
-  async close() {
-  }
-  /**
-   * Executes a middleware stack on the given context.
-   * This is a utility method used by derived pipeline classes.
-   *
-   * @param middleware - The middleware stack to execute
-   * @param context - The context to process
-   */
-  async executeMiddlewareStack(middleware, context) {
-    let index = -1;
-    const dispatch = async (i) => {
-      if (i <= index) throw new Error("next() called multiple times");
-      index = i;
-      const mw = middleware[i];
-      if (!mw) return;
-      await mw.process(context, dispatch.bind(null, i + 1));
-    };
-    try {
-      await dispatch(0);
-    } catch (error) {
-      context.errors.push(error instanceof Error ? error : new Error(String(error)));
-    }
-  }
-}
 class HtmlPipeline extends BasePipeline {
   playwrightMiddleware;
   standardMiddleware;
@@ -7067,7 +7233,7 @@ class TextPipeline extends BasePipeline {
 let PipelineFactory$1 = class PipelineFactory {
   /**
    * Creates the standard set of content pipelines used by all scraper strategies.
-   * Includes HTML, Markdown, JSON, source code, and text processing capabilities.
+   * Includes HTML, Markdown, JSON, source code, document, and text processing capabilities.
    * Each pipeline now handles both preprocessing and content-specific splitting.
    * TextPipeline is placed last as the universal fallback for unknown content types.
    *
@@ -7077,6 +7243,8 @@ let PipelineFactory$1 = class PipelineFactory {
     return [
       new JsonPipeline(appConfig),
       new SourceCodePipeline(appConfig),
+      new DocumentPipeline(appConfig),
+      // PDF, Office docs, Jupyter notebooks
       new HtmlPipeline(appConfig),
       new MarkdownPipeline(appConfig),
       new TextPipeline(appConfig)
@@ -11246,7 +11414,7 @@ const Layout = ({
   children,
   eventClientConfig
 }) => {
-  const versionString = version || "1.34.0";
+  const versionString = version || "1.35.0";
   const versionInitializer = `versionUpdate({ currentVersion: ${`'${versionString}'`} })`;
   return /* @__PURE__ */ jsxs("html", { lang: "en", children: [
     /* @__PURE__ */ jsxs("head", { children: [
@@ -12692,7 +12860,8 @@ function registerNewJobRoutes(server, scrapeTool, scraperConfig) {
       reply.type("text/html");
       try {
         let parsePatterns = function(input) {
-          if (!input) return void 0;
+          if (input === void 0) return void 0;
+          if (input.trim() === "") return [];
           return input.split(/\n|,/).map((s) => s.trim()).filter((s) => s.length > 0);
         }, parseHeaders2 = function(input) {
           if (!input) return void 0;
@@ -13044,7 +13213,7 @@ const LibrarySearchCard = ({ library }) => {
   ] });
 };
 const SearchResultItem = async ({ result }) => {
-  const isMarkdown = result.mimeType ? MimeTypeUtils.isMarkdown(result.mimeType) : true;
+  const isMarkdown = result.mimeType ? MimeTypeUtils.isMarkdown(result.mimeType) || MimeTypeUtils.isSupportedDocument(result.mimeType) : true;
   const jsdom = createJSDOM("");
   const purifier = DOMPurify(jsdom.window);
   let contentElement;
@@ -13586,7 +13755,7 @@ class AppServer {
       try {
         if (telemetry.isEnabled()) {
           telemetry.setGlobalContext({
-            appVersion: "1.34.0",
+            appVersion: "1.35.0",
             appPlatform: process.platform,
             appNodeVersion: process.version,
             appServicesEnabled: this.getActiveServicesList(),
@@ -14810,8 +14979,12 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
       ".tsv",
       ".log"
     ];
+    const documentExtensions = [".pdf", ".docx", ".xlsx", ".pptx", ".ipynb"];
     const pathLower = path2.toLowerCase();
     const hasTextExtension = textExtensions.some((ext) => pathLower.endsWith(ext));
+    const hasDocumentExtension = documentExtensions.some(
+      (ext) => pathLower.endsWith(ext)
+    );
     const hasCompoundExtension = pathLower.includes(".env.") || pathLower.endsWith(".env") || pathLower.includes(".config.") || pathLower.includes(".lock");
     const fileName = path2.split("/").pop() || "";
     const fileNameLower = fileName.toLowerCase();
@@ -14845,7 +15018,7 @@ class GitHubScraperStrategy extends BaseScraperStrategy {
       }
       return fileNameLower === name || fileNameLower.startsWith(`${name}.`);
     });
-    if (hasTextExtension || hasCompoundExtension || isCommonTextFile) {
+    if (hasTextExtension || hasDocumentExtension || hasCompoundExtension || isCommonTextFile) {
       return shouldIncludeUrl(path2, options.includePatterns, options.excludePatterns);
     }
     const mimeType = mime.getType(path2);
@@ -14982,7 +15155,23 @@ class LocalFileStrategy extends BaseScraperStrategy {
     }
     if (stats.isDirectory()) {
       const contents = await fs$1.readdir(filePath);
-      const links = contents.map((name) => `file://${path.join(filePath, name)}`).filter((url) => this.shouldProcessUrl(url, options));
+      const links = contents.map((name) => {
+        const url = new URL(`file://${path.join(filePath, name)}`);
+        if (url.hostname !== "") {
+          url.pathname = `/${url.hostname}${url.pathname}`;
+          url.hostname = "";
+        }
+        return url.href;
+      }).filter((url) => {
+        const allowed = this.shouldProcessUrl(url, options);
+        if (!allowed) {
+          logger.debug(`Skipping out-of-scope link: ${url}`);
+        }
+        return allowed;
+      });
+      logger.debug(
+        `Found ${links.length} files in ${filePath} (from ${contents.length} entries)`
+      );
       return { url: item.url, links, status: FetchStatus.SUCCESS };
     }
     const rawContent = await this.fileFetcher.fetch(item.url, {
@@ -17216,7 +17405,7 @@ function createCli(argv) {
   let globalEventBus = null;
   let globalTelemetryService = null;
   const commandStartTimes = /* @__PURE__ */ new Map();
-  const cli = yargs(hideBin(argv)).scriptName("docs-mcp-server").strict().usage("Usage: $0 <command> [options]").version("1.34.0").option("verbose", {
+  const cli = yargs(hideBin(argv)).scriptName("docs-mcp-server").strict().usage("Usage: $0 <command> [options]").version("1.35.0").option("verbose", {
     type: "boolean",
     description: "Enable verbose (debug) logging",
     default: false
@@ -17272,7 +17461,7 @@ function createCli(argv) {
     if (shouldEnableTelemetry() && telemetry.isEnabled()) {
       const commandName = argv2._[0]?.toString() || "default";
       telemetry.setGlobalContext({
-        appVersion: "1.34.0",
+        appVersion: "1.35.0",
         appPlatform: process.platform,
         appNodeVersion: process.version,
         appInterface: "cli",