npm - @ebowwa/markdown-docs-scraper - Versions diffs - 1.0.0 → 1.1.0 - Mend

@ebowwa/markdown-docs-scraper 1.0.0 → 1.1.0

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (6)

package/README.md CHANGED Viewed

@@ -52,6 +52,26 @@ Options:
   -o, --output <dir>       Output directory (default: "./docs")
   --docs-path <path>       Docs path (default: "/docs/en")
   -c, --concurrency <num>  Concurrency level (default: "5")
+  --llms-paths <paths>     Comma-separated llms.txt paths (default: "/llms.txt,/docs/llms.txt")
+  --no-subdomain           Disable docs/doc subdomain fallback
+```
+### llms.txt Discovery
+The scraper automatically tries multiple paths to find `llms.txt`:
+1. **Configured paths** (default: `/llms.txt`, `/docs/llms.txt`)
+2. **Docs subdomain** (e.g., `https://docs.example.com/llms.txt`)
+3. **Doc subdomain** (e.g., `https://doc.example.com/llms.txt`)
+Example with custom paths:
+```bash
+markdown-docs-scraper scrape -u https://example.com --llms-paths "/llms.txt,/api/llms.txt"
+```
+Disable subdomain fallback:
+```bash
+markdown-docs-scraper scrape -u https://example.com --no-subdomain
 ```
 ## Programmatic Usage
@@ -103,6 +123,8 @@ interface ScraperOptions {
   outputDir?: string;        // Output directory (default: "./docs")
   concurrency?: number;      // Concurrent downloads (default: 5)
   onProgress?: (current: number, total: number) => void;
+  llmsPaths?: string[];      // llms.txt paths to try (default: ["/llms.txt", "/docs/llms.txt"])
+  tryDocsSubdomain?: boolean; // Also try docs/doc subdomains (default: true)
 }
 ```

package/dist/cli.js CHANGED Viewed

@@ -20,7 +20,7 @@ var __toESM = (mod, isNodeMode, target) => {
 var __commonJS = (cb, mod) => () => (mod || cb((mod = { exports: {} }).exports, mod), mod.exports);
 var __require = /* @__PURE__ */ createRequire(import.meta.url);
-// node_modules/commander/lib/error.js
+// ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/error.js
 var require_error = __commonJS((exports) => {
   class CommanderError extends Error {
     constructor(exitCode, code, message) {
@@ -44,7 +44,7 @@ var require_error = __commonJS((exports) => {
   exports.InvalidArgumentError = InvalidArgumentError;
 });
-// node_modules/commander/lib/argument.js
+// ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/argument.js
 var require_argument = __commonJS((exports) => {
   var { InvalidArgumentError } = require_error();
@@ -123,7 +123,7 @@ var require_argument = __commonJS((exports) => {
   exports.humanReadableArgName = humanReadableArgName;
 });
-// node_modules/commander/lib/help.js
+// ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/help.js
 var require_help = __commonJS((exports) => {
   var { humanReadableArgName } = require_argument();
@@ -372,7 +372,7 @@ var require_help = __commonJS((exports) => {
   exports.Help = Help;
 });
-// node_modules/commander/lib/option.js
+// ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/option.js
 var require_option = __commonJS((exports) => {
   var { InvalidArgumentError } = require_error();
@@ -523,7 +523,7 @@ var require_option = __commonJS((exports) => {
   exports.DualOptions = DualOptions;
 });
-// node_modules/commander/lib/suggestSimilar.js
+// ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/suggestSimilar.js
 var require_suggestSimilar = __commonJS((exports) => {
   var maxDistance = 3;
   function editDistance(a, b) {
@@ -596,7 +596,7 @@ var require_suggestSimilar = __commonJS((exports) => {
   exports.suggestSimilar = suggestSimilar;
 });
-// node_modules/commander/lib/command.js
+// ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/command.js
 var require_command = __commonJS((exports) => {
   var EventEmitter = __require("node:events").EventEmitter;
   var childProcess = __require("node:child_process");
@@ -1839,7 +1839,7 @@ Expecting one of '${allowedValues.join("', '")}'`);
   exports.Command = Command;
 });
-// node_modules/commander/index.js
+// ../../node_modules/.bun/commander@12.1.0/node_modules/commander/index.js
 var require_commander = __commonJS((exports) => {
   var { Argument } = require_argument();
   var { Command } = require_command();
@@ -2377,7 +2377,7 @@ Downloaded: `).concat(new Date().toISOString(), `
   exports.default = MarkdownDocsScraper;
 });
-// node_modules/commander/esm.mjs
+// ../../node_modules/.bun/commander@12.1.0/node_modules/commander/esm.mjs
 var import__ = __toESM(require_commander(), 1);
 var {
   program,
@@ -2396,12 +2396,14 @@ var {
 // src/cli.ts
 var import__2 = __toESM(require_src(), 1);
 program.name("markdown-docs-scraper").description("Scrape and mirror markdown-based documentation sites").version("1.0.0");
-program.command("scrape").description("Scrape documentation from a URL").requiredOption("-u, --url <url>", "Base URL of the documentation site").option("-o, --output <dir>", "Output directory", "./docs").option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en").option("-c, --concurrency <num>", "Concurrency level", "5").option("--discover", "Discover pages before scraping", false).action(async (options) => {
+program.command("scrape").description("Scrape documentation from a URL").requiredOption("-u, --url <url>", "Base URL of the documentation site").option("-o, --output <dir>", "Output directory", "./docs").option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en").option("-c, --concurrency <num>", "Concurrency level", "5").option("--discover", "Discover pages before scraping", false).option("--llms-paths <paths>", "Comma-separated llms.txt paths to try", "/llms.txt,/docs/llms.txt").option("--no-subdomain", "Disable docs/doc subdomain fallback", false).action(async (options) => {
   const scraperOptions = {
     baseUrl: options.url,
     docsPath: options.docsPath,
     outputDir: options.output,
-    concurrency: parseInt(options.concurrency)
+    concurrency: parseInt(options.concurrency),
+    llmsPaths: options.llmsPaths.split(","),
+    tryDocsSubdomain: !options.noSubdomain
   };
   console.log(`\uD83D\uDD0D Scraping ${options.url}...`);
   console.log(`\uD83D\uDCC1 Output: ${options.output}`);
@@ -2423,10 +2425,12 @@ program.command("scrape").description("Scrape documentation from a URL").require
     }
   }
 });
-program.command("discover").description("Discover all available documentation pages").requiredOption("-u, --url <url>", "Base URL of the documentation site").option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en").action(async (options) => {
+program.command("discover").description("Discover all available documentation pages").requiredOption("-u, --url <url>", "Base URL of the documentation site").option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en").option("--llms-paths <paths>", "Comma-separated llms.txt paths to try", "/llms.txt,/docs/llms.txt").option("--no-subdomain", "Disable docs/doc subdomain fallback", false).action(async (options) => {
   const scraper = new import__2.MarkdownDocsScraper({
     baseUrl: options.url,
-    docsPath: options.docsPath
+    docsPath: options.docsPath,
+    llmsPaths: options.llmsPaths.split(","),
+    tryDocsSubdomain: !options.noSubdomain
   });
   console.log(`\uD83D\uDD0D Discovering pages from ${options.url}...`);
   const pages = await scraper.discoverPages();
@@ -2434,7 +2438,8 @@ program.command("discover").description("Discover all available documentation pa
 Found ${pages.length} pages:
 `);
   pages.forEach((page) => {
-    console.log(`  - ${page}`);
+    const path = page.category ? `${page.category}/${page.page}` : page.page;
+    console.log(`  - ${path}`);
   });
 });
 program.command("anthropic").description("Quick scrape of Anthropic Claude Code docs (uses llms.txt)").option("-o, --output <dir>", "Output directory", "./docs").action(async (options) => {

package/dist/index.js CHANGED Viewed

@@ -18,6 +18,43 @@ var __toESM = (mod, isNodeMode, target) => {
 var __require = /* @__PURE__ */ createRequire(import.meta.url);
 // src/index.ts
+var GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/([^)]+\.md))\)/g;
+function extractTitle(markdown) {
+  const titleMatch = markdown.match(/^#\s+(.+)$/m);
+  return titleMatch ? titleMatch[1].trim() : "Untitled";
+}
+function parsePagePath(pagePath) {
+  const pageName = pagePath.replace(".md", "");
+  const pathParts = pageName.split("/");
+  if (pathParts.length === 1) {
+    return { category: "", page: pathParts[0] };
+  } else if (pathParts.length === 2) {
+    return { category: pathParts[0], page: pathParts[1] };
+  } else {
+    return {
+      category: pathParts.slice(0, -1).join("/"),
+      page: pathParts[pathParts.length - 1]
+    };
+  }
+}
+async function fetchMarkdown(url, userAgent = "@ebowwa/markdown-docs-scraper") {
+  try {
+    const response = await fetch(url, {
+      headers: {
+        Accept: "text/markdown, text/plain",
+        "User-Agent": userAgent
+      }
+    });
+    if (!response.ok) {
+      return null;
+    }
+    return await response.text();
+  } catch (error) {
+    console.error(`Error fetching ${url}:`, error);
+    return null;
+  }
+}
 class MarkdownDocsScraper {
   options;
   constructor(options) {
@@ -27,56 +64,99 @@ class MarkdownDocsScraper {
       categories: options.categories || {},
       outputDir: options.outputDir || "./docs",
       concurrency: options.concurrency || 5,
-      onProgress: options.onProgress || (() => {})
+      onProgress: options.onProgress || (() => {}),
+      llmsPaths: options.llmsPaths || ["/llms.txt", "/docs/llms.txt"],
+      tryDocsSubdomain: options.tryDocsSubdomain ?? true,
+      linkPattern: options.linkPattern || GENERIC_LINK_PATTERN
     };
   }
-  async fetchMarkdown(url) {
-    try {
-      const response = await fetch(url, {
-        headers: {
-          Accept: "text/markdown, text/plain",
-          "User-Agent": "@ebowwa/markdown-docs-scraper"
-        }
-      });
-      if (!response.ok) {
-        return null;
-      }
-      const contentType = response.headers.get("content-type") || "";
-      if (!contentType.includes("markdown") && !contentType.includes("text/plain")) {}
-      return await response.text();
-    } catch (error) {
-      console.error(`Error fetching ${url}:`, error);
-      return null;
-    }
-  }
-  extractTitle(markdown) {
-    const titleMatch = markdown.match(/^#\s+(.+)$/m);
-    return titleMatch ? titleMatch[1].trim() : "Untitled";
-  }
-  sanitizeFilename(path) {
-    return path.toLowerCase().replace(/[^a-z0-9/]+/g, "-").replace(/^-|-$/g, "").replace(/\//g, "/");
-  }
   buildUrl(category, page) {
     if (category) {
       return `${this.options.baseUrl}${this.options.docsPath}/${category}/${page}.md`;
-    } else {
+    } else if (this.options.docsPath) {
       return `${this.options.baseUrl}${this.options.docsPath}/${page}.md`;
+    } else {
+      return `${this.options.baseUrl}/${page}.md`;
     }
   }
   async downloadPage(category, page) {
     const url = this.buildUrl(category, page);
-    const content = await this.fetchMarkdown(url);
+    const content = await fetchMarkdown(url);
     if (!content) {
       return null;
     }
     return {
       url,
-      title: this.extractTitle(content),
+      title: extractTitle(content),
       content,
       category,
       pageName: page
     };
   }
+  getLlmsUrls() {
+    const urls = [];
+    const baseUrl = this.options.baseUrl;
+    for (const path of this.options.llmsPaths) {
+      urls.push(`${baseUrl}${path}`);
+    }
+    if (this.options.tryDocsSubdomain) {
+      try {
+        const url = new URL(baseUrl);
+        const hostname = url.hostname;
+        if (!hostname.startsWith("docs.") && !hostname.startsWith("doc.")) {
+          const docsDomain = hostname.replace(/^www\./, "");
+          urls.push(`${url.protocol}//docs.${docsDomain}/llms.txt`);
+          urls.push(`${url.protocol}//docs.${docsDomain}/docs/llms.txt`);
+        }
+      } catch {}
+    }
+    return urls;
+  }
+  async fetchLlmsTxt() {
+    const urls = this.getLlmsUrls();
+    for (const llmsUrl of urls) {
+      try {
+        const response = await fetch(llmsUrl, {
+          headers: {
+            Accept: "text/plain",
+            "User-Agent": "@ebowwa/markdown-docs-scraper"
+          }
+        });
+        if (response.ok) {
+          const content = await response.text();
+          console.log(`Found llms.txt at ${llmsUrl}`);
+          return { content, url: llmsUrl };
+        }
+      } catch (error) {
+        continue;
+      }
+    }
+    return null;
+  }
+  async discoverPages() {
+    const pages = [];
+    try {
+      const llmsResult = await this.fetchLlmsTxt();
+      if (!llmsResult) {
+        const attemptedUrls = this.getLlmsUrls();
+        console.warn(`Could not fetch llms.txt from any of: ${attemptedUrls.join(", ")}`);
+        return pages;
+      }
+      const { content } = llmsResult;
+      const regex = new RegExp(this.options.linkPattern.source, this.options.linkPattern.flags);
+      let match;
+      while ((match = regex.exec(content)) !== null) {
+        const url = match[2];
+        const pagePath = match[3];
+        const { category, page } = parsePagePath(pagePath);
+        pages.push({ category, page });
+      }
+      console.log(`Discovered ${pages.length} pages from llms.txt`);
+    } catch (error) {
+      console.error("Error discovering pages:", error);
+    }
+    return pages;
+  }
   async scrapeFromLlms() {
     const startTime = Date.now();
     const downloaded = [];
@@ -164,72 +244,6 @@ Downloaded: ${new Date().toISOString()}
     }
     return pages;
   }
-  async discoverPages() {
-    const pages = [];
-    try {
-      const llmsUrl = `${this.options.baseUrl}/docs/llms.txt`;
-      const response = await fetch(llmsUrl, {
-        headers: {
-          Accept: "text/plain",
-          "User-Agent": "@ebowwa/markdown-docs-scraper"
-        }
-      });
-      if (!response.ok) {
-        console.warn(`Could not fetch llms.txt from ${llmsUrl}`);
-        return pages;
-      }
-      const content = await response.text();
-      const linkRegex = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
-      let match;
-      while ((match = linkRegex.exec(content)) !== null) {
-        const url = match[2];
-        const pagePath = match[3];
-        const pageName = pagePath.replace(".md", "");
-        const pathParts = pageName.split("/");
-        if (pathParts.length === 1) {
-          pages.push({ category: "", page: pathParts[0] });
-        } else if (pathParts.length === 2) {
-          pages.push({ category: pathParts[0], page: pathParts[1] });
-        } else {
-          const category = pathParts.slice(0, -1).join("/");
-          const page = pathParts[pathParts.length - 1];
-          pages.push({ category, page });
-        }
-      }
-      console.log(`Discovered ${pages.length} pages from llms.txt`);
-    } catch (error) {
-      console.error("Error discovering pages:", error);
-    }
-    return pages;
-  }
-  async discoverPagesHtml() {
-    const discovered = [];
-    try {
-      const indexUrl = `${this.options.baseUrl}${this.options.docsPath}`;
-      const response = await fetch(indexUrl, {
-        headers: {
-          Accept: "text/html",
-          "User-Agent": "@ebowwa/markdown-docs-scraper"
-        }
-      });
-      if (!response.ok) {
-        return discovered;
-      }
-      const html = await response.text();
-      const mdLinkRegex = /href="\/docs\/en\/([^"]+\.md)"/g;
-      let match;
-      while ((match = mdLinkRegex.exec(html)) !== null) {
-        const path = match[1];
-        if (!discovered.includes(path)) {
-          discovered.push(path);
-        }
-      }
-      console.log(`Discovered ${discovered.length} additional pages from HTML`);
-    } catch (error) {
-      console.error("Error discovering pages from HTML:", error);
-    }
-    return discovered;
-  }
 }
 async function scrapeMarkdownDocs(options) {
   const scraper = new MarkdownDocsScraper(options);
@@ -239,9 +253,40 @@ async function scrapeMarkdownDocs(options) {
   }
   return result;
 }
+var CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
+var GENERIC_PATTERN = GENERIC_LINK_PATTERN;
+function claudeCodeOptions(outputDir) {
+  return {
+    baseUrl: "https://code.claude.com",
+    docsPath: "/docs/en",
+    llmsPaths: ["/docs/llms.txt"],
+    linkPattern: CLAUDE_CODE_PATTERN,
+    outputDir,
+    concurrency: 10,
+    tryDocsSubdomain: false
+  };
+}
+function polymarketOptions(outputDir) {
+  return {
+    baseUrl: "https://docs.polymarket.com",
+    docsPath: "",
+    llmsPaths: ["/llms.txt"],
+    linkPattern: GENERIC_PATTERN,
+    outputDir,
+    concurrency: 10,
+    tryDocsSubdomain: false
+  };
+}
 var src_default = MarkdownDocsScraper;
 export {
   scrapeMarkdownDocs,
+  polymarketOptions,
+  parsePagePath,
+  fetchMarkdown,
+  extractTitle,
   src_default as default,
-  MarkdownDocsScraper
+  claudeCodeOptions,
+  MarkdownDocsScraper,
+  GENERIC_PATTERN,
+  CLAUDE_CODE_PATTERN
 };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ebowwa/markdown-docs-scraper",
-  "version": "1.0.0",
+  "version": "1.1.0",
   "description": "Scrape and mirror markdown-based documentation sites",
   "type": "module",
   "main": "./dist/index.js",

package/src/cli.ts CHANGED Viewed

@@ -19,12 +19,16 @@ program
   .option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en")
   .option("-c, --concurrency <num>", "Concurrency level", "5")
   .option("--discover", "Discover pages before scraping", false)
+  .option("--llms-paths <paths>", "Comma-separated llms.txt paths to try", "/llms.txt,/docs/llms.txt")
+  .option("--no-subdomain", "Disable docs/doc subdomain fallback", false)
   .action(async (options) => {
     const scraperOptions: ScraperOptions = {
       baseUrl: options.url,
       docsPath: options.docsPath,
       outputDir: options.output,
       concurrency: parseInt(options.concurrency),
+      llmsPaths: options.llmsPaths.split(","),
+      tryDocsSubdomain: !options.noSubdomain,
     };
     console.log(`🔍 Scraping ${options.url}...`);
@@ -56,10 +60,14 @@ program
   .description("Discover all available documentation pages")
   .requiredOption("-u, --url <url>", "Base URL of the documentation site")
   .option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en")
+  .option("--llms-paths <paths>", "Comma-separated llms.txt paths to try", "/llms.txt,/docs/llms.txt")
+  .option("--no-subdomain", "Disable docs/doc subdomain fallback", false)
   .action(async (options) => {
     const scraper = new MarkdownDocsScraper({
       baseUrl: options.url,
       docsPath: options.docsPath,
+      llmsPaths: options.llmsPaths.split(","),
+      tryDocsSubdomain: !options.noSubdomain,
     });
     console.log(`🔍 Discovering pages from ${options.url}...`);
@@ -67,7 +75,8 @@ program
     console.log(`\nFound ${pages.length} pages:\n`);
     pages.forEach((page) => {
-      console.log(`  - ${page}`);
+      const path = page.category ? `${page.category}/${page.page}` : page.page;
+      console.log(`  - ${path}`);
     });
   });

package/src/index.ts CHANGED Viewed

@@ -1,7 +1,10 @@
 /**
  * @ebowwa/markdown-docs-scraper
  *
- * Scrape and mirror markdown-based documentation sites
+ * Composable markdown documentation scraper.
+ * - Configurable llms.txt paths with fallbacks
+ * - Custom URL patterns for different doc sites
+ * - Works with any markdown documentation site
  */
 // ============================================================================
@@ -23,6 +26,12 @@ export interface ScraperOptions {
   outputDir?: string;
   concurrency?: number;
   onProgress?: (current: number, total: number) => void;
+  /** Custom llms.txt paths to try (default: ["/llms.txt", "/docs/llms.txt"]) */
+  llmsPaths?: string[];
+  /** Also try docs subdomain variants (e.g., docs.example.com) */
+  tryDocsSubdomain?: boolean;
+  /** Custom regex pattern to extract pages from llms.txt (must have 3 capture groups: title, fullUrl, path) */
+  linkPattern?: RegExp;
 }
 export interface ScraperResult {
@@ -31,8 +40,66 @@ export interface ScraperResult {
   duration: number;
 }
+/** Default pattern: matches /docs/en/ or /docs/ paths */
+const DEFAULT_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/(?:en\/)?([^)]+\.md))\)/g;
+/** Generic pattern: matches any .md links in llms.txt */
+const GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/([^)]+\.md))\)/g;
 // ============================================================================
-// SCRAPER
+// UTILITY FUNCTIONS (Composable)
+// ============================================================================
+/** Extract title from markdown content */
+export function extractTitle(markdown: string): string {
+  const titleMatch = markdown.match(/^#\s+(.+)$/m);
+  return titleMatch ? titleMatch[1].trim() : "Untitled";
+}
+/** Parse page path into category and page name */
+export function parsePagePath(pagePath: string): { category: string; page: string } {
+  // Remove .md extension
+  const pageName = pagePath.replace(".md", "");
+  // Check if there's a category in the path
+  const pathParts = pageName.split("/");
+  if (pathParts.length === 1) {
+    return { category: "", page: pathParts[0] };
+  } else if (pathParts.length === 2) {
+    return { category: pathParts[0], page: pathParts[1] };
+  } else {
+    // Deeper path: join everything except last as category
+    return {
+      category: pathParts.slice(0, -1).join("/"),
+      page: pathParts[pathParts.length - 1],
+    };
+  }
+}
+/** Fetch markdown content from URL */
+export async function fetchMarkdown(url: string, userAgent = "@ebowwa/markdown-docs-scraper"): Promise<string | null> {
+  try {
+    const response = await fetch(url, {
+      headers: {
+        Accept: "text/markdown, text/plain",
+        "User-Agent": userAgent,
+      },
+    });
+    if (!response.ok) {
+      return null;
+    }
+    return await response.text();
+  } catch (error) {
+    console.error(`Error fetching ${url}:`, error);
+    return null;
+  }
+}
+// ============================================================================
+// SCRAPER CLASS
 // ============================================================================
 export class MarkdownDocsScraper {
@@ -46,64 +113,23 @@ export class MarkdownDocsScraper {
       outputDir: options.outputDir || "./docs",
       concurrency: options.concurrency || 5,
       onProgress: options.onProgress || (() => {}),
+      llmsPaths: options.llmsPaths || ["/llms.txt", "/docs/llms.txt"],
+      tryDocsSubdomain: options.tryDocsSubdomain ?? true,
+      linkPattern: options.linkPattern || GENERIC_LINK_PATTERN,
     };
   }
-  /**
-   * Fetch markdown content from a URL
-   */
-  async fetchMarkdown(url: string): Promise<string | null> {
-    try {
-      const response = await fetch(url, {
-        headers: {
-          Accept: "text/markdown, text/plain",
-          "User-Agent": "@ebowwa/markdown-docs-scraper",
-        },
-      });
-      if (!response.ok) {
-        return null;
-      }
-      const contentType = response.headers.get("content-type") || "";
-      if (!contentType.includes("markdown") && !contentType.includes("text/plain")) {
-        // Try to parse anyway - some sites return incorrect content-type
-      }
-      return await response.text();
-    } catch (error) {
-      console.error(`Error fetching ${url}:`, error);
-      return null;
-    }
-  }
-  /**
-   * Extract title from markdown content
-   */
-  extractTitle(markdown: string): string {
-    const titleMatch = markdown.match(/^#\s+(.+)$/m);
-    return titleMatch ? titleMatch[1].trim() : "Untitled";
-  }
-  /**
-   * Sanitize filename from URL path
-   */
-  sanitizeFilename(path: string): string {
-    return path
-      .toLowerCase()
-      .replace(/[^a-z0-9/]+/g, "-")
-      .replace(/^-|-$/g, "")
-      .replace(/\//g, "/");
-  }
   /**
    * Build URL for a documentation page
    */
   buildUrl(category: string, page: string): string {
     if (category) {
       return `${this.options.baseUrl}${this.options.docsPath}/${category}/${page}.md`;
-    } else {
+    } else if (this.options.docsPath) {
       return `${this.options.baseUrl}${this.options.docsPath}/${page}.md`;
+    } else {
+      // No docsPath (like Polymarket) - direct path
+      return `${this.options.baseUrl}/${page}.md`;
     }
   }
@@ -112,7 +138,7 @@ export class MarkdownDocsScraper {
    */
   async downloadPage(category: string, page: string): Promise<DocPage | null> {
     const url = this.buildUrl(category, page);
-    const content = await this.fetchMarkdown(url);
+    const content = await fetchMarkdown(url);
     if (!content) {
       return null;
@@ -120,13 +146,112 @@ export class MarkdownDocsScraper {
     return {
       url,
-      title: this.extractTitle(content),
+      title: extractTitle(content),
       content,
       category,
-      pageName: page,  // Store the page name for saving
+      pageName: page,
     };
   }
+  /**
+   * Generate possible llms.txt URLs to try
+   */
+  private getLlmsUrls(): string[] {
+    const urls: string[] = [];
+    const baseUrl = this.options.baseUrl;
+    // Try configured/custom paths first
+    for (const path of this.options.llmsPaths) {
+      urls.push(`${baseUrl}${path}`);
+    }
+    // Also try docs/doc subdomain variants if enabled
+    if (this.options.tryDocsSubdomain) {
+      try {
+        const url = new URL(baseUrl);
+        const hostname = url.hostname;
+        // Skip if already on docs/doc subdomain
+        if (!hostname.startsWith("docs.") && !hostname.startsWith("doc.")) {
+          // Try docs.{domain}
+          const docsDomain = hostname.replace(/^www\./, "");
+          urls.push(`${url.protocol}//docs.${docsDomain}/llms.txt`);
+          urls.push(`${url.protocol}//docs.${docsDomain}/docs/llms.txt`);
+        }
+      } catch {
+        // Invalid URL, skip subdomain variants
+      }
+    }
+    return urls;
+  }
+  /**
+   * Fetch llms.txt from multiple possible URLs with fallback
+   */
+  private async fetchLlmsTxt(): Promise<{ content: string; url: string } | null> {
+    const urls = this.getLlmsUrls();
+    for (const llmsUrl of urls) {
+      try {
+        const response = await fetch(llmsUrl, {
+          headers: {
+            Accept: "text/plain",
+            "User-Agent": "@ebowwa/markdown-docs-scraper",
+          },
+        });
+        if (response.ok) {
+          const content = await response.text();
+          console.log(`Found llms.txt at ${llmsUrl}`);
+          return { content, url: llmsUrl };
+        }
+      } catch (error) {
+        // Try next URL
+        continue;
+      }
+    }
+    return null;
+  }
+  /**
+   * Discover pages from llms.txt index
+   */
+  async discoverPages(): Promise<Array<{ category: string; page: string }>> {
+    const pages: Array<{ category: string; page: string }> = [];
+    try {
+      const llmsResult = await this.fetchLlmsTxt();
+      if (!llmsResult) {
+        const attemptedUrls = this.getLlmsUrls();
+        console.warn(`Could not fetch llms.txt from any of: ${attemptedUrls.join(", ")}`);
+        return pages;
+      }
+      const { content } = llmsResult;
+      // Use provided pattern or default
+      const regex = new RegExp(this.options.linkPattern.source, this.options.linkPattern.flags);
+      let match;
+      while ((match = regex.exec(content)) !== null) {
+        const url = match[2];
+        const pagePath = match[3]; // The captured path group
+        const { category, page } = parsePagePath(pagePath);
+        pages.push({ category, page });
+      }
+      console.log(`Discovered ${pages.length} pages from llms.txt`);
+    } catch (error) {
+      console.error("Error discovering pages:", error);
+    }
+    return pages;
+  }
   /**
    * Scrape pages discovered from llms.txt
    */
@@ -175,7 +300,7 @@ export class MarkdownDocsScraper {
   }
   /**
-   * Scrape all documentation pages
+   * Scrape all documentation pages (uses categories)
    */
   async scrape(): Promise<ScraperResult> {
     const startTime = Date.now();
@@ -255,102 +380,6 @@ export class MarkdownDocsScraper {
     return pages;
   }
-  /**
-   * Discover pages from llms.txt index
-   */
-  async discoverPages(): Promise<Array<{ category: string; page: string }>> {
-    const pages: Array<{ category: string; page: string }> = [];
-    try {
-      const llmsUrl = `${this.options.baseUrl}/docs/llms.txt`;
-      const response = await fetch(llmsUrl, {
-        headers: {
-          Accept: "text/plain",
-          "User-Agent": "@ebowwa/markdown-docs-scraper",
-        },
-      });
-      if (!response.ok) {
-        console.warn(`Could not fetch llms.txt from ${llmsUrl}`);
-        return pages;
-      }
-      const content = await response.text();
-      // Parse markdown links in format: [title](https://code.claude.com/docs/en/page.md)
-      const linkRegex = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
-      let match;
-      while ((match = linkRegex.exec(content)) !== null) {
-        const url = match[2];
-        const pagePath = match[3]; // e.g., "agent-teams.md" or "category/page.md"
-        // Remove .md extension
-        const pageName = pagePath.replace(".md", "");
-        // Check if there's a category in the path
-        const pathParts = pageName.split("/");
-        if (pathParts.length === 1) {
-          // No category: just "page-name"
-          pages.push({ category: "", page: pathParts[0] });
-        } else if (pathParts.length === 2) {
-          // Has category: "category/page-name"
-          pages.push({ category: pathParts[0], page: pathParts[1] });
-        } else {
-          // Deeper path: join everything except last as category
-          const category = pathParts.slice(0, -1).join("/");
-          const page = pathParts[pathParts.length - 1];
-          pages.push({ category, page });
-        }
-      }
-      console.log(`Discovered ${pages.length} pages from llms.txt`);
-    } catch (error) {
-      console.error("Error discovering pages:", error);
-    }
-    return pages;
-  }
-  /**
-   * Discover additional pages by parsing the docs index (fallback)
-   */
-  async discoverPagesHtml(): Promise<string[]> {
-    const discovered: string[] = [];
-    try {
-      const indexUrl = `${this.options.baseUrl}${this.options.docsPath}`;
-      const response = await fetch(indexUrl, {
-        headers: {
-          Accept: "text/html",
-          "User-Agent": "@ebowwa/markdown-docs-scraper",
-        },
-      });
-      if (!response.ok) {
-        return discovered;
-      }
-      const html = await response.text();
-      const mdLinkRegex = /href="\/docs\/en\/([^"]+\.md)"/g;
-      let match;
-      while ((match = mdLinkRegex.exec(html)) !== null) {
-        const path = match[1];
-        if (!discovered.includes(path)) {
-          discovered.push(path);
-        }
-      }
-      console.log(`Discovered ${discovered.length} additional pages from HTML`);
-    } catch (error) {
-      console.error("Error discovering pages from HTML:", error);
-    }
-    return discovered;
-  }
 }
 // ============================================================================
@@ -375,6 +404,42 @@ export async function scrapeMarkdownDocs(
   return result;
 }
+// ============================================================================
+// PRESET CONFIGURATIONS (Composable)
+// ============================================================================
+/** Pattern for Claude Code docs: /docs/en/page.md */
+export const CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
+/** Pattern for generic docs: any domain/path.md */
+export const GENERIC_PATTERN = GENERIC_LINK_PATTERN;
+/** Create scraper options for Claude Code docs */
+export function claudeCodeOptions(outputDir: string): ScraperOptions {
+  return {
+    baseUrl: "https://code.claude.com",
+    docsPath: "/docs/en",
+    llmsPaths: ["/docs/llms.txt"],
+    linkPattern: CLAUDE_CODE_PATTERN,
+    outputDir,
+    concurrency: 10,
+    tryDocsSubdomain: false,
+  };
+}
+/** Create scraper options for Polymarket docs */
+export function polymarketOptions(outputDir: string): ScraperOptions {
+  return {
+    baseUrl: "https://docs.polymarket.com",
+    docsPath: "",
+    llmsPaths: ["/llms.txt"],
+    linkPattern: GENERIC_PATTERN,
+    outputDir,
+    concurrency: 10,
+    tryDocsSubdomain: false,
+  };
+}
 // ============================================================================
 // EXPORTS
 // ============================================================================