@ebowwa/markdown-docs-scraper 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -17,7 +17,199 @@ var __toESM = (mod, isNodeMode, target) => {
17
17
  };
18
18
  var __require = /* @__PURE__ */ createRequire(import.meta.url);
19
19
 
20
// src/scrapers/llms-txt.ts
// Matches [title](https://host/docs/en/page.md) links as used by the
// Claude Code docs llms.txt index; capture 2 = full URL, capture 3 = page path.
var CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
// Generic fallback: any absolute markdown link whose path ends in .md.
var GENERIC_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;
var llmsTxtScraper = {
  type: "llms-txt",
  /**
   * Scrape a documentation source via its llms.txt index.
   * Delegates to scrapeMarkdownDocs, then normalizes each downloaded
   * page into { success, path, title }.
   */
  async scrape(config) {
    const options = getScraperOptions(config);
    const result = await scrapeMarkdownDocs(options);
    const downloaded = result.downloaded.map((page) => {
      const category = page.category || "";
      const filename = `${page.pageName || "untitled"}.md`;
      // BUG FIX: the published code emitted the literal text "$(unknown)"
      // instead of interpolating the filename for categorized pages.
      const path = category ? `${category}/${filename}` : filename;
      return {
        success: true,
        path,
        title: page.title
      };
    });
    return {
      downloaded,
      failed: result.failed,
      duration: result.duration
    };
  }
};
45
/**
 * Build scrapeMarkdownDocs options from a source config.
 * Known sources (Claude Code, Polymarket, Bun) get pinned llms.txt
 * locations and link patterns; everything else falls back to the
 * config-supplied values or generic defaults.
 */
function getScraperOptions(config) {
  const baseOptions = {
    baseUrl: config.baseUrl,
    docsPath: config.docsPath,
    outputDir: config.outputDir,
    concurrency: 10,
    useLlms: true,
    tryDocsSubdomain: false
  };
  switch (config.name) {
    case "Claude Code":
      return {
        ...baseOptions,
        llmsPaths: ["/docs/llms.txt"],
        linkPattern: CLAUDE_CODE_PATTERN
      };
    case "Polymarket":
      return {
        ...baseOptions,
        llmsPaths: ["/llms.txt"],
        linkPattern: GENERIC_PATTERN
      };
    case "Bun":
      return {
        ...baseOptions,
        llmsPaths: ["/docs/llms.txt", "/llms.txt"],
        linkPattern: GENERIC_PATTERN
      };
    default:
      return {
        ...baseOptions,
        llmsPaths: config.llmsTxtPath ? [config.llmsTxtPath] : ["/llms.txt", "/docs/llms.txt"],
        linkPattern: config.linkPattern || GENERIC_PATTERN
      };
  }
}
81
// src/scrapers/github-raw.ts
var githubRawScraper = {
  type: "github-raw",
  /**
   * Download every markdown file in a GitHub repo directory.
   * Lists the directory via the GitHub API, then fetches each file
   * from raw.githubusercontent.com and writes it to config.outputDir.
   * @throws when config.github.repo is missing.
   */
  async scrape(config) {
    const startTime = Date.now();
    const downloaded = [];
    const failed = [];
    if (!config.github?.repo) {
      throw new Error(`GitHub source "${config.name}" missing github.repo config`);
    }
    const repo = config.github.repo;
    const dirPath = config.docsPath.replace(/^\//, "");
    const files = await fetchGitHubMarkdownFiles(repo, dirPath);
    for (const file of files) {
      const content = await fetchGitHubRawContent(repo, file.path);
      if (!content) {
        failed.push({
          url: `https://raw.githubusercontent.com/${repo}/main/${file.path}`,
          error: "Failed to fetch content"
        });
        continue;
      }
      downloaded.push({
        success: true,
        path: file.name,
        title: extractTitle(content) || file.name.replace(".md", "")
      });
      await saveFile(config.outputDir, file.name, content);
    }
    return {
      downloaded,
      failed,
      duration: Date.now() - startTime
    };
  }
};
115
/**
 * List the markdown files directly inside a repo directory via the
 * GitHub contents API.
 * @throws on a non-2xx API response.
 */
async function fetchGitHubMarkdownFiles(repo, path) {
  const apiUrl = `https://api.github.com/repos/${repo}/contents/${path}`;
  const response = await fetch(apiUrl, {
    headers: {
      Accept: "application/vnd.github.v3+json",
      "User-Agent": "@ebowwa/markdown-docs-scraper"
    }
  });
  if (!response.ok) {
    throw new Error(`GitHub API error: ${response.status} ${response.statusText}`);
  }
  const contents = await response.json();
  const isMarkdownFile = (item) => item.type === "file" && item.name.endsWith(".md");
  return contents.filter(isMarkdownFile);
}
129
/**
 * Fetch a file's raw text from the repo's main branch.
 * Returns null (rather than throwing) on HTTP or network failure.
 */
async function fetchGitHubRawContent(repo, path) {
  const url = `https://raw.githubusercontent.com/${repo}/main/${path}`;
  try {
    const response = await fetch(url, {
      headers: {
        Accept: "text/plain",
        "User-Agent": "@ebowwa/markdown-docs-scraper"
      }
    });
    return response.ok ? await response.text() : null;
  } catch (error) {
    console.error(`Error fetching ${url}:`, error);
    return null;
  }
}
147
/**
 * Extract the first level-1 ATX heading ("# Title") from markdown.
 * Returns the trimmed heading text, or null when no H1 is present.
 */
function extractTitle(markdown) {
  const heading = /^#\s+(.+)$/m.exec(markdown);
  return heading ? heading[1].trim() : null;
}
151
/**
 * Write content to outputDir/filename as UTF-8, creating parent
 * directories as needed. fs and path are imported lazily.
 */
async function saveFile(outputDir, filename, content) {
  const [fs, path] = await Promise.all([import("fs/promises"), import("path")]);
  const outputPath = path.join(outputDir, filename);
  await fs.mkdir(path.dirname(outputPath), { recursive: true });
  await fs.writeFile(outputPath, content, "utf-8");
}
158
// src/scrapers/registry.ts
// Registry mapping source type -> scraper implementation.
var scrapers = new Map();
/** Register (or replace) the scraper under its declared type. */
function registerScraper(scraper) {
  scrapers.set(scraper.type, scraper);
}
/** Look up a scraper by type; undefined when none is registered. */
function getScraper(type) {
  return scrapers.get(type);
}
/**
 * Run the scraper registered for config.sourceType.
 * @throws when no scraper is registered for that type.
 */
async function scrapeSource(config) {
  const scraper = scrapers.get(config.sourceType);
  if (scraper === undefined) {
    throw new Error(`No scraper registered for type: ${config.sourceType}`);
  }
  return scraper.scrape(config);
}
173
// Built-in scrapers are registered at module load so consumers can
// call scrapeSource() without any manual setup.
for (const builtin of [llmsTxtScraper, githubRawScraper]) {
  registerScraper(builtin);
}
20
175
// src/index.ts
// Matches [title](https://host/path/page.md) links; capture 2 is the
// full URL, capture 3 the path portion after the host.
var GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;
/** First H1 heading of a markdown document, or "Untitled" when absent. */
function extractTitle2(markdown) {
  const heading = /^#\s+(.+)$/m.exec(markdown);
  return heading ? heading[1].trim() : "Untitled";
}
181
/**
 * Split a documentation page path like "guide/intro.md" into
 * { category, page }. Category is "" for top-level pages and may
 * contain slashes for nested paths ("a/b/c.md" -> category "a/b").
 */
function parsePagePath(pagePath) {
  // BUG FIX: strip only a trailing ".md" — the original
  // replace(".md", "") removed the FIRST occurrence anywhere in the
  // path (e.g. "release.md-notes/x.md" was mangled).
  const pageName = pagePath.replace(/\.md$/, "");
  const pathParts = pageName.split("/");
  return {
    category: pathParts.slice(0, -1).join("/"),
    page: pathParts[pathParts.length - 1]
  };
}
195
/**
 * Fetch a URL expecting markdown or plain text.
 * Returns the response body, or null on HTTP error or network failure.
 */
async function fetchMarkdown(url, userAgent = "@ebowwa/markdown-docs-scraper") {
  try {
    const response = await fetch(url, {
      headers: {
        Accept: "text/markdown, text/plain",
        "User-Agent": userAgent
      }
    });
    return response.ok ? await response.text() : null;
  } catch (error) {
    console.error(`Error fetching ${url}:`, error);
    return null;
  }
}
212
+
21
213
  class MarkdownDocsScraper {
22
214
  options;
23
215
  constructor(options) {
@@ -27,56 +219,107 @@ class MarkdownDocsScraper {
27
219
  categories: options.categories || {},
28
220
  outputDir: options.outputDir || "./docs",
29
221
  concurrency: options.concurrency || 5,
30
- onProgress: options.onProgress || (() => {})
222
+ onProgress: options.onProgress || (() => {}),
223
+ llmsPaths: options.llmsPaths || ["/llms.txt", "/docs/llms.txt"],
224
+ tryDocsSubdomain: options.tryDocsSubdomain ?? true,
225
+ linkPattern: options.linkPattern || GENERIC_LINK_PATTERN,
226
+ useDirectUrls: options.useDirectUrls ?? true
31
227
  };
32
228
  }
33
- async fetchMarkdown(url) {
34
- try {
35
- const response = await fetch(url, {
36
- headers: {
37
- Accept: "text/markdown, text/plain",
38
- "User-Agent": "@ebowwa/markdown-docs-scraper"
39
- }
40
- });
41
- if (!response.ok) {
42
- return null;
43
- }
44
- const contentType = response.headers.get("content-type") || "";
45
- if (!contentType.includes("markdown") && !contentType.includes("text/plain")) {}
46
- return await response.text();
47
- } catch (error) {
48
- console.error(`Error fetching ${url}:`, error);
49
- return null;
50
- }
51
- }
52
- extractTitle(markdown) {
53
- const titleMatch = markdown.match(/^#\s+(.+)$/m);
54
- return titleMatch ? titleMatch[1].trim() : "Untitled";
55
- }
56
- sanitizeFilename(path) {
57
- return path.toLowerCase().replace(/[^a-z0-9/]+/g, "-").replace(/^-|-$/g, "").replace(/\//g, "/");
58
- }
59
229
  buildUrl(category, page) {
60
230
  if (category) {
61
231
  return `${this.options.baseUrl}${this.options.docsPath}/${category}/${page}.md`;
62
- } else {
232
+ } else if (this.options.docsPath) {
63
233
  return `${this.options.baseUrl}${this.options.docsPath}/${page}.md`;
234
+ } else {
235
+ return `${this.options.baseUrl}/${page}.md`;
64
236
  }
65
237
  }
66
- async downloadPage(category, page) {
67
- const url = this.buildUrl(category, page);
68
- const content = await this.fetchMarkdown(url);
238
+ async downloadPage(pageInfo) {
239
+ const url = this.options.useDirectUrls && pageInfo.fullUrl ? pageInfo.fullUrl : this.buildUrl(pageInfo.category, pageInfo.page);
240
+ const content = await fetchMarkdown(url);
69
241
  if (!content) {
70
242
  return null;
71
243
  }
72
244
  return {
73
245
  url,
74
- title: this.extractTitle(content),
246
+ title: extractTitle2(content),
75
247
  content,
76
- category,
77
- pageName: page
248
+ category: pageInfo.category,
249
+ pageName: pageInfo.page
78
250
  };
79
251
  }
252
+ getLlmsUrls() {
253
+ const urls = [];
254
+ const baseUrl = this.options.baseUrl;
255
+ for (const path of this.options.llmsPaths) {
256
+ urls.push(`${baseUrl}${path}`);
257
+ }
258
+ if (this.options.tryDocsSubdomain) {
259
+ try {
260
+ const url = new URL(baseUrl);
261
+ const hostname = url.hostname;
262
+ if (!hostname.startsWith("docs.") && !hostname.startsWith("doc.")) {
263
+ const docsDomain = hostname.replace(/^www\./, "");
264
+ urls.push(`${url.protocol}//docs.${docsDomain}/llms.txt`);
265
+ urls.push(`${url.protocol}//docs.${docsDomain}/docs/llms.txt`);
266
+ }
267
+ } catch {}
268
+ }
269
+ return urls;
270
+ }
271
+ async fetchLlmsTxt() {
272
+ const urls = this.getLlmsUrls();
273
+ console.log(`DEBUG: Trying URLs: ${urls.join(", ")}`);
274
+ for (const llmsUrl of urls) {
275
+ try {
276
+ console.log(`DEBUG: Fetching ${llmsUrl}...`);
277
+ const response = await fetch(llmsUrl, {
278
+ headers: {
279
+ Accept: "text/plain",
280
+ "User-Agent": "@ebowwa/markdown-docs-scraper"
281
+ }
282
+ });
283
+ console.log(`DEBUG: Response status: ${response.status}`);
284
+ if (response.ok) {
285
+ const content = await response.text();
286
+ console.log(`Found llms.txt at ${llmsUrl}`);
287
+ return { content, url: llmsUrl };
288
+ }
289
+ } catch (error) {
290
+ console.log(`DEBUG: Error: ${error}`);
291
+ continue;
292
+ }
293
+ }
294
+ return null;
295
+ }
296
+ async discoverPages() {
297
+ const pages = [];
298
+ try {
299
+ const llmsResult = await this.fetchLlmsTxt();
300
+ if (!llmsResult) {
301
+ const attemptedUrls = this.getLlmsUrls();
302
+ console.warn(`Could not fetch llms.txt from any of: ${attemptedUrls.join(", ")}`);
303
+ return pages;
304
+ }
305
+ const { content } = llmsResult;
306
+ const pattern = this.options.linkPattern;
307
+ const regex = new RegExp(pattern.source, pattern.flags);
308
+ let match;
309
+ console.log(`DEBUG: Using pattern: ${pattern.source}`);
310
+ console.log(`DEBUG: Content length: ${content.length}`);
311
+ while ((match = regex.exec(content)) !== null) {
312
+ const fullUrl = match[2];
313
+ const pagePath = match[3];
314
+ const { category, page } = parsePagePath(pagePath);
315
+ pages.push({ category, page, fullUrl });
316
+ }
317
+ console.log(`Discovered ${pages.length} pages from llms.txt`);
318
+ } catch (error) {
319
+ console.error("Error discovering pages:", error);
320
+ }
321
+ return pages;
322
+ }
80
323
  async scrapeFromLlms() {
81
324
  const startTime = Date.now();
82
325
  const downloaded = [];
@@ -89,14 +332,15 @@ class MarkdownDocsScraper {
89
332
  console.log(`Scraping ${pages.length} discovered pages...`);
90
333
  for (let i = 0;i < pages.length; i += this.options.concurrency) {
91
334
  const batch = pages.slice(i, i + this.options.concurrency);
92
- const results = await Promise.allSettled(batch.map((page) => this.downloadPage(page.category, page.page)));
335
+ const results = await Promise.allSettled(batch.map((page) => this.downloadPage(page)));
93
336
  results.forEach((result, index) => {
94
337
  const page = batch[index];
95
338
  if (result.status === "fulfilled" && result.value) {
96
339
  downloaded.push(result.value);
97
340
  } else {
341
+ const url = this.options.useDirectUrls && page.fullUrl ? page.fullUrl : this.buildUrl(page.category, page.page);
98
342
  failed.push({
99
- url: this.buildUrl(page.category, page.page),
343
+ url,
100
344
  error: result.status === "rejected" ? result.reason : "Not found"
101
345
  });
102
346
  }
@@ -118,7 +362,7 @@ class MarkdownDocsScraper {
118
362
  console.log(`Scraping ${total} pages from ${this.options.baseUrl}...`);
119
363
  for (let i = 0;i < pages.length; i += this.options.concurrency) {
120
364
  const batch = pages.slice(i, i + this.options.concurrency);
121
- const results = await Promise.allSettled(batch.map((page) => this.downloadPage(page.category, page.page)));
365
+ const results = await Promise.allSettled(batch.map((page) => this.downloadPage({ ...page, fullUrl: "" })));
122
366
  results.forEach((result, index) => {
123
367
  const page = batch[index];
124
368
  if (result.status === "fulfilled" && result.value) {
@@ -159,77 +403,11 @@ Downloaded: ${new Date().toISOString()}
159
403
  const pages = [];
160
404
  for (const [category, pageList] of Object.entries(this.options.categories)) {
161
405
  for (const page of pageList) {
162
- pages.push({ category, page });
406
+ pages.push({ category, page, fullUrl: "" });
163
407
  }
164
408
  }
165
409
  return pages;
166
410
  }
167
- async discoverPages() {
168
- const pages = [];
169
- try {
170
- const llmsUrl = `${this.options.baseUrl}/docs/llms.txt`;
171
- const response = await fetch(llmsUrl, {
172
- headers: {
173
- Accept: "text/plain",
174
- "User-Agent": "@ebowwa/markdown-docs-scraper"
175
- }
176
- });
177
- if (!response.ok) {
178
- console.warn(`Could not fetch llms.txt from ${llmsUrl}`);
179
- return pages;
180
- }
181
- const content = await response.text();
182
- const linkRegex = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
183
- let match;
184
- while ((match = linkRegex.exec(content)) !== null) {
185
- const url = match[2];
186
- const pagePath = match[3];
187
- const pageName = pagePath.replace(".md", "");
188
- const pathParts = pageName.split("/");
189
- if (pathParts.length === 1) {
190
- pages.push({ category: "", page: pathParts[0] });
191
- } else if (pathParts.length === 2) {
192
- pages.push({ category: pathParts[0], page: pathParts[1] });
193
- } else {
194
- const category = pathParts.slice(0, -1).join("/");
195
- const page = pathParts[pathParts.length - 1];
196
- pages.push({ category, page });
197
- }
198
- }
199
- console.log(`Discovered ${pages.length} pages from llms.txt`);
200
- } catch (error) {
201
- console.error("Error discovering pages:", error);
202
- }
203
- return pages;
204
- }
205
- async discoverPagesHtml() {
206
- const discovered = [];
207
- try {
208
- const indexUrl = `${this.options.baseUrl}${this.options.docsPath}`;
209
- const response = await fetch(indexUrl, {
210
- headers: {
211
- Accept: "text/html",
212
- "User-Agent": "@ebowwa/markdown-docs-scraper"
213
- }
214
- });
215
- if (!response.ok) {
216
- return discovered;
217
- }
218
- const html = await response.text();
219
- const mdLinkRegex = /href="\/docs\/en\/([^"]+\.md)"/g;
220
- let match;
221
- while ((match = mdLinkRegex.exec(html)) !== null) {
222
- const path = match[1];
223
- if (!discovered.includes(path)) {
224
- discovered.push(path);
225
- }
226
- }
227
- console.log(`Discovered ${discovered.length} additional pages from HTML`);
228
- } catch (error) {
229
- console.error("Error discovering pages from HTML:", error);
230
- }
231
- return discovered;
232
- }
233
411
  }
234
412
  async function scrapeMarkdownDocs(options) {
235
413
  const scraper = new MarkdownDocsScraper(options);
@@ -239,9 +417,49 @@ async function scrapeMarkdownDocs(options) {
239
417
  }
240
418
  return result;
241
419
  }
420
// Link patterns for the convenience option builders below; duplicated
// here because the scraper-module patterns are bundled separately.
var CLAUDE_CODE_PATTERN2 = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
var GENERIC_PATTERN2 = GENERIC_LINK_PATTERN;
/**
 * Ready-made scrapeMarkdownDocs options for the Claude Code docs
 * (code.claude.com). useDirectUrls is false: pages are fetched via
 * URLs rebuilt from baseUrl + docsPath rather than llms.txt links.
 */
function claudeCodeOptions(outputDir) {
  return {
    baseUrl: "https://code.claude.com",
    docsPath: "/docs/en",
    llmsPaths: ["/docs/llms.txt"],
    linkPattern: CLAUDE_CODE_PATTERN2,
    outputDir,
    concurrency: 10,
    tryDocsSubdomain: false,
    useDirectUrls: false
  };
}
/**
 * Ready-made scrapeMarkdownDocs options for the Polymarket docs
 * (docs.polymarket.com). useDirectUrls is true: the exact URLs found
 * in llms.txt are fetched directly.
 */
function polymarketOptions(outputDir) {
  return {
    baseUrl: "https://docs.polymarket.com",
    docsPath: "",
    llmsPaths: ["/llms.txt"],
    linkPattern: GENERIC_PATTERN2,
    outputDir,
    concurrency: 10,
    tryDocsSubdomain: false,
    useDirectUrls: true
  };
}
242
446
  var src_default = MarkdownDocsScraper;
243
447
  export {
448
+ scrapeSource,
244
449
  scrapeMarkdownDocs,
450
+ registerScraper,
451
+ polymarketOptions,
452
+ parsePagePath,
453
+ llmsTxtScraper,
454
+ githubRawScraper,
455
+ getScraper,
456
+ fetchMarkdown,
457
+ extractTitle2 as extractTitle,
245
458
  src_default as default,
246
- MarkdownDocsScraper
459
+ claudeCodeOptions,
460
+ GENERIC_PATTERN as SCRAPER_GENERIC_PATTERN,
461
+ CLAUDE_CODE_PATTERN as SCRAPER_CLAUDE_CODE_PATTERN,
462
+ MarkdownDocsScraper,
463
+ GENERIC_PATTERN2 as GENERIC_PATTERN,
464
+ CLAUDE_CODE_PATTERN2 as CLAUDE_CODE_PATTERN
247
465
  };
@@ -0,0 +1,9 @@
1
+ /**
2
+ * GitHub Raw Scraper
3
+ *
4
+ * Downloads markdown files directly from GitHub repositories via raw content URLs.
5
+ * Uses GitHub API to list files, then fetches each from raw.githubusercontent.com
6
+ */
7
+ import type { Scraper } from "./types";
8
+ export declare const githubRawScraper: Scraper;
9
+ //# sourceMappingURL=github-raw.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"github-raw.d.ts","sourceRoot":"","sources":["../../src/scrapers/github-raw.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,OAAO,EAA8C,MAAM,SAAS,CAAC;AAiBnF,eAAO,MAAM,gBAAgB,EAAE,OA6C9B,CAAC"}
@@ -0,0 +1,11 @@
1
+ /**
2
+ * Scrapers Module
3
+ *
4
+ * Composable scraper architecture for multiple documentation source types.
5
+ * This module provides a registry-based system for different scraper implementations.
6
+ */
7
+ export type { SourceType, SourceConfig, Scraper, ScrapeResult, DownloadResult } from "./types";
8
+ export { llmsTxtScraper, CLAUDE_CODE_PATTERN, GENERIC_PATTERN } from "./llms-txt";
9
+ export { githubRawScraper } from "./github-raw";
10
+ export { registerScraper, getScraper, scrapeSource } from "./registry";
11
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/scrapers/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,YAAY,EAAE,UAAU,EAAE,YAAY,EAAE,OAAO,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,SAAS,CAAC;AAG/F,OAAO,EAAE,cAAc,EAAE,mBAAmB,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAClF,OAAO,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAGhD,OAAO,EAAE,eAAe,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC"}