@ebowwa/markdown-docs-scraper 1.0.0 → 1.2.0
- package/README.md +104 -0
- package/dist/cli.d.ts +6 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +19 -13
- package/dist/index.d.ts +116 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +323 -105
- package/dist/scrapers/github-raw.d.ts +9 -0
- package/dist/scrapers/github-raw.d.ts.map +1 -0
- package/dist/scrapers/index.d.ts +11 -0
- package/dist/scrapers/index.d.ts.map +1 -0
- package/dist/scrapers/index.js +428 -0
- package/dist/scrapers/llms-txt.d.ts +13 -0
- package/dist/scrapers/llms-txt.d.ts.map +1 -0
- package/dist/scrapers/registry.d.ts +23 -0
- package/dist/scrapers/registry.d.ts.map +1 -0
- package/dist/scrapers/types.d.ts +57 -0
- package/dist/scrapers/types.d.ts.map +1 -0
- package/package.json +10 -2
- package/src/cli.js +160 -0
- package/src/cli.ts +12 -1
- package/src/index.js +487 -0
- package/src/index.ts +276 -158
- package/src/scrapers/github-raw.ts +154 -0
- package/src/scrapers/index.ts +16 -0
- package/src/scrapers/llms-txt.ts +101 -0
- package/src/scrapers/registry.ts +55 -0
- package/src/scrapers/types.ts +79 -0
package/src/index.ts
CHANGED
@@ -1,7 +1,11 @@
 /**
  * @ebowwa/markdown-docs-scraper
  *
- *
+ * Composable markdown documentation scraper.
+ * - Configurable llms.txt paths with fallbacks
+ * - Custom URL patterns for different doc sites
+ * - Works with any markdown documentation site
+ * - Uses full URLs from llms.txt directly
  */

 // ============================================================================
@@ -23,6 +27,14 @@ export interface ScraperOptions {
   outputDir?: string;
   concurrency?: number;
   onProgress?: (current: number, total: number) => void;
+  /** Custom llms.txt paths to try (default: ["/llms.txt", "/docs/llms.txt"]) */
+  llmsPaths?: string[];
+  /** Also try docs subdomain variants (e.g., docs.example.com) */
+  tryDocsSubdomain?: boolean;
+  /** Custom regex pattern to extract pages from llms.txt (must have 3 capture groups: title, fullUrl, path) */
+  linkPattern?: RegExp;
+  /** Use full URLs from llms.txt directly (default: true for generic pattern) */
+  useDirectUrls?: boolean;
 }

 export interface ScraperResult {
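The four new options compose: llmsPaths sets where the index is probed, tryDocsSubdomain widens the probe, linkPattern controls how entries are parsed, and useDirectUrls decides whether matched URLs are fetched verbatim. A minimal sketch of a caller combining them, assuming the package entry point exports these symbols under the package name; the site URL and paths are illustrative, not taken from the package:

import { MarkdownDocsScraper, type ScraperOptions } from "@ebowwa/markdown-docs-scraper";

const options: ScraperOptions = {
  baseUrl: "https://example.com",              // hypothetical docs site
  outputDir: "./docs/example",
  llmsPaths: ["/llms.txt", "/docs/llms.txt"],  // tried in order
  tryDocsSubdomain: true,                      // also probe docs.example.com
  // Must expose three capture groups: title, full URL, path.
  linkPattern: /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g,
  useDirectUrls: true,                         // fetch matched URLs as-is
};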
@@ -31,8 +43,73 @@ export interface ScraperResult {
   duration: number;
 }

+/** Discovered page with full URL */
+interface DiscoveredPage {
+  category: string;
+  page: string;
+  fullUrl: string; // The complete URL from llms.txt
+}
+
+/** Default pattern: matches /docs/en/ or /docs/ paths */
+const DEFAULT_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/(?:en\/)?([^)]+\.md))\)/g;
+
+/** Generic pattern: matches any .md links - captures full path after domain */
+const GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;
+
+// ============================================================================
+// UTILITY FUNCTIONS (Composable)
+// ============================================================================
+
+/** Extract title from markdown content */
+export function extractTitle(markdown: string): string {
+  const titleMatch = markdown.match(/^#\s+(.+)$/m);
+  return titleMatch ? titleMatch[1].trim() : "Untitled";
+}
+
+/** Parse page path into category and page name */
+export function parsePagePath(pagePath: string): { category: string; page: string } {
+  // Remove .md extension
+  const pageName = pagePath.replace(".md", "");
+
+  // Check if there's a category in the path
+  const pathParts = pageName.split("/");
+
+  if (pathParts.length === 1) {
+    return { category: "", page: pathParts[0] };
+  } else if (pathParts.length === 2) {
+    return { category: pathParts[0], page: pathParts[1] };
+  } else {
+    // Deeper path: join everything except last as category
+    return {
+      category: pathParts.slice(0, -1).join("/"),
+      page: pathParts[pathParts.length - 1],
+    };
+  }
+}
+
+/** Fetch markdown content from URL */
+export async function fetchMarkdown(url: string, userAgent = "@ebowwa/markdown-docs-scraper"): Promise<string | null> {
+  try {
+    const response = await fetch(url, {
+      headers: {
+        Accept: "text/markdown, text/plain",
+        "User-Agent": userAgent,
+      },
+    });
+
+    if (!response.ok) {
+      return null;
+    }
+
+    return await response.text();
+  } catch (error) {
+    console.error(`Error fetching ${url}:`, error);
+    return null;
+  }
+}
+
 // ============================================================================
-// SCRAPER
+// SCRAPER CLASS
 // ============================================================================

 export class MarkdownDocsScraper {
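Since extractTitle and parsePagePath are now exported, the path-splitting rules can be exercised in isolation. Expected results, read directly off the branches above (the input paths are made up for illustration):

import { extractTitle, parsePagePath } from "@ebowwa/markdown-docs-scraper";

parsePagePath("quickstart.md");    // { category: "", page: "quickstart" }
parsePagePath("api/orders.md");    // { category: "api", page: "orders" }
parsePagePath("api/v2/orders.md"); // { category: "api/v2", page: "orders" }

extractTitle("# Getting Started\n\nBody text"); // "Getting Started"
extractTitle("no heading here");                // "Untitled"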
@@ -46,85 +123,154 @@ export class MarkdownDocsScraper {
       outputDir: options.outputDir || "./docs",
       concurrency: options.concurrency || 5,
       onProgress: options.onProgress || (() => {}),
+      llmsPaths: options.llmsPaths || ["/llms.txt", "/docs/llms.txt"],
+      tryDocsSubdomain: options.tryDocsSubdomain ?? true,
+      linkPattern: options.linkPattern || GENERIC_LINK_PATTERN,
+      useDirectUrls: options.useDirectUrls ?? true,
     };
   }

   /**
-   *
+   * Build URL for a documentation page (fallback when no direct URL)
    */
-
-
-
-
-
-
-
-
+  buildUrl(category: string, page: string): string {
+    if (category) {
+      return `${this.options.baseUrl}${this.options.docsPath}/${category}/${page}.md`;
+    } else if (this.options.docsPath) {
+      return `${this.options.baseUrl}${this.options.docsPath}/${page}.md`;
+    } else {
+      return `${this.options.baseUrl}/${page}.md`;
+    }
+  }

-
-
-
+  /**
+   * Download a page using either direct URL or built URL
+   */
+  async downloadPage(pageInfo: DiscoveredPage): Promise<DocPage | null> {
+    // Use direct URL if available and useDirectUrls is enabled
+    const url = (this.options.useDirectUrls && pageInfo.fullUrl)
+      ? pageInfo.fullUrl
+      : this.buildUrl(pageInfo.category, pageInfo.page);

-
-      if (!contentType.includes("markdown") && !contentType.includes("text/plain")) {
-        // Try to parse anyway - some sites return incorrect content-type
-      }
+    const content = await fetchMarkdown(url);

-
-    } catch (error) {
-      console.error(`Error fetching ${url}:`, error);
+    if (!content) {
       return null;
     }
-  }

-
-
-
-
-
-
+    return {
+      url,
+      title: extractTitle(content),
+      content,
+      category: pageInfo.category,
+      pageName: pageInfo.page,
+    };
   }

   /**
-   *
+   * Generate possible llms.txt URLs to try
    */
-
-
-
-
-
-
+  private getLlmsUrls(): string[] {
+    const urls: string[] = [];
+    const baseUrl = this.options.baseUrl;
+
+    // Try configured/custom paths first
+    for (const path of this.options.llmsPaths) {
+      urls.push(`${baseUrl}${path}`);
+    }
+
+    // Also try docs/doc subdomain variants if enabled
+    if (this.options.tryDocsSubdomain) {
+      try {
+        const url = new URL(baseUrl);
+        const hostname = url.hostname;
+
+        // Skip if already on docs/doc subdomain
+        if (!hostname.startsWith("docs.") && !hostname.startsWith("doc.")) {
+          const docsDomain = hostname.replace(/^www\./, "");
+          urls.push(`${url.protocol}//docs.${docsDomain}/llms.txt`);
+          urls.push(`${url.protocol}//docs.${docsDomain}/docs/llms.txt`);
+        }
+      } catch {
+        // Invalid URL, skip subdomain variants
+      }
+    }
+
+    return urls;
   }

   /**
-   *
+   * Fetch llms.txt from multiple possible URLs with fallback
    */
-
-
-
-
-
+  private async fetchLlmsTxt(): Promise<{ content: string; url: string } | null> {
+    const urls = this.getLlmsUrls();
+    console.log(`DEBUG: Trying URLs: ${urls.join(", ")}`);
+
+    for (const llmsUrl of urls) {
+      try {
+        console.log(`DEBUG: Fetching ${llmsUrl}...`);
+        const response = await fetch(llmsUrl, {
+          headers: {
+            Accept: "text/plain",
+            "User-Agent": "@ebowwa/markdown-docs-scraper",
+          },
+        });
+
+        console.log(`DEBUG: Response status: ${response.status}`);
+        if (response.ok) {
+          const content = await response.text();
+          console.log(`Found llms.txt at ${llmsUrl}`);
+          return { content, url: llmsUrl };
+        }
+      } catch (error) {
+        console.log(`DEBUG: Error: ${error}`);
+        continue;
+      }
     }
+
+    return null;
   }

   /**
-   *
+   * Discover pages from llms.txt index
    */
-  async
-    const
-    const content = await this.fetchMarkdown(url);
+  async discoverPages(): Promise<DiscoveredPage[]> {
+    const pages: DiscoveredPage[] = [];

-
-
+    try {
+      const llmsResult = await this.fetchLlmsTxt();
+
+      if (!llmsResult) {
+        const attemptedUrls = this.getLlmsUrls();
+        console.warn(`Could not fetch llms.txt from any of: ${attemptedUrls.join(", ")}`);
+        return pages;
+      }
+
+      const { content } = llmsResult;
+
+      // Use provided pattern or default
+      const pattern = this.options.linkPattern;
+      const regex = new RegExp(pattern.source, pattern.flags);
+      let match;
+
+      // Debug: log pattern being used
+      console.log(`DEBUG: Using pattern: ${pattern.source}`);
+      console.log(`DEBUG: Content length: ${content.length}`);
+
+      while ((match = regex.exec(content)) !== null) {
+        const fullUrl = match[2]; // The full URL from llms.txt
+        const pagePath = match[3]; // The captured path group
+
+        const { category, page } = parsePagePath(pagePath);
+        pages.push({ category, page, fullUrl });
+      }
+
+      console.log(`Discovered ${pages.length} pages from llms.txt`);
+    } catch (error) {
+      console.error("Error discovering pages:", error);
     }

-    return
-      url,
-      title: this.extractTitle(content),
-      content,
-      category,
-      pageName: page, // Store the page name for saving
-    };
+    return pages;
   }

   /**
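The effect of the new discovery logic is easiest to see on a concrete host. For a hypothetical baseUrl of "https://www.example.com" with the defaults, getLlmsUrls() as written above yields four candidates, which fetchLlmsTxt() then tries in order:

const scraper = new MarkdownDocsScraper({
  baseUrl: "https://www.example.com", // illustrative host
  outputDir: "./docs",
});

// getLlmsUrls() is private; as written above it would produce:
// [
//   "https://www.example.com/llms.txt",
//   "https://www.example.com/docs/llms.txt",
//   "https://docs.example.com/llms.txt",      // "www." stripped, "docs." prefixed
//   "https://docs.example.com/docs/llms.txt",
// ]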
@@ -148,7 +294,7 @@ export class MarkdownDocsScraper {
     for (let i = 0; i < pages.length; i += this.options.concurrency) {
       const batch = pages.slice(i, i + this.options.concurrency);
       const results = await Promise.allSettled(
-        batch.map((page) => this.downloadPage(page
+        batch.map((page) => this.downloadPage(page))
       );

       results.forEach((result, index) => {
@@ -156,8 +302,11 @@ export class MarkdownDocsScraper {
         if (result.status === "fulfilled" && result.value) {
           downloaded.push(result.value);
         } else {
+          const url = (this.options.useDirectUrls && page.fullUrl)
+            ? page.fullUrl
+            : this.buildUrl(page.category, page.page);
           failed.push({
-            url
+            url,
             error: result.status === "rejected" ? (result.reason as string) : "Not found",
           });
         }
@@ -175,7 +324,7 @@ export class MarkdownDocsScraper {
   }

   /**
-   * Scrape all documentation pages
+   * Scrape all documentation pages (uses categories)
    */
   async scrape(): Promise<ScraperResult> {
     const startTime = Date.now();
@@ -191,7 +340,7 @@ export class MarkdownDocsScraper {
     for (let i = 0; i < pages.length; i += this.options.concurrency) {
       const batch = pages.slice(i, i + this.options.concurrency);
       const results = await Promise.allSettled(
-        batch.map((page) => this.downloadPage(page
+        batch.map((page) => this.downloadPage({ ...page, fullUrl: "" }))
       );

       results.forEach((result, index) => {
@@ -225,7 +374,6 @@ export class MarkdownDocsScraper {
     const path = await import("path");

     for (const page of pages) {
-      // Use pageName if available, otherwise extract from URL
       const nameToUse = page.pageName || page.url.split("/").pop()?.replace(".md", "") || "untitled";

       const dir = page.category
@@ -244,113 +392,17 @@ export class MarkdownDocsScraper {
   /**
    * Get list of pages to scrape based on categories
    */
-  private getPagesToScrape():
-    const pages:
+  private getPagesToScrape(): DiscoveredPage[] {
+    const pages: DiscoveredPage[] = [];

     for (const [category, pageList] of Object.entries(this.options.categories)) {
       for (const page of pageList) {
-        pages.push({ category, page });
+        pages.push({ category, page, fullUrl: "" });
       }
     }

     return pages;
   }
-
-  /**
-   * Discover pages from llms.txt index
-   */
-  async discoverPages(): Promise<Array<{ category: string; page: string }>> {
-    const pages: Array<{ category: string; page: string }> = [];
-
-    try {
-      const llmsUrl = `${this.options.baseUrl}/docs/llms.txt`;
-      const response = await fetch(llmsUrl, {
-        headers: {
-          Accept: "text/plain",
-          "User-Agent": "@ebowwa/markdown-docs-scraper",
-        },
-      });
-
-      if (!response.ok) {
-        console.warn(`Could not fetch llms.txt from ${llmsUrl}`);
-        return pages;
-      }
-
-      const content = await response.text();
-
-      // Parse markdown links in format: [title](https://code.claude.com/docs/en/page.md)
-      const linkRegex = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
-      let match;
-
-      while ((match = linkRegex.exec(content)) !== null) {
-        const url = match[2];
-        const pagePath = match[3]; // e.g., "agent-teams.md" or "category/page.md"
-
-        // Remove .md extension
-        const pageName = pagePath.replace(".md", "");
-
-        // Check if there's a category in the path
-        const pathParts = pageName.split("/");
-
-        if (pathParts.length === 1) {
-          // No category: just "page-name"
-          pages.push({ category: "", page: pathParts[0] });
-        } else if (pathParts.length === 2) {
-          // Has category: "category/page-name"
-          pages.push({ category: pathParts[0], page: pathParts[1] });
-        } else {
-          // Deeper path: join everything except last as category
-          const category = pathParts.slice(0, -1).join("/");
-          const page = pathParts[pathParts.length - 1];
-          pages.push({ category, page });
-        }
-      }
-
-      console.log(`Discovered ${pages.length} pages from llms.txt`);
-    } catch (error) {
-      console.error("Error discovering pages:", error);
-    }
-
-    return pages;
-  }
-
-  /**
-   * Discover additional pages by parsing the docs index (fallback)
-   */
-  async discoverPagesHtml(): Promise<string[]> {
-    const discovered: string[] = [];
-
-    try {
-      const indexUrl = `${this.options.baseUrl}${this.options.docsPath}`;
-      const response = await fetch(indexUrl, {
-        headers: {
-          Accept: "text/html",
-          "User-Agent": "@ebowwa/markdown-docs-scraper",
-        },
-      });
-
-      if (!response.ok) {
-        return discovered;
-      }
-
-      const html = await response.text();
-      const mdLinkRegex = /href="\/docs\/en\/([^"]+\.md)"/g;
-      let match;
-
-      while ((match = mdLinkRegex.exec(html)) !== null) {
-        const path = match[1];
-        if (!discovered.includes(path)) {
-          discovered.push(path);
-        }
-      }
-
-      console.log(`Discovered ${discovered.length} additional pages from HTML`);
-    } catch (error) {
-      console.error("Error discovering pages from HTML:", error);
-    }
-
-    return discovered;
-  }
 }

 // ============================================================================
@@ -375,6 +427,72 @@ export async function scrapeMarkdownDocs(
   return result;
 }

+// ============================================================================
+// PRESET CONFIGURATIONS (Composable)
+// ============================================================================
+
+/** Pattern for Claude Code docs: /docs/en/page.md */
+export const CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
+
+/** Pattern for generic docs: any domain/path.md */
+export const GENERIC_PATTERN = GENERIC_LINK_PATTERN;
+
+/** Create scraper options for Claude Code docs */
+export function claudeCodeOptions(outputDir: string): ScraperOptions {
+  return {
+    baseUrl: "https://code.claude.com",
+    docsPath: "/docs/en",
+    llmsPaths: ["/docs/llms.txt"],
+    linkPattern: CLAUDE_CODE_PATTERN,
+    outputDir,
+    concurrency: 10,
+    tryDocsSubdomain: false,
+    useDirectUrls: false, // Claude Code can use built URLs
+  };
+}
+
+/** Create scraper options for Polymarket docs */
+export function polymarketOptions(outputDir: string): ScraperOptions {
+  return {
+    baseUrl: "https://docs.polymarket.com",
+    docsPath: "",
+    llmsPaths: ["/llms.txt"],
+    linkPattern: GENERIC_PATTERN,
+    outputDir,
+    concurrency: 10,
+    tryDocsSubdomain: false,
+    useDirectUrls: true, // Polymarket needs direct URLs
+  };
+}
+
+// ============================================================================
+// SCRAPERS MODULE
+// ============================================================================
+
+/**
+ * Re-export scrapers module for composable scraper architecture.
+ * This provides a registry-based system for different scraper implementations.
+ */
+export {
+  // Types
+  type SourceType,
+  type SourceConfig,
+  type Scraper,
+  type ScrapeResult as ScraperModuleResult,
+  type DownloadResult,
+
+  // Scrapers
+  llmsTxtScraper,
+  githubRawScraper,
+  CLAUDE_CODE_PATTERN as SCRAPER_CLAUDE_CODE_PATTERN,
+  GENERIC_PATTERN as SCRAPER_GENERIC_PATTERN,
+
+  // Registry
+  registerScraper,
+  getScraper,
+  scrapeSource,
+} from "./scrapers/index";
+
 // ============================================================================
 // EXPORTS
 // ============================================================================
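A minimal end-to-end sketch using the presets, assuming the published entry point exposes these symbols under the package name (DocPage's exact shape is defined in an unchanged part of the file not shown in this diff):

import { MarkdownDocsScraper, polymarketOptions } from "@ebowwa/markdown-docs-scraper";

async function main() {
  const scraper = new MarkdownDocsScraper(polymarketOptions("./docs/polymarket"));

  // Discover pages from llms.txt, then fetch the first one directly.
  const pages = await scraper.discoverPages();
  console.log(`Discovered ${pages.length} pages`);

  if (pages[0]) {
    const page = await scraper.downloadPage(pages[0]);
    console.log(page?.title, "<-", page?.url);
  }
}

main().catch(console.error);

Note the split visible in the presets: claudeCodeOptions sets useDirectUrls: false, so URLs are rebuilt from baseUrl/docsPath, while polymarketOptions keeps the full URLs found in llms.txt.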
package/src/scrapers/github-raw.ts
ADDED

@@ -0,0 +1,154 @@
+/**
+ * GitHub Raw Scraper
+ *
+ * Downloads markdown files directly from GitHub repositories via raw content URLs.
+ * Uses GitHub API to list files, then fetches each from raw.githubusercontent.com
+ */
+
+import type { Scraper, SourceConfig, ScrapeResult, DownloadResult } from "./types";
+
+// ============================================================================
+// GITHUB API TYPES
+// ============================================================================
+
+interface GitHubContent {
+  name: string;
+  path: string;
+  download_url: string;
+  type: string;
+}
+
+// ============================================================================
+// GITHUB RAW SCRAPER
+// ============================================================================
+
+export const githubRawScraper: Scraper = {
+  type: "github-raw",
+
+  async scrape(config: SourceConfig): Promise<ScrapeResult> {
+    const startTime = Date.now();
+    const downloaded: DownloadResult[] = [];
+    const failed: Array<{ url: string; error: string }> = [];
+
+    if (!config.github?.repo) {
+      throw new Error(`GitHub source "${config.name}" missing github.repo config`);
+    }
+
+    // Get list of markdown files from GitHub API
+    const files = await fetchGitHubMarkdownFiles(
+      config.github.repo,
+      config.docsPath.replace(/^\//, "")
+    );
+
+    // Download each file
+    for (const file of files) {
+      const content = await fetchGitHubRawContent(config.github.repo, file.path);
+
+      if (content) {
+        downloaded.push({
+          success: true,
+          path: file.name,
+          title: extractTitle(content) || file.name.replace(".md", ""),
+        });
+
+        // Save the file
+        await saveFile(config.outputDir, file.name, content);
+      } else {
+        failed.push({
+          url: `https://raw.githubusercontent.com/${config.github.repo}/main/${file.path}`,
+          error: "Failed to fetch content",
+        });
+      }
+    }
+
+    return {
+      downloaded,
+      failed,
+      duration: Date.now() - startTime,
+    };
+  },
+};
+
+// ============================================================================
+// GITHUB API FUNCTIONS
+// ============================================================================
+
+/**
+ * Fetch list of markdown files from GitHub repo directory
+ */
+async function fetchGitHubMarkdownFiles(
+  repo: string,
+  path: string
+): Promise<GitHubContent[]> {
+  const url = `https://api.github.com/repos/${repo}/contents/${path}`;
+
+  const response = await fetch(url, {
+    headers: {
+      Accept: "application/vnd.github.v3+json",
+      "User-Agent": "@ebowwa/markdown-docs-scraper",
+    },
+  });
+
+  if (!response.ok) {
+    throw new Error(`GitHub API error: ${response.status} ${response.statusText}`);
+  }
+
+  const contents: GitHubContent[] = await response.json();
+
+  // Filter for markdown files only
+  return contents.filter(
+    (item) => item.type === "file" && item.name.endsWith(".md")
+  );
+}
+
+/**
+ * Download markdown content from GitHub raw URL
+ */
+async function fetchGitHubRawContent(
+  repo: string,
+  path: string
+): Promise<string | null> {
+  const url = `https://raw.githubusercontent.com/${repo}/main/${path}`;
+
+  try {
+    const response = await fetch(url, {
+      headers: {
+        Accept: "text/plain",
+        "User-Agent": "@ebowwa/markdown-docs-scraper",
+      },
+    });
+
+    if (!response.ok) {
+      return null;
+    }
+
+    return await response.text();
+  } catch (error) {
+    console.error(`Error fetching ${url}:`, error);
+    return null;
+  }
+}
+
+/**
+ * Extract title from markdown content
+ */
+function extractTitle(markdown: string): string | null {
+  const titleMatch = markdown.match(/^#\s+(.+)$/m);
+  return titleMatch ? titleMatch[1].trim() : null;
+}
+
+/**
+ * Save file to disk
+ */
+async function saveFile(
+  outputDir: string,
+  filename: string,
+  content: string
+): Promise<void> {
+  const fs = await import("fs/promises");
+  const path = await import("path");
+
+  const outputPath = path.join(outputDir, filename);
+  await fs.mkdir(path.dirname(outputPath), { recursive: true });
+  await fs.writeFile(outputPath, content, "utf-8");
+}