npm - @ebowwa/markdown-docs-scraper - Versions diffs - 1.1.0 → 1.2.1 - Mend

@ebowwa/markdown-docs-scraper 1.1.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

package/README.md +82 -0
package/dist/cli.d.ts +6 -0
package/dist/cli.d.ts.map +1 -0
package/dist/cli.js +1 -0
package/dist/index.d.ts +128 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +219 -26
package/dist/scrapers/github-raw.d.ts +9 -0
package/dist/scrapers/github-raw.d.ts.map +1 -0
package/dist/scrapers/index.d.ts +11 -0
package/dist/scrapers/index.d.ts.map +1 -0
package/dist/scrapers/index.js +448 -0
package/dist/scrapers/llms-txt.d.ts +13 -0
package/dist/scrapers/llms-txt.d.ts.map +1 -0
package/dist/scrapers/registry.d.ts +23 -0
package/dist/scrapers/registry.d.ts.map +1 -0
package/dist/scrapers/types.d.ts +57 -0
package/dist/scrapers/types.d.ts.map +1 -0
package/package.json +10 -2
package/src/cli.js +160 -0
package/src/cli.ts +2 -0
package/src/index.js +487 -0
package/src/index.ts +115 -28
package/src/scrapers/github-raw.ts +154 -0
package/src/scrapers/index.ts +16 -0
package/src/scrapers/llms-txt.ts +101 -0
package/src/scrapers/registry.ts +55 -0
package/src/scrapers/types.ts +79 -0

package/src/index.ts CHANGED Viewed

@@ -5,6 +5,7 @@
  * - Configurable llms.txt paths with fallbacks
  * - Custom URL patterns for different doc sites
  * - Works with any markdown documentation site
+ * - Uses full URLs from llms.txt directly
  */
 // ============================================================================
@@ -32,6 +33,8 @@ export interface ScraperOptions {
   tryDocsSubdomain?: boolean;
   /** Custom regex pattern to extract pages from llms.txt (must have 3 capture groups: title, fullUrl, path) */
   linkPattern?: RegExp;
+  /** Use full URLs from llms.txt directly (default: true for generic pattern) */
+  useDirectUrls?: boolean;
 }
 export interface ScraperResult {
@@ -40,11 +43,18 @@ export interface ScraperResult {
   duration: number;
 }
+/**
+ * A page queued for download, optionally carrying the exact URL it was
+ * discovered at.
+ *
+ * `discoverPages` fills `fullUrl` with the link captured from llms.txt, while
+ * `getPagesToScrape` sets it to "" so that `downloadPage` falls back to
+ * `buildUrl(category, page)`.
+ */
+interface DiscoveredPage {
+  category: string; // category the page is filed under; may be empty (buildUrl has a no-category branch)
+  page: string; // page name without the ".md" extension (buildUrl appends it)
+  fullUrl: string;  // The complete URL from llms.txt; "" means "no direct URL, build one instead"
+}
 /** Default pattern: matches /docs/en/ or /docs/ paths */
 const DEFAULT_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/(?:en\/)?([^)]+\.md))\)/g;
-/** Generic pattern: matches any .md links in llms.txt */
-const GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/([^)]+\.md))\)/g;
+/** Generic pattern: matches any .md links - captures full path after domain */
+const GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;
 // ============================================================================
 // UTILITY FUNCTIONS (Composable)
@@ -116,11 +126,12 @@ export class MarkdownDocsScraper {
       llmsPaths: options.llmsPaths || ["/llms.txt", "/docs/llms.txt"],
       tryDocsSubdomain: options.tryDocsSubdomain ?? true,
       linkPattern: options.linkPattern || GENERIC_LINK_PATTERN,
+      useDirectUrls: options.useDirectUrls ?? true,
     };
   }
   /**
-   * Build URL for a documentation page
+   * Build URL for a documentation page (fallback when no direct URL)
    */
   buildUrl(category: string, page: string): string {
     if (category) {
@@ -128,16 +139,19 @@ export class MarkdownDocsScraper {
     } else if (this.options.docsPath) {
       return `${this.options.baseUrl}${this.options.docsPath}/${page}.md`;
     } else {
-      // No docsPath (like Polymarket) - direct path
       return `${this.options.baseUrl}/${page}.md`;
     }
   }
   /**
-   * Download a single documentation page
+   * Download a page using either direct URL or built URL
    */
-  async downloadPage(category: string, page: string): Promise<DocPage | null> {
-    const url = this.buildUrl(category, page);
+  async downloadPage(pageInfo: DiscoveredPage): Promise<DocPage | null> {
+    // Use direct URL if available and useDirectUrls is enabled
+    const url = (this.options.useDirectUrls && pageInfo.fullUrl)
+      ? pageInfo.fullUrl
+      : this.buildUrl(pageInfo.category, pageInfo.page);
     const content = await fetchMarkdown(url);
     if (!content) {
@@ -148,8 +162,8 @@ export class MarkdownDocsScraper {
       url,
       title: extractTitle(content),
       content,
-      category,
-      pageName: page,
+      category: pageInfo.category,
+      pageName: pageInfo.page,
     };
   }
@@ -173,7 +187,6 @@ export class MarkdownDocsScraper {
         // Skip if already on docs/doc subdomain
         if (!hostname.startsWith("docs.") && !hostname.startsWith("doc.")) {
-          // Try docs.{domain}
           const docsDomain = hostname.replace(/^www\./, "");
           urls.push(`${url.protocol}//docs.${docsDomain}/llms.txt`);
           urls.push(`${url.protocol}//docs.${docsDomain}/docs/llms.txt`);
@@ -191,9 +204,11 @@ export class MarkdownDocsScraper {
    */
   private async fetchLlmsTxt(): Promise<{ content: string; url: string } | null> {
     const urls = this.getLlmsUrls();
+    console.log(`DEBUG: Trying URLs: ${urls.join(", ")}`);
     for (const llmsUrl of urls) {
       try {
+        console.log(`DEBUG: Fetching ${llmsUrl}...`);
         const response = await fetch(llmsUrl, {
           headers: {
             Accept: "text/plain",
@@ -201,13 +216,14 @@ export class MarkdownDocsScraper {
           },
         });
+        console.log(`DEBUG: Response status: ${response.status}`);
         if (response.ok) {
           const content = await response.text();
           console.log(`Found llms.txt at ${llmsUrl}`);
           return { content, url: llmsUrl };
         }
       } catch (error) {
-        // Try next URL
+        console.log(`DEBUG: Error: ${error}`);
         continue;
       }
     }
@@ -218,8 +234,8 @@ export class MarkdownDocsScraper {
   /**
    * Discover pages from llms.txt index
    */
-  async discoverPages(): Promise<Array<{ category: string; page: string }>> {
-    const pages: Array<{ category: string; page: string }> = [];
+  async discoverPages(): Promise<DiscoveredPage[]> {
+    const pages: DiscoveredPage[] = [];
     try {
       const llmsResult = await this.fetchLlmsTxt();
@@ -233,15 +249,20 @@ export class MarkdownDocsScraper {
       const { content } = llmsResult;
       // Use provided pattern or default
-      const regex = new RegExp(this.options.linkPattern.source, this.options.linkPattern.flags);
+      const pattern = this.options.linkPattern;
+      const regex = new RegExp(pattern.source, pattern.flags);
       let match;
+      // Debug: log pattern being used
+      console.log(`DEBUG: Using pattern: ${pattern.source}`);
+      console.log(`DEBUG: Content length: ${content.length}`);
       while ((match = regex.exec(content)) !== null) {
-        const url = match[2];
+        const fullUrl = match[2];  // The full URL from llms.txt
         const pagePath = match[3]; // The captured path group
         const { category, page } = parsePagePath(pagePath);
-        pages.push({ category, page });
+        pages.push({ category, page, fullUrl });
       }
       console.log(`Discovered ${pages.length} pages from llms.txt`);
@@ -273,7 +294,7 @@ export class MarkdownDocsScraper {
     for (let i = 0; i < pages.length; i += this.options.concurrency) {
       const batch = pages.slice(i, i + this.options.concurrency);
       const results = await Promise.allSettled(
-        batch.map((page) => this.downloadPage(page.category, page.page))
+        batch.map((page) => this.downloadPage(page))
       );
       results.forEach((result, index) => {
@@ -281,8 +302,11 @@ export class MarkdownDocsScraper {
         if (result.status === "fulfilled" && result.value) {
           downloaded.push(result.value);
         } else {
+          const url = (this.options.useDirectUrls && page.fullUrl)
+            ? page.fullUrl
+            : this.buildUrl(page.category, page.page);
           failed.push({
-            url: this.buildUrl(page.category, page.page),
+            url,
             error: result.status === "rejected" ? (result.reason as string) : "Not found",
           });
         }
@@ -316,7 +340,7 @@ export class MarkdownDocsScraper {
     for (let i = 0; i < pages.length; i += this.options.concurrency) {
       const batch = pages.slice(i, i + this.options.concurrency);
       const results = await Promise.allSettled(
-        batch.map((page) => this.downloadPage(page.category, page.page))
+        batch.map((page) => this.downloadPage({ ...page, fullUrl: "" }))
       );
       results.forEach((result, index) => {
@@ -343,14 +367,25 @@ export class MarkdownDocsScraper {
   }
   /**
-   * Save scraped pages to disk
+   * Extract body content from a file (strips header comment)
+   */
+  private extractBody(content: string): string { // returns `content` minus the leading provenance header, if one is present
+    // Match the "<!-- Source / Downloaded -->" header that savePages prepends and remove it, leaving only the page body
+    const headerRegex = /^<!--\nSource: [^\n]+\nDownloaded: [^\n]+\n-->\n\n/; // anchored at the start; expects "\n" line endings exactly as savePages writes them
+    return content.replace(headerRegex, ""); // no header match -> content is returned unchanged
+  }
+  /**
+   * Save scraped pages to disk (only writes if content changed)
    */
-  async savePages(pages: DocPage[]): Promise<void> {
+  async savePages(pages: DocPage[]): Promise<{ updated: number; skipped: number }> {
     const fs = await import("fs/promises");
     const path = await import("path");
+    let updated = 0;
+    let skipped = 0;
     for (const page of pages) {
-      // Use pageName if available, otherwise extract from URL
       const nameToUse = page.pageName || page.url.split("/").pop()?.replace(".md", "") || "untitled";
       const dir = page.category
@@ -361,20 +396,38 @@ export class MarkdownDocsScraper {
       const filepath = path.join(dir, `${nameToUse}.md`);
+      // Check if file exists and compare content
+      try {
+        const existingContent = await fs.readFile(filepath, "utf-8");
+        const existingBody = this.extractBody(existingContent);
+        // Skip if content unchanged
+        if (existingBody === page.content) {
+          skipped++;
+          continue;
+        }
+      } catch {
+        // File doesn't exist, will create it
+      }
+      // Content changed or new file - write it
       const header = `<!--\nSource: ${page.url}\nDownloaded: ${new Date().toISOString()}\n-->\n\n`;
       await fs.writeFile(filepath, header + page.content, "utf-8");
+      updated++;
     }
+    return { updated, skipped };
   }
   /**
    * Get list of pages to scrape based on categories
    */
-  private getPagesToScrape(): Array<{ category: string; page: string }> {
-    const pages: Array<{ category: string; page: string }> = [];
+  private getPagesToScrape(): DiscoveredPage[] {
+    const pages: DiscoveredPage[] = [];
     for (const [category, pageList] of Object.entries(this.options.categories)) {
       for (const page of pageList) {
-        pages.push({ category, page });
+        pages.push({ category, page, fullUrl: "" });
       }
     }
@@ -391,17 +444,21 @@ export class MarkdownDocsScraper {
  */
 export async function scrapeMarkdownDocs(
   options: ScraperOptions & { useLlms?: boolean }
-): Promise<ScraperResult> {
+): Promise<ScraperResult & { saveStats?: { updated: number; skipped: number } }> {
   const scraper = new MarkdownDocsScraper(options);
   const result = options.useLlms
     ? await scraper.scrapeFromLlms()
     : await scraper.scrape();
+  let saveStats;
   if (options.outputDir) {
-    await scraper.savePages(result.downloaded);
+    saveStats = await scraper.savePages(result.downloaded);
+    if (saveStats.updated > 0 || saveStats.skipped > 0) {
+      console.log(`  Saved: ${saveStats.updated} updated, ${saveStats.skipped} unchanged`);
+    }
   }
-  return result;
+  return { ...result, saveStats };
 }
 // ============================================================================
@@ -424,6 +481,7 @@ export function claudeCodeOptions(outputDir: string): ScraperOptions {
     outputDir,
     concurrency: 10,
     tryDocsSubdomain: false,
+    useDirectUrls: false,  // Claude Code can use built URLs
   };
 }
@@ -437,9 +495,38 @@ export function polymarketOptions(outputDir: string): ScraperOptions {
     outputDir,
     concurrency: 10,
     tryDocsSubdomain: false,
+    useDirectUrls: true,  // Polymarket needs direct URLs
   };
 }
+// ============================================================================
+// SCRAPERS MODULE
+// ============================================================================
+/**
+ * Re-export scrapers module for composable scraper architecture.
+ * This provides a registry-based system for different scraper implementations.
+ *
+ * Three names are re-exported under aliases: `ScrapeResult` becomes
+ * `ScraperModuleResult`, `CLAUDE_CODE_PATTERN` becomes
+ * `SCRAPER_CLAUDE_CODE_PATTERN`, and `GENERIC_PATTERN` becomes
+ * `SCRAPER_GENERIC_PATTERN`. Everything else keeps its original name.
+ *
+ * NOTE(review): `./scrapers/llms-txt` imports `scrapeMarkdownDocs` from this
+ * module, so this re-export forms an import cycle. Confirm that the module
+ * format / bundler in use tolerates it, since the registry registers the
+ * built-in scrapers as a side effect of being loaded.
+ */
+export {
+  // Types
+  type SourceType,
+  type SourceConfig,
+  type Scraper,
+  type ScrapeResult as ScraperModuleResult,
+  type DownloadResult,
+  // Scrapers
+  llmsTxtScraper,
+  githubRawScraper,
+  CLAUDE_CODE_PATTERN as SCRAPER_CLAUDE_CODE_PATTERN,
+  GENERIC_PATTERN as SCRAPER_GENERIC_PATTERN,
+  // Registry
+  registerScraper,
+  getScraper,
+  scrapeSource,
+} from "./scrapers/index";
 // ============================================================================
 // EXPORTS
 // ============================================================================

package/src/scrapers/github-raw.ts ADDED Viewed

@@ -0,0 +1,154 @@
+/**
+ * GitHub Raw Scraper
+ *
+ * Downloads markdown files directly from GitHub repositories via raw content URLs.
+ * Uses GitHub API to list files, then fetches each from raw.githubusercontent.com
+ */
+import type { Scraper, SourceConfig, ScrapeResult, DownloadResult } from "./types";
+// ============================================================================
+// GITHUB API TYPES
+// ============================================================================
+interface GitHubContent { // the fields this file declares for one entry of the GitHub contents listing
+  name: string; // file name; used for the ".md" filter and as the output file name
+  path: string; // repo-relative path; used to build the raw.githubusercontent.com URL
+  download_url: string; // declared here but never read in this file
+  type: string; // compared against "file" so non-file entries are skipped
+}
+// ============================================================================
+// GITHUB RAW SCRAPER
+// ============================================================================
+export const githubRawScraper: Scraper = { // scraper for sources whose markdown lives in a GitHub repository directory
+  type: "github-raw",
+  async scrape(config: SourceConfig): Promise<ScrapeResult> { // lists the .md files under config.docsPath, then fetches and saves them one by one
+    const startTime = Date.now();
+    const downloaded: DownloadResult[] = [];
+    const failed: Array<{ url: string; error: string }> = [];
+    if (!config.github?.repo) { // repo is interpolated into both the API URL and the raw URL below
+      throw new Error(`GitHub source "${config.name}" missing github.repo config`);
+    }
+    // Get list of markdown files from GitHub API (a non-OK API response throws and aborts the whole scrape)
+    const files = await fetchGitHubMarkdownFiles(
+      config.github.repo,
+      config.docsPath.replace(/^\//, "") // drop one leading slash so the API URL does not contain "//"
+    );
+    // Download each file sequentially; a failed fetch is recorded in `failed` and the loop continues
+    for (const file of files) {
+      const content = await fetchGitHubRawContent(config.github.repo, file.path);
+      if (content) { // NOTE(review): an empty string (empty file) is falsy and is therefore reported as a failure
+        downloaded.push({
+          success: true,
+          path: file.name, // bare file name; the directory part of file.path is not kept
+          title: extractTitle(content) || file.name.replace(".md", ""), // fallback strips the first ".md" occurrence from the file name
+        });
+        // Save the file directly under config.outputDir using its bare name
+        await saveFile(config.outputDir, file.name, content);
+      } else {
+        failed.push({
+          url: `https://raw.githubusercontent.com/${config.github.repo}/main/${file.path}`, // NOTE(review): URL is rebuilt here with a hard-coded "main" ref
+          error: "Failed to fetch content",
+        });
+      }
+    }
+    return {
+      downloaded,
+      failed,
+      duration: Date.now() - startTime, // elapsed wall-clock milliseconds
+    };
+  },
+};
+// ============================================================================
+// GITHUB API FUNCTIONS
+// ============================================================================
+/**
+ * Fetch list of markdown files from GitHub repo directory
+ *
+ * Requests `/repos/{repo}/contents/{path}` from the GitHub API and keeps only
+ * the entries that are files whose name ends in ".md".
+ *
+ * @param repo - Repository identifier interpolated into the API URL
+ * @param path - Directory path inside the repository, without a leading slash
+ * @returns The markdown file entries of that directory (not recursive)
+ * @throws Error when the API answers with a non-OK status, or when the
+ *   response body is not a directory listing (for example because `path`
+ *   points at a single file)
+ */
+async function fetchGitHubMarkdownFiles(
+  repo: string,
+  path: string
+): Promise<GitHubContent[]> {
+  const url = `https://api.github.com/repos/${repo}/contents/${path}`;
+  const response = await fetch(url, {
+    headers: {
+      Accept: "application/vnd.github.v3+json",
+      "User-Agent": "@ebowwa/markdown-docs-scraper",
+    },
+  });
+  if (!response.ok) {
+    throw new Error(`GitHub API error: ${response.status} ${response.statusText}`);
+  }
+  // The contents endpoint returns an array for a directory but a single
+  // object for a file, so validate the shape instead of letting `.filter`
+  // fail with an opaque TypeError.
+  const contents: unknown = await response.json();
+  if (!Array.isArray(contents)) {
+    throw new Error(`GitHub API error: expected a directory listing at "${path}"`);
+  }
+  const entries: GitHubContent[] = contents;
+  // Filter for markdown files only
+  return entries.filter(
+    (item) => item.type === "file" && item.name.endsWith(".md")
+  );
+}
+/**
+ * Download markdown content from GitHub raw URL
+ *
+ * The file is requested at the `HEAD` ref so that the repository's default
+ * branch is used whatever it is called; a hard-coded `main` fails for
+ * repositories whose default branch has another name (e.g. `master`).
+ *
+ * @param repo - Repository identifier interpolated into the raw URL
+ * @param path - Repo-relative path of the file to download
+ * @returns The response body as text, or null when the response is not OK
+ *   or the request throws (the error is logged, not rethrown)
+ */
+async function fetchGitHubRawContent(
+  repo: string,
+  path: string
+): Promise<string | null> {
+  // HEAD resolves to the default branch on raw.githubusercontent.com.
+  const url = `https://raw.githubusercontent.com/${repo}/HEAD/${path}`;
+  try {
+    const response = await fetch(url, {
+      headers: {
+        Accept: "text/plain",
+        "User-Agent": "@ebowwa/markdown-docs-scraper",
+      },
+    });
+    if (!response.ok) {
+      return null;
+    }
+    return await response.text();
+  } catch (error) {
+    // Best effort: log and report "no content" so the caller can record a failure.
+    console.error(`Error fetching ${url}:`, error);
+    return null;
+  }
+}
+/**
+ * Extract title from markdown content
+ *
+ * Looks for the first place where a line starts with a single `#` followed
+ * by whitespace and some text, and returns that text trimmed. Deeper
+ * headings ("## ...") do not match because the character after the first
+ * `#` must be whitespace. Returns null when nothing matches.
+ */
+function extractTitle(markdown: string): string | null {
+  const titleMatch = markdown.match(/^#\s+(.+)$/m); // "m" flag: ^ and $ match at every line boundary, not only at the ends of the string
+  return titleMatch ? titleMatch[1].trim() : null;
+}
+/**
+ * Save file to disk
+ *
+ * Writes `content` as UTF-8 to `outputDir/filename`, creating the target
+ * directory (and any missing parents) first. An existing file at that path
+ * is overwritten. Errors from the file system calls are not caught here.
+ */
+async function saveFile(
+  outputDir: string,
+  filename: string,
+  content: string
+): Promise<void> {
+  const fs = await import("fs/promises"); // loaded lazily, on each call
+  const path = await import("path");
+  const outputPath = path.join(outputDir, filename);
+  await fs.mkdir(path.dirname(outputPath), { recursive: true }); // recursive: no error if the directory already exists
+  await fs.writeFile(outputPath, content, "utf-8");
+}

package/src/scrapers/index.ts ADDED Viewed

@@ -0,0 +1,16 @@
+/**
+ * Scrapers Module
+ *
+ * Composable scraper architecture for multiple documentation source types.
+ * This module provides a registry-based system for different scraper implementations.
+ */
+// Types
+export type { SourceType, SourceConfig, Scraper, ScrapeResult, DownloadResult } from "./types";
+// Scrapers
+export { llmsTxtScraper, CLAUDE_CODE_PATTERN, GENERIC_PATTERN } from "./llms-txt";
+export { githubRawScraper } from "./github-raw";
+// Registry
+export { registerScraper, getScraper, scrapeSource } from "./registry";

package/src/scrapers/llms-txt.ts ADDED Viewed

@@ -0,0 +1,101 @@
+/**
+ * LLMS-TXT Scraper
+ *
+ * Scrapes documentation sites that provide llms.txt index files.
+ * Uses the core MarkdownDocsScraper under the hood.
+ */
+import { scrapeMarkdownDocs, type DocPage } from "../index";
+import type { Scraper, SourceConfig, ScrapeResult, DownloadResult } from "./types";
+// ============================================================================
+// URL PATTERNS
+// ============================================================================
+/**
+ * Pattern for Claude Code docs: /docs/en/page.md
+ *
+ * Matches markdown links `[title](url)` whose URL contains `/docs/en/` and
+ * ends in `.md`. Capture groups: 1 = link title, 2 = full URL, 3 = the path
+ * after `/docs/en/`.
+ *
+ * NOTE(review): the `g` flag makes the regex stateful (`lastIndex`) when it is
+ * used directly with `exec`/`test`; the core scraper builds a fresh RegExp
+ * from `source`/`flags` before matching.
+ */
+export const CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
+/**
+ * Pattern for generic docs: any domain/path.md
+ *
+ * Matches markdown links `[title](url)` to any `.md` URL. Capture groups:
+ * 1 = link title, 2 = full URL, 3 = everything after the host's first `/`.
+ * Carries the same `g`-flag statefulness caveat as any global regex.
+ */
+export const GENERIC_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;
+// ============================================================================
+// LLMS-TXT SCRAPER
+// ============================================================================
+export const llmsTxtScraper: Scraper = { // adapter exposing the core llms.txt scraper through the Scraper interface
+  type: "llms-txt",
+  async scrape(config: SourceConfig): Promise<ScrapeResult> { // builds options from `config`, runs scrapeMarkdownDocs, and reshapes its result
+    const options = getScraperOptions(config);
+    const result = await scrapeMarkdownDocs(options); // options carry outputDir, so saving to disk is handled inside scrapeMarkdownDocs
+    // Convert DocPage[] to DownloadResult[] (only a relative path and the title are kept per page)
+    const downloaded: DownloadResult[] = result.downloaded.map((page: DocPage) => {
+      const category = page.category || ""; // missing category -> file sits at the top level
+      const filename = `${page.pageName || "untitled"}.md`; // "untitled" when the page has no name
+      const path = category ? `${category}/${filename}` : filename; // relative path, always "/"-separated
+      return {
+        success: true,
+        path,
+        title: page.title,
+      };
+    });
+    return {
+      downloaded,
+      failed: result.failed, // passed through unchanged
+      duration: result.duration,
+    };
+  },
+};
+// ============================================================================
+// OPTIONS BUILDER
+// ============================================================================
+/**
+ * Get scraper options based on source configuration
+ *
+ * Starts from a shared base (concurrency 10, llms.txt discovery enabled, no
+ * docs-subdomain probing) and then picks `llmsPaths` and `linkPattern`.
+ * Three sources are special-cased by an exact match on `config.name`
+ * ("Claude Code", "Polymarket", "Bun"); every other source uses
+ * `config.llmsTxtPath` / `config.linkPattern` when set, or the generic
+ * defaults otherwise.
+ *
+ * NOTE(review): for the three named sources, `config.llmsTxtPath` and
+ * `config.linkPattern` are ignored.
+ */
+function getScraperOptions(config: SourceConfig) {
+  const baseOptions = {
+    baseUrl: config.baseUrl,
+    docsPath: config.docsPath,
+    outputDir: config.outputDir,
+    concurrency: 10,
+    useLlms: true, // makes scrapeMarkdownDocs take the llms.txt discovery route
+    tryDocsSubdomain: false,
+  };
+  // Source-specific options (matched on the exact, case-sensitive source name)
+  if (config.name === "Claude Code") {
+    return {
+      ...baseOptions,
+      llmsPaths: ["/docs/llms.txt"],
+      linkPattern: CLAUDE_CODE_PATTERN,
+    };
+  }
+  if (config.name === "Polymarket") {
+    return {
+      ...baseOptions,
+      llmsPaths: ["/llms.txt"],
+      linkPattern: GENERIC_PATTERN,
+    };
+  }
+  if (config.name === "Bun") {
+    return {
+      ...baseOptions,
+      llmsPaths: ["/docs/llms.txt", "/llms.txt"], // tried in this order
+      linkPattern: GENERIC_PATTERN,
+    };
+  }
+  // Default: use provided llmsTxtPath or try common paths
+  return {
+    ...baseOptions,
+    llmsPaths: config.llmsTxtPath ? [config.llmsTxtPath] : ["/llms.txt", "/docs/llms.txt"],
+    linkPattern: config.linkPattern || GENERIC_PATTERN,
+  };
+}

package/src/scrapers/registry.ts ADDED Viewed

@@ -0,0 +1,55 @@
+/**
+ * Scraper Registry
+ *
+ * Maps source types to scraper implementations.
+ * Allows registering new scrapers and looking them up by type.
+ */
+import type { Scraper, SourceType, SourceConfig, ScrapeResult } from "./types";
+import { llmsTxtScraper } from "./llms-txt";
+import { githubRawScraper } from "./github-raw";
+// ============================================================================
+// SCRAPER REGISTRY
+// ============================================================================
+/**
+ * Registry of all available scrapers keyed by type
+ *
+ * Module-level state: one shared map per loaded copy of this module, holding
+ * at most one scraper per source type.
+ */
+const scrapers: Map<SourceType, Scraper> = new Map();
+/**
+ * Register a scraper implementation
+ *
+ * The scraper is stored under its own `type`; registering a second scraper
+ * with the same type replaces the earlier one.
+ */
+export function registerScraper(scraper: Scraper): void {
+  scrapers.set(scraper.type, scraper);
+}
+/**
+ * Get a scraper by type
+ *
+ * @returns The scraper registered for `type`, or undefined when none is.
+ */
+export function getScraper(type: SourceType): Scraper | undefined {
+  return scrapers.get(type);
+}
+/**
+ * Scrape a source using the appropriate scraper
+ *
+ * Looks up the scraper registered for `config.sourceType` and delegates to
+ * its `scrape` method, returning that promise as-is.
+ *
+ * @throws Error when no scraper is registered for `config.sourceType`
+ */
+export async function scrapeSource(config: SourceConfig): Promise<ScrapeResult> {
+  const scraper = scrapers.get(config.sourceType);
+  if (!scraper) {
+    throw new Error(`No scraper registered for type: ${config.sourceType}`);
+  }
+  return scraper.scrape(config); // errors raised by the scraper propagate to the caller
+}
+// ============================================================================
+// DEFAULT REGISTRATIONS
+// ============================================================================
+// Register built-in scrapers (top-level statements: they run as a side effect of loading this module)
+registerScraper(llmsTxtScraper);
+registerScraper(githubRawScraper);
+// Export scrapers for direct access if needed (same objects as the ones registered above)
+export { llmsTxtScraper, githubRawScraper };