@ebowwa/markdown-docs-scraper 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,16 @@
1
+ /**
2
+ * Scrapers Module
3
+ *
4
+ * Composable scraper architecture for multiple documentation source types.
5
+ * This module provides a registry-based system for different scraper implementations.
6
+ */
7
+
8
+ // Types
9
+ export type { SourceType, SourceConfig, Scraper, ScrapeResult, DownloadResult } from "./types";
10
+
11
+ // Scrapers
12
+ export { llmsTxtScraper, CLAUDE_CODE_PATTERN, GENERIC_PATTERN } from "./llms-txt";
13
+ export { githubRawScraper } from "./github-raw";
14
+
15
+ // Registry
16
+ export { registerScraper, getScraper, scrapeSource } from "./registry";
@@ -0,0 +1,101 @@
1
+ /**
2
+ * LLMS-TXT Scraper
3
+ *
4
+ * Scrapes documentation sites that provide llms.txt index files.
5
+ * Uses the core MarkdownDocsScraper under the hood.
6
+ */
7
+
8
+ import { scrapeMarkdownDocs, type DocPage } from "../index";
9
+ import type { Scraper, SourceConfig, ScrapeResult, DownloadResult } from "./types";
10
+
11
+ // ============================================================================
12
+ // URL PATTERNS
13
+ // ============================================================================
14
+
15
/**
 * Matches markdown links whose target is a Claude Code docs page: an absolute
 * http(s) URL that contains `/docs/en/` and ends in `.md`.
 *
 * Capture groups:
 *   1. link text
 *   2. full URL
 *   3. page path after `/docs/en/` (including the `.md` suffix)
 *
 * The page path excludes whitespace as well as `)`, so an unterminated link
 * cannot swallow the following lines up to the next `.md)`.
 *
 * The regex carries the `g` flag, which makes `lastIndex` stateful across
 * `exec`/`test` calls on this shared instance. Prefer
 * `String.prototype.matchAll`, or reset `lastIndex` before reuse.
 */
export const CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^\s)]+\.md))\)/g;
17
+
18
/**
 * Matches markdown links whose target is any absolute http(s) URL ending in
 * `.md`.
 *
 * Capture groups:
 *   1. link text
 *   2. full URL
 *   3. path after the host (without the leading `/`, including `.md`)
 *
 * The host part excludes `/`, whitespace and `)`. Without the last two, a
 * link to a non-`.md` target (e.g. `[Site](https://example.com)`) would be
 * fused with the next `.md` link in the text into a single bogus match.
 *
 * The regex carries the `g` flag, which makes `lastIndex` stateful across
 * `exec`/`test` calls on this shared instance. Prefer
 * `String.prototype.matchAll`, or reset `lastIndex` before reuse.
 */
export const GENERIC_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/\s)]+\/([^\s)]+\.md))\)/g;
20
+
21
+ // ============================================================================
22
+ // LLMS-TXT SCRAPER
23
+ // ============================================================================
24
+
25
export const llmsTxtScraper: Scraper = {
  type: "llms-txt",

  /**
   * Scrape a documentation source that publishes an llms.txt index.
   *
   * Builds the core scraper options with `getScraperOptions`, delegates the
   * actual work to `scrapeMarkdownDocs`, and reshapes its result into the
   * registry's `ScrapeResult`.
   *
   * @param config - Source to scrape.
   * @returns One `DownloadResult` per downloaded page (always `success: true`),
   *   plus the core scraper's `failed` list and `duration` passed through as-is.
   */
  async scrape(config: SourceConfig): Promise<ScrapeResult> {
    const scraped = await scrapeMarkdownDocs(getScraperOptions(config));

    // A page maps to "<category>/<pageName>.md"; pages without a category sit
    // at the top level and pages without a name are called "untitled.md".
    const toDownloadResult = (page: DocPage): DownloadResult => {
      const folder = page.category || "";
      const fileName = `${page.pageName || "untitled"}.md`;

      return {
        success: true,
        path: folder ? `${folder}/${fileName}` : fileName,
        title: page.title,
      };
    };

    return {
      downloaded: scraped.downloaded.map(toDownloadResult),
      failed: scraped.failed,
      duration: scraped.duration,
    };
  },
};
52
+
53
+ // ============================================================================
54
+ // OPTIONS BUILDER
55
+ // ============================================================================
56
+
57
/**
 * Translate a source configuration into options for the core scraper.
 *
 * Every source gets the same fixed settings (concurrency 10, llms.txt enabled,
 * docs-subdomain probing disabled). The llms.txt locations and link pattern
 * depend on `config.name`:
 *
 * - "Claude Code", "Polymarket" and "Bun" use built-in presets; for these
 *   names `config.llmsTxtPath` and `config.linkPattern` are not consulted.
 * - Any other name uses `config.llmsTxtPath` when set (otherwise it tries
 *   `/llms.txt` then `/docs/llms.txt`) and `config.linkPattern` when set
 *   (otherwise `GENERIC_PATTERN`).
 *
 * @param config - Source configuration to translate.
 * @returns Options object accepted by `scrapeMarkdownDocs`.
 */
function getScraperOptions(config: SourceConfig) {
  const { baseUrl, docsPath, outputDir } = config;

  // Settings shared by every llms.txt source.
  const shared = {
    baseUrl,
    docsPath,
    outputDir,
    concurrency: 10,
    useLlms: true,
    tryDocsSubdomain: false,
  };

  switch (config.name) {
    case "Claude Code":
      return {
        ...shared,
        llmsPaths: ["/docs/llms.txt"],
        linkPattern: CLAUDE_CODE_PATTERN,
      };

    case "Polymarket":
      return {
        ...shared,
        llmsPaths: ["/llms.txt"],
        linkPattern: GENERIC_PATTERN,
      };

    case "Bun":
      return {
        ...shared,
        llmsPaths: ["/docs/llms.txt", "/llms.txt"],
        linkPattern: GENERIC_PATTERN,
      };

    default: {
      // No preset: honour the caller's settings, else fall back to common paths.
      const llmsPaths = config.llmsTxtPath
        ? [config.llmsTxtPath]
        : ["/llms.txt", "/docs/llms.txt"];

      return {
        ...shared,
        llmsPaths,
        linkPattern: config.linkPattern || GENERIC_PATTERN,
      };
    }
  }
}
@@ -0,0 +1,55 @@
1
+ /**
2
+ * Scraper Registry
3
+ *
4
+ * Maps source types to scraper implementations.
5
+ * Allows registering new scrapers and looking them up by type.
6
+ */
7
+
8
+ import type { Scraper, SourceType, SourceConfig, ScrapeResult } from "./types";
9
+ import { llmsTxtScraper } from "./llms-txt";
10
+ import { githubRawScraper } from "./github-raw";
11
+
12
+ // ============================================================================
13
+ // SCRAPER REGISTRY
14
+ // ============================================================================
15
+
16
/**
 * Module-private registry holding one scraper implementation per source type.
 * Written by `registerScraper`, read by `getScraper` and `scrapeSource`.
 */
const scrapers = new Map<SourceType, Scraper>();
18
+
19
/**
 * Add a scraper to the registry, keyed by its own `type`.
 *
 * Registering a second scraper for the same type replaces the earlier one.
 *
 * @param scraper - Implementation to make available to `scrapeSource`.
 */
export function registerScraper(scraper: Scraper): void {
  const { type } = scraper;
  scrapers.set(type, scraper);
}
25
+
26
/**
 * Look up the scraper registered for a source type.
 *
 * @param type - Source type to look up.
 * @returns The registered scraper, or `undefined` when none is registered.
 */
export function getScraper(type: SourceType): Scraper | undefined {
  const registered = scrapers.get(type);
  return registered;
}
32
+
33
/**
 * Scrape a source with whichever scraper is registered for its `sourceType`.
 *
 * @param config - Source configuration; `sourceType` selects the scraper.
 * @returns The selected scraper's result.
 * @throws Error (as a rejected promise) when no scraper is registered for
 *   `config.sourceType`.
 */
export async function scrapeSource(config: SourceConfig): Promise<ScrapeResult> {
  const { sourceType } = config;
  const scraper = getScraper(sourceType);

  if (!scraper) {
    throw new Error(`No scraper registered for type: ${sourceType}`);
  }

  return scraper.scrape(config);
}
45
+
46
+ // ============================================================================
47
+ // DEFAULT REGISTRATIONS
48
+ // ============================================================================
49
+
50
// Built-in scrapers are registered as a side effect of loading this module,
// so `scrapeSource` works without any explicit setup by the caller.
for (const builtIn of [llmsTxtScraper, githubRawScraper]) {
  registerScraper(builtIn);
}

// Re-export the built-ins for callers that want to bypass the registry.
export { llmsTxtScraper, githubRawScraper };
@@ -0,0 +1,79 @@
1
+ /**
2
+ * Scraper Types
3
+ *
4
+ * Core types for the composable scraper architecture.
5
+ * These types define the interface that all scrapers must implement.
6
+ */
7
+
8
+ // ============================================================================
9
+ // SOURCE TYPES
10
+ // ============================================================================
11
+
12
/**
 * Identifier of a documentation source kind. The scraper registry is keyed by
 * this value, and `SourceConfig.sourceType` uses it to select a scraper.
 */
export type SourceType =
  | "llms-txt"
  | "github-raw";
14
+
15
+ // ============================================================================
16
+ // SCRAPER INTERFACE
17
+ // ============================================================================
18
+
19
/** Outcome of scraping one source. */
export interface ScrapeResult {
  /** One entry per downloaded document. */
  downloaded: DownloadResult[];

  /** Documents that could not be fetched, each with its URL and an error message. */
  failed: { url: string; error: string }[];

  /** Time taken by the scrape, when the scraper reports it (unit not specified here). */
  duration?: number;
}
25
+
26
/** Outcome of downloading a single document. */
export interface DownloadResult {
  /** Whether the download succeeded. */
  success: boolean;

  /** Path of the document (the llms-txt scraper uses `<category>/<page>.md`). */
  path: string;

  /** Document title, when known. */
  title?: string;
}
32
+
33
/** Contract every scraper implementation must fulfil to be registered. */
export interface Scraper {
  /** Source type this scraper handles; used as its key in the registry. */
  type: SourceType;

  /**
   * Scrape the documentation described by `config`.
   *
   * @param config - Source to scrape.
   * @returns Downloaded and failed documents for that source.
   */
  scrape(config: SourceConfig): Promise<ScrapeResult>;
}
41
+
42
+ // ============================================================================
43
+ // SOURCE CONFIG
44
+ // ============================================================================
45
+
46
/** Describes one documentation source and where its output should go. */
export interface SourceConfig {
  /**
   * Display name. The llms-txt scraper also matches it against a few
   * built-in presets ("Claude Code", "Polymarket", "Bun").
   */
  name: string;

  /** Kind of source; selects the scraper that handles this config. */
  sourceType: SourceType;

  /** Base URL of the documentation site. */
  baseUrl: string;

  /** Path to the docs under the base URL (e.g. `/docs`, `/docs/en`). */
  docsPath: string;

  /** Directory that downloaded docs are written to. */
  outputDir: string;

  /** Directory that daily reports are written to. */
  reportDir: string;

  /** Location of the llms.txt index (llms-txt sources only). */
  llmsTxtPath?: string;

  /** Custom pattern for extracting links from llms.txt (llms-txt sources only). */
  linkPattern?: RegExp;

  /** GitHub settings (for github-raw sources or GitHub API data). */
  github?: {
    /** Repository identifier. */
    repo: string;
    /** Whether to include commits. */
    includeCommits: boolean;
    /** Whether to include releases. */
    includeReleases: boolean;
    /** Whether to include pull requests. */
    includePRs: boolean;
  };
}