@ebowwa/markdown-docs-scraper 1.1.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -161,6 +161,88 @@ Downloaded: 2026-02-06T00:00:00.000Z
161
161
  Original markdown content...
162
162
  ```
163
163
 
164
+ ## Composable Scrapers Module
165
+
166
+ The package includes a composable scraper architecture for multiple documentation source types.
167
+
168
+ ### Usage
169
+
170
+ ```typescript
171
+ import {
172
+ scrapeSource,
173
+ registerScraper,
174
+ llmsTxtScraper,
175
+ githubRawScraper,
176
+ type SourceConfig,
177
+ } from "@ebowwa/markdown-docs-scraper/scrapers";
178
+
179
+ // Configure a source
180
+ const config: SourceConfig = {
181
+ name: "My Docs",
182
+ sourceType: "llms-txt",
183
+ baseUrl: "https://docs.example.com",
184
+ docsPath: "/docs",
185
+ outputDir: "./docs/my-docs",
186
+ reportDir: "./reports/my-docs",
187
+ };
188
+
189
+ // Scrape using the registry (auto-selects scraper by sourceType)
190
+ const result = await scrapeSource(config);
191
+ ```
192
+
193
+ ### Built-in Scrapers
194
+
195
+ - **llms-txt**: Scrapes docs sites with llms.txt index files
196
+ - **github-raw**: Downloads markdown directly from GitHub repos
197
+
198
+ ### Custom Scrapers
199
+
200
+ ```typescript
201
+ import { registerScraper, type Scraper, type SourceType } from "@ebowwa/markdown-docs-scraper/scrapers";
202
+
203
+ const myScraper: Scraper = {
204
+ type: "my-type" as SourceType,
205
+ async scrape(config) {
206
+ // Custom scraping logic
207
+ return {
208
+ downloaded: [],
209
+ failed: [],
210
+ duration: 0,
211
+ };
212
+ },
213
+ };
214
+
215
+ registerScraper(myScraper);
216
+ ```
217
+
218
+ ### Types
219
+
220
+ ```typescript
221
+ type SourceType = "llms-txt" | "github-raw";
222
+
223
+ interface SourceConfig {
224
+ name: string;
225
+ sourceType: SourceType;
226
+ baseUrl: string;
227
+ docsPath: string;
228
+ outputDir: string;
229
+ reportDir: string;
230
+ llmsTxtPath?: string;
231
+ linkPattern?: RegExp;
232
+ github?: {
233
+ repo: string;
234
+ includeCommits: boolean;
235
+ includeReleases: boolean;
236
+ includePRs: boolean;
237
+ };
238
+ }
239
+
240
+ interface Scraper {
241
+ type: SourceType;
242
+ scrape(config: SourceConfig): Promise<ScrapeResult>;
243
+ }
244
+ ```
245
+
164
246
  ## License
165
247
 
166
248
  MIT
package/dist/cli.d.ts ADDED
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env bun
2
+ /**
3
+ * CLI for @ebowwa/markdown-docs-scraper
4
+ */
5
+ export {};
6
+ //# sourceMappingURL=cli.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA;;GAEG"}
package/dist/cli.js CHANGED
@@ -2426,6 +2426,7 @@ program.command("scrape").description("Scrape documentation from a URL").require
2426
2426
  }
2427
2427
  });
2428
2428
  program.command("discover").description("Discover all available documentation pages").requiredOption("-u, --url <url>", "Base URL of the documentation site").option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en").option("--llms-paths <paths>", "Comma-separated llms.txt paths to try", "/llms.txt,/docs/llms.txt").option("--no-subdomain", "Disable docs/doc subdomain fallback", false).action(async (options) => {
2429
+ console.log("DEBUG CLI: Options received:", options);
2429
2430
  const scraper = new import__2.MarkdownDocsScraper({
2430
2431
  baseUrl: options.url,
2431
2432
  docsPath: options.docsPath,
@@ -0,0 +1,128 @@
1
+ /**
2
+ * @ebowwa/markdown-docs-scraper
3
+ *
4
+ * Composable markdown documentation scraper.
5
+ * - Configurable llms.txt paths with fallbacks
6
+ * - Custom URL patterns for different doc sites
7
+ * - Works with any markdown documentation site
8
+ * - Uses full URLs from llms.txt directly
9
+ */
10
+ export interface DocPage {
11
+ url: string;
12
+ title: string;
13
+ content: string;
14
+ category?: string;
15
+ pageName?: string;
16
+ }
17
+ export interface ScraperOptions {
18
+ baseUrl: string;
19
+ docsPath?: string;
20
+ categories?: Record<string, string[]>;
21
+ outputDir?: string;
22
+ concurrency?: number;
23
+ onProgress?: (current: number, total: number) => void;
24
+ /** Custom llms.txt paths to try (default: ["/llms.txt", "/docs/llms.txt"]) */
25
+ llmsPaths?: string[];
26
+ /** Also try docs subdomain variants (e.g., docs.example.com) */
27
+ tryDocsSubdomain?: boolean;
28
+ /** Custom regex pattern to extract pages from llms.txt (must have 3 capture groups: title, fullUrl, path) */
29
+ linkPattern?: RegExp;
30
+ /** Use full URLs from llms.txt directly (default: true for generic pattern) */
31
+ useDirectUrls?: boolean;
32
+ }
33
+ export interface ScraperResult {
34
+ downloaded: DocPage[];
35
+ failed: Array<{
36
+ url: string;
37
+ error: string;
38
+ }>;
39
+ duration: number;
40
+ }
41
+ /** Discovered page with full URL */
42
+ interface DiscoveredPage {
43
+ category: string;
44
+ page: string;
45
+ fullUrl: string;
46
+ }
47
+ /** Extract title from markdown content */
48
+ export declare function extractTitle(markdown: string): string;
49
+ /** Parse page path into category and page name */
50
+ export declare function parsePagePath(pagePath: string): {
51
+ category: string;
52
+ page: string;
53
+ };
54
+ /** Fetch markdown content from URL */
55
+ export declare function fetchMarkdown(url: string, userAgent?: string): Promise<string | null>;
56
+ export declare class MarkdownDocsScraper {
57
+ private options;
58
+ constructor(options: ScraperOptions);
59
+ /**
60
+ * Build URL for a documentation page (fallback when no direct URL)
61
+ */
62
+ buildUrl(category: string, page: string): string;
63
+ /**
64
+ * Download a page using either direct URL or built URL
65
+ */
66
+ downloadPage(pageInfo: DiscoveredPage): Promise<DocPage | null>;
67
+ /**
68
+ * Generate possible llms.txt URLs to try
69
+ */
70
+ private getLlmsUrls;
71
+ /**
72
+ * Fetch llms.txt from multiple possible URLs with fallback
73
+ */
74
+ private fetchLlmsTxt;
75
+ /**
76
+ * Discover pages from llms.txt index
77
+ */
78
+ discoverPages(): Promise<DiscoveredPage[]>;
79
+ /**
80
+ * Scrape pages discovered from llms.txt
81
+ */
82
+ scrapeFromLlms(): Promise<ScraperResult>;
83
+ /**
84
+ * Scrape all documentation pages (uses categories)
85
+ */
86
+ scrape(): Promise<ScraperResult>;
87
+ /**
88
+ * Extract body content from a file (strips header comment)
89
+ */
90
+ private extractBody;
91
+ /**
92
+ * Save scraped pages to disk (only writes if content changed)
93
+ */
94
+ savePages(pages: DocPage[]): Promise<{
95
+ updated: number;
96
+ skipped: number;
97
+ }>;
98
+ /**
99
+ * Get list of pages to scrape based on categories
100
+ */
101
+ private getPagesToScrape;
102
+ }
103
+ /**
104
+ * Scrape markdown documentation with a single function call
105
+ */
106
+ export declare function scrapeMarkdownDocs(options: ScraperOptions & {
107
+ useLlms?: boolean;
108
+ }): Promise<ScraperResult & {
109
+ saveStats?: {
110
+ updated: number;
111
+ skipped: number;
112
+ };
113
+ }>;
114
+ /** Pattern for Claude Code docs: /docs/en/page.md */
115
+ export declare const CLAUDE_CODE_PATTERN: RegExp;
116
+ /** Pattern for generic docs: any domain/path.md */
117
+ export declare const GENERIC_PATTERN: RegExp;
118
+ /** Create scraper options for Claude Code docs */
119
+ export declare function claudeCodeOptions(outputDir: string): ScraperOptions;
120
+ /** Create scraper options for Polymarket docs */
121
+ export declare function polymarketOptions(outputDir: string): ScraperOptions;
122
+ /**
123
+ * Re-export scrapers module for composable scraper architecture.
124
+ * This provides a registry-based system for different scraper implementations.
125
+ */
126
+ export { type SourceType, type SourceConfig, type Scraper, type ScrapeResult as ScraperModuleResult, type DownloadResult, llmsTxtScraper, githubRawScraper, CLAUDE_CODE_PATTERN as SCRAPER_CLAUDE_CODE_PATTERN, GENERIC_PATTERN as SCRAPER_GENERIC_PATTERN, registerScraper, getScraper, scrapeSource, } from "./scrapers/index";
127
+ export default MarkdownDocsScraper;
128
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAMH,MAAM,WAAW,OAAO;IACtB,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,cAAc;IAC7B,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;IACtC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;IACtD,8EAA8E;IAC9E,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;IACrB,gEAAgE;IAChE,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,6GAA6G;IAC7G,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,+EAA+E;IAC/E,aAAa,CAAC,EAAE,OAAO,CAAC;CACzB;AAED,MAAM,WAAW,aAAa;IAC5B,UAAU,EAAE,OAAO,EAAE,CAAC;IACtB,MAAM,EAAE,KAAK,CAAC;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC9C,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,oCAAoC;AACpC,UAAU,cAAc;IACtB,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC;CACjB;AAYD,0CAA0C;AAC1C,wBAAgB,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAGrD;AAED,kDAAkD;AAClD,wBAAgB,aAAa,CAAC,QAAQ,EAAE,MAAM,GAAG;IAAE,QAAQ,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAA;CAAE,CAkBlF;AAED,sCAAsC;AACtC,wBAAsB,aAAa,CAAC,GAAG,EAAE,MAAM,EAAE,SAAS,SAAkC,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAkBpH;AAMD,qBAAa,mBAAmB;IAC9B,OAAO,CAAC,OAAO,CAA2B;gBAE9B,OAAO,EAAE,cAAc;IAenC;;OAEG;IACH,QAAQ,CAAC,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,MAAM;IAUhD;;OAEG;IACG,YAAY,CAAC,QAAQ,EAAE,cAAc,GAAG,OAAO,CAAC,OAAO,GAAG,IAAI,CAAC;IAqBrE;;OAEG;IACH,OAAO,CAAC,WAAW;IA6BnB;;OAEG;YACW,YAAY;IA6B1B;;OAEG;IACG,aAAa,IAAI,OAAO,CAAC,cAAc,EAAE,CAAC;IAuChD;;OAEG;IACG,cAAc,IAAI,OAAO,CAAC,aAAa,CAAC;IA+C9C;;OAEG;IACG,MAAM,IAAI,OAAO,CAAC,aAAa,CAAC;IAwCtC;;OAEG;IACH,OAAO,CAAC,WAAW;IAMnB;;OAEG;IACG,SAAS,CAAC,KAAK,EAAE,OAAO,EAAE,GAAG,OAAO,CAAC;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,CAAC;IAyChF;;OAEG;IACH,OAAO,CAAC,gBAAgB;CAWzB;AAMD;;GAEG;AACH,wBAAsB,kBAAkB,CACtC,OAAO,EAAE,cAAc,GAAG;I
AAE,OAAO,CAAC,EAAE,OAAO,CAAA;CAAE,GAC9C,OAAO,CAAC,aAAa,GAAG;IAAE,SAAS,CAAC,EAAE;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,CAAA;CAAE,CAAC,CAe/E;AAMD,qDAAqD;AACrD,eAAO,MAAM,mBAAmB,QAAiE,CAAC;AAElG,mDAAmD;AACnD,eAAO,MAAM,eAAe,QAAuB,CAAC;AAEpD,kDAAkD;AAClD,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,GAAG,cAAc,CAWnE;AAED,iDAAiD;AACjD,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,GAAG,cAAc,CAWnE;AAMD;;;GAGG;AACH,OAAO,EAEL,KAAK,UAAU,EACf,KAAK,YAAY,EACjB,KAAK,OAAO,EACZ,KAAK,YAAY,IAAI,mBAAmB,EACxC,KAAK,cAAc,EAGnB,cAAc,EACd,gBAAgB,EAChB,mBAAmB,IAAI,2BAA2B,EAClD,eAAe,IAAI,uBAAuB,EAG1C,eAAe,EACf,UAAU,EACV,YAAY,GACb,MAAM,kBAAkB,CAAC;AAM1B,eAAe,mBAAmB,CAAC"}
package/dist/index.js CHANGED
@@ -17,9 +17,164 @@ var __toESM = (mod, isNodeMode, target) => {
17
17
  };
18
18
  var __require = /* @__PURE__ */ createRequire(import.meta.url);
19
19
 
20
- // src/index.ts
21
- var GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/([^)]+\.md))\)/g;
20
+ // src/scrapers/llms-txt.ts
21
+ var CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
22
+ var GENERIC_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;
23
+ var llmsTxtScraper = {
24
+ type: "llms-txt",
25
+ async scrape(config) {
26
+ const options = getScraperOptions(config);
27
+ const result = await scrapeMarkdownDocs(options);
28
+ const downloaded = result.downloaded.map((page) => {
29
+ const category = page.category || "";
30
+ const filename = `${page.pageName || "untitled"}.md`;
31
+ const path = category ? `${category}/${filename}` : filename;
32
+ return {
33
+ success: true,
34
+ path,
35
+ title: page.title
36
+ };
37
+ });
38
+ return {
39
+ downloaded,
40
+ failed: result.failed,
41
+ duration: result.duration
42
+ };
43
+ }
44
+ };
45
+ function getScraperOptions(config) {
46
+ const baseOptions = {
47
+ baseUrl: config.baseUrl,
48
+ docsPath: config.docsPath,
49
+ outputDir: config.outputDir,
50
+ concurrency: 10,
51
+ useLlms: true,
52
+ tryDocsSubdomain: false
53
+ };
54
+ if (config.name === "Claude Code") {
55
+ return {
56
+ ...baseOptions,
57
+ llmsPaths: ["/docs/llms.txt"],
58
+ linkPattern: CLAUDE_CODE_PATTERN
59
+ };
60
+ }
61
+ if (config.name === "Polymarket") {
62
+ return {
63
+ ...baseOptions,
64
+ llmsPaths: ["/llms.txt"],
65
+ linkPattern: GENERIC_PATTERN
66
+ };
67
+ }
68
+ if (config.name === "Bun") {
69
+ return {
70
+ ...baseOptions,
71
+ llmsPaths: ["/docs/llms.txt", "/llms.txt"],
72
+ linkPattern: GENERIC_PATTERN
73
+ };
74
+ }
75
+ return {
76
+ ...baseOptions,
77
+ llmsPaths: config.llmsTxtPath ? [config.llmsTxtPath] : ["/llms.txt", "/docs/llms.txt"],
78
+ linkPattern: config.linkPattern || GENERIC_PATTERN
79
+ };
80
+ }
81
+ // src/scrapers/github-raw.ts
82
+ var githubRawScraper = {
83
+ type: "github-raw",
84
+ async scrape(config) {
85
+ const startTime = Date.now();
86
+ const downloaded = [];
87
+ const failed = [];
88
+ if (!config.github?.repo) {
89
+ throw new Error(`GitHub source "${config.name}" missing github.repo config`);
90
+ }
91
+ const files = await fetchGitHubMarkdownFiles(config.github.repo, config.docsPath.replace(/^\//, ""));
92
+ for (const file of files) {
93
+ const content = await fetchGitHubRawContent(config.github.repo, file.path);
94
+ if (content) {
95
+ downloaded.push({
96
+ success: true,
97
+ path: file.name,
98
+ title: extractTitle(content) || file.name.replace(".md", "")
99
+ });
100
+ await saveFile(config.outputDir, file.name, content);
101
+ } else {
102
+ failed.push({
103
+ url: `https://raw.githubusercontent.com/${config.github.repo}/main/${file.path}`,
104
+ error: "Failed to fetch content"
105
+ });
106
+ }
107
+ }
108
+ return {
109
+ downloaded,
110
+ failed,
111
+ duration: Date.now() - startTime
112
+ };
113
+ }
114
+ };
115
+ async function fetchGitHubMarkdownFiles(repo, path) {
116
+ const url = `https://api.github.com/repos/${repo}/contents/${path}`;
117
+ const response = await fetch(url, {
118
+ headers: {
119
+ Accept: "application/vnd.github.v3+json",
120
+ "User-Agent": "@ebowwa/markdown-docs-scraper"
121
+ }
122
+ });
123
+ if (!response.ok) {
124
+ throw new Error(`GitHub API error: ${response.status} ${response.statusText}`);
125
+ }
126
+ const contents = await response.json();
127
+ return contents.filter((item) => item.type === "file" && item.name.endsWith(".md"));
128
+ }
129
+ async function fetchGitHubRawContent(repo, path) {
130
+ const url = `https://raw.githubusercontent.com/${repo}/main/${path}`;
131
+ try {
132
+ const response = await fetch(url, {
133
+ headers: {
134
+ Accept: "text/plain",
135
+ "User-Agent": "@ebowwa/markdown-docs-scraper"
136
+ }
137
+ });
138
+ if (!response.ok) {
139
+ return null;
140
+ }
141
+ return await response.text();
142
+ } catch (error) {
143
+ console.error(`Error fetching ${url}:`, error);
144
+ return null;
145
+ }
146
+ }
22
147
  function extractTitle(markdown) {
148
+ const titleMatch = markdown.match(/^#\s+(.+)$/m);
149
+ return titleMatch ? titleMatch[1].trim() : null;
150
+ }
151
+ async function saveFile(outputDir, filename, content) {
152
+ const fs = await import("fs/promises");
153
+ const path = await import("path");
154
+ const outputPath = path.join(outputDir, filename);
155
+ await fs.mkdir(path.dirname(outputPath), { recursive: true });
156
+ await fs.writeFile(outputPath, content, "utf-8");
157
+ }
158
+ // src/scrapers/registry.ts
159
+ var scrapers = new Map;
160
+ function registerScraper(scraper) {
161
+ scrapers.set(scraper.type, scraper);
162
+ }
163
+ function getScraper(type) {
164
+ return scrapers.get(type);
165
+ }
166
+ async function scrapeSource(config) {
167
+ const scraper = scrapers.get(config.sourceType);
168
+ if (!scraper) {
169
+ throw new Error(`No scraper registered for type: ${config.sourceType}`);
170
+ }
171
+ return scraper.scrape(config);
172
+ }
173
+ registerScraper(llmsTxtScraper);
174
+ registerScraper(githubRawScraper);
175
+ // src/index.ts
176
+ var GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;
177
+ function extractTitle2(markdown) {
23
178
  const titleMatch = markdown.match(/^#\s+(.+)$/m);
24
179
  return titleMatch ? titleMatch[1].trim() : "Untitled";
25
180
  }
@@ -67,7 +222,8 @@ class MarkdownDocsScraper {
67
222
  onProgress: options.onProgress || (() => {}),
68
223
  llmsPaths: options.llmsPaths || ["/llms.txt", "/docs/llms.txt"],
69
224
  tryDocsSubdomain: options.tryDocsSubdomain ?? true,
70
- linkPattern: options.linkPattern || GENERIC_LINK_PATTERN
225
+ linkPattern: options.linkPattern || GENERIC_LINK_PATTERN,
226
+ useDirectUrls: options.useDirectUrls ?? true
71
227
  };
72
228
  }
73
229
  buildUrl(category, page) {
@@ -79,18 +235,18 @@ class MarkdownDocsScraper {
79
235
  return `${this.options.baseUrl}/${page}.md`;
80
236
  }
81
237
  }
82
- async downloadPage(category, page) {
83
- const url = this.buildUrl(category, page);
238
+ async downloadPage(pageInfo) {
239
+ const url = this.options.useDirectUrls && pageInfo.fullUrl ? pageInfo.fullUrl : this.buildUrl(pageInfo.category, pageInfo.page);
84
240
  const content = await fetchMarkdown(url);
85
241
  if (!content) {
86
242
  return null;
87
243
  }
88
244
  return {
89
245
  url,
90
- title: extractTitle(content),
246
+ title: extractTitle2(content),
91
247
  content,
92
- category,
93
- pageName: page
248
+ category: pageInfo.category,
249
+ pageName: pageInfo.page
94
250
  };
95
251
  }
96
252
  getLlmsUrls() {
@@ -114,20 +270,24 @@ class MarkdownDocsScraper {
114
270
  }
115
271
  async fetchLlmsTxt() {
116
272
  const urls = this.getLlmsUrls();
273
+ console.log(`DEBUG: Trying URLs: ${urls.join(", ")}`);
117
274
  for (const llmsUrl of urls) {
118
275
  try {
276
+ console.log(`DEBUG: Fetching ${llmsUrl}...`);
119
277
  const response = await fetch(llmsUrl, {
120
278
  headers: {
121
279
  Accept: "text/plain",
122
280
  "User-Agent": "@ebowwa/markdown-docs-scraper"
123
281
  }
124
282
  });
283
+ console.log(`DEBUG: Response status: ${response.status}`);
125
284
  if (response.ok) {
126
285
  const content = await response.text();
127
286
  console.log(`Found llms.txt at ${llmsUrl}`);
128
287
  return { content, url: llmsUrl };
129
288
  }
130
289
  } catch (error) {
290
+ console.log(`DEBUG: Error: ${error}`);
131
291
  continue;
132
292
  }
133
293
  }
@@ -143,13 +303,16 @@ class MarkdownDocsScraper {
143
303
  return pages;
144
304
  }
145
305
  const { content } = llmsResult;
146
- const regex = new RegExp(this.options.linkPattern.source, this.options.linkPattern.flags);
306
+ const pattern = this.options.linkPattern;
307
+ const regex = new RegExp(pattern.source, pattern.flags);
147
308
  let match;
309
+ console.log(`DEBUG: Using pattern: ${pattern.source}`);
310
+ console.log(`DEBUG: Content length: ${content.length}`);
148
311
  while ((match = regex.exec(content)) !== null) {
149
- const url = match[2];
312
+ const fullUrl = match[2];
150
313
  const pagePath = match[3];
151
314
  const { category, page } = parsePagePath(pagePath);
152
- pages.push({ category, page });
315
+ pages.push({ category, page, fullUrl });
153
316
  }
154
317
  console.log(`Discovered ${pages.length} pages from llms.txt`);
155
318
  } catch (error) {
@@ -169,14 +332,15 @@ class MarkdownDocsScraper {
169
332
  console.log(`Scraping ${pages.length} discovered pages...`);
170
333
  for (let i = 0;i < pages.length; i += this.options.concurrency) {
171
334
  const batch = pages.slice(i, i + this.options.concurrency);
172
- const results = await Promise.allSettled(batch.map((page) => this.downloadPage(page.category, page.page)));
335
+ const results = await Promise.allSettled(batch.map((page) => this.downloadPage(page)));
173
336
  results.forEach((result, index) => {
174
337
  const page = batch[index];
175
338
  if (result.status === "fulfilled" && result.value) {
176
339
  downloaded.push(result.value);
177
340
  } else {
341
+ const url = this.options.useDirectUrls && page.fullUrl ? page.fullUrl : this.buildUrl(page.category, page.page);
178
342
  failed.push({
179
- url: this.buildUrl(page.category, page.page),
343
+ url,
180
344
  error: result.status === "rejected" ? result.reason : "Not found"
181
345
  });
182
346
  }
@@ -198,7 +362,7 @@ class MarkdownDocsScraper {
198
362
  console.log(`Scraping ${total} pages from ${this.options.baseUrl}...`);
199
363
  for (let i = 0;i < pages.length; i += this.options.concurrency) {
200
364
  const batch = pages.slice(i, i + this.options.concurrency);
201
- const results = await Promise.allSettled(batch.map((page) => this.downloadPage(page.category, page.page)));
365
+ const results = await Promise.allSettled(batch.map((page) => this.downloadPage({ ...page, fullUrl: "" })));
202
366
  results.forEach((result, index) => {
203
367
  const page = batch[index];
204
368
  if (result.status === "fulfilled" && result.value) {
@@ -218,14 +382,28 @@ class MarkdownDocsScraper {
218
382
  console.log(`⏱️ Duration: ${(duration / 1000).toFixed(2)}s`);
219
383
  return { downloaded, failed, duration };
220
384
  }
385
+ extractBody(content) {
386
+ const headerRegex = /^<!--\nSource: [^\n]+\nDownloaded: [^\n]+\n-->\n\n/;
387
+ return content.replace(headerRegex, "");
388
+ }
221
389
  async savePages(pages) {
222
390
  const fs = await import("fs/promises");
223
391
  const path = await import("path");
392
+ let updated = 0;
393
+ let skipped = 0;
224
394
  for (const page of pages) {
225
395
  const nameToUse = page.pageName || page.url.split("/").pop()?.replace(".md", "") || "untitled";
226
396
  const dir = page.category ? path.join(this.options.outputDir, page.category) : this.options.outputDir;
227
397
  await fs.mkdir(dir, { recursive: true });
228
398
  const filepath = path.join(dir, `${nameToUse}.md`);
399
+ try {
400
+ const existingContent = await fs.readFile(filepath, "utf-8");
401
+ const existingBody = this.extractBody(existingContent);
402
+ if (existingBody === page.content) {
403
+ skipped++;
404
+ continue;
405
+ }
406
+ } catch {}
229
407
  const header = `<!--
230
408
  Source: ${page.url}
231
409
  Downloaded: ${new Date().toISOString()}
@@ -233,13 +411,15 @@ Downloaded: ${new Date().toISOString()}
233
411
 
234
412
  `;
235
413
  await fs.writeFile(filepath, header + page.content, "utf-8");
414
+ updated++;
236
415
  }
416
+ return { updated, skipped };
237
417
  }
238
418
  getPagesToScrape() {
239
419
  const pages = [];
240
420
  for (const [category, pageList] of Object.entries(this.options.categories)) {
241
421
  for (const page of pageList) {
242
- pages.push({ category, page });
422
+ pages.push({ category, page, fullUrl: "" });
243
423
  }
244
424
  }
245
425
  return pages;
@@ -248,22 +428,27 @@ Downloaded: ${new Date().toISOString()}
248
428
  async function scrapeMarkdownDocs(options) {
249
429
  const scraper = new MarkdownDocsScraper(options);
250
430
  const result = options.useLlms ? await scraper.scrapeFromLlms() : await scraper.scrape();
431
+ let saveStats;
251
432
  if (options.outputDir) {
252
- await scraper.savePages(result.downloaded);
433
+ saveStats = await scraper.savePages(result.downloaded);
434
+ if (saveStats.updated > 0 || saveStats.skipped > 0) {
435
+ console.log(` Saved: ${saveStats.updated} updated, ${saveStats.skipped} unchanged`);
436
+ }
253
437
  }
254
- return result;
438
+ return { ...result, saveStats };
255
439
  }
256
- var CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
257
- var GENERIC_PATTERN = GENERIC_LINK_PATTERN;
440
+ var CLAUDE_CODE_PATTERN2 = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
441
+ var GENERIC_PATTERN2 = GENERIC_LINK_PATTERN;
258
442
  function claudeCodeOptions(outputDir) {
259
443
  return {
260
444
  baseUrl: "https://code.claude.com",
261
445
  docsPath: "/docs/en",
262
446
  llmsPaths: ["/docs/llms.txt"],
263
- linkPattern: CLAUDE_CODE_PATTERN,
447
+ linkPattern: CLAUDE_CODE_PATTERN2,
264
448
  outputDir,
265
449
  concurrency: 10,
266
- tryDocsSubdomain: false
450
+ tryDocsSubdomain: false,
451
+ useDirectUrls: false
267
452
  };
268
453
  }
269
454
  function polymarketOptions(outputDir) {
@@ -271,22 +456,30 @@ function polymarketOptions(outputDir) {
271
456
  baseUrl: "https://docs.polymarket.com",
272
457
  docsPath: "",
273
458
  llmsPaths: ["/llms.txt"],
274
- linkPattern: GENERIC_PATTERN,
459
+ linkPattern: GENERIC_PATTERN2,
275
460
  outputDir,
276
461
  concurrency: 10,
277
- tryDocsSubdomain: false
462
+ tryDocsSubdomain: false,
463
+ useDirectUrls: true
278
464
  };
279
465
  }
280
466
  var src_default = MarkdownDocsScraper;
281
467
  export {
468
+ scrapeSource,
282
469
  scrapeMarkdownDocs,
470
+ registerScraper,
283
471
  polymarketOptions,
284
472
  parsePagePath,
473
+ llmsTxtScraper,
474
+ githubRawScraper,
475
+ getScraper,
285
476
  fetchMarkdown,
286
- extractTitle,
477
+ extractTitle2 as extractTitle,
287
478
  src_default as default,
288
479
  claudeCodeOptions,
480
+ GENERIC_PATTERN as SCRAPER_GENERIC_PATTERN,
481
+ CLAUDE_CODE_PATTERN as SCRAPER_CLAUDE_CODE_PATTERN,
289
482
  MarkdownDocsScraper,
290
- GENERIC_PATTERN,
291
- CLAUDE_CODE_PATTERN
483
+ GENERIC_PATTERN2 as GENERIC_PATTERN,
484
+ CLAUDE_CODE_PATTERN2 as CLAUDE_CODE_PATTERN
292
485
  };
@@ -0,0 +1,9 @@
1
+ /**
2
+ * GitHub Raw Scraper
3
+ *
4
+ * Downloads markdown files directly from GitHub repositories via raw content URLs.
5
+ * Uses GitHub API to list files, then fetches each from raw.githubusercontent.com
6
+ */
7
+ import type { Scraper } from "./types";
8
+ export declare const githubRawScraper: Scraper;
9
+ //# sourceMappingURL=github-raw.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"github-raw.d.ts","sourceRoot":"","sources":["../../src/scrapers/github-raw.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,OAAO,EAA8C,MAAM,SAAS,CAAC;AAiBnF,eAAO,MAAM,gBAAgB,EAAE,OA6C9B,CAAC"}
@@ -0,0 +1,11 @@
1
+ /**
2
+ * Scrapers Module
3
+ *
4
+ * Composable scraper architecture for multiple documentation source types.
5
+ * This module provides a registry-based system for different scraper implementations.
6
+ */
7
+ export type { SourceType, SourceConfig, Scraper, ScrapeResult, DownloadResult } from "./types";
8
+ export { llmsTxtScraper, CLAUDE_CODE_PATTERN, GENERIC_PATTERN } from "./llms-txt";
9
+ export { githubRawScraper } from "./github-raw";
10
+ export { registerScraper, getScraper, scrapeSource } from "./registry";
11
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/scrapers/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,YAAY,EAAE,UAAU,EAAE,YAAY,EAAE,OAAO,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,SAAS,CAAC;AAG/F,OAAO,EAAE,cAAc,EAAE,mBAAmB,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAClF,OAAO,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAGhD,OAAO,EAAE,eAAe,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC"}