@ebowwa/markdown-docs-scraper 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts ADDED
@@ -0,0 +1,382 @@
1
+ /**
2
+ * @ebowwa/markdown-docs-scraper
3
+ *
4
+ * Scrape and mirror markdown-based documentation sites
5
+ */
6
+
7
+ // ============================================================================
8
+ // TYPES
9
+ // ============================================================================
10
+
11
/** A single downloaded documentation page. */
export interface DocPage {
  /** Absolute URL the markdown was fetched from. */
  url: string;
  /** First `# …` heading found in the markdown, or "Untitled". */
  title: string;
  /** Raw markdown content of the page. */
  content: string;
  /** Category path segment(s); empty string or absent for top-level pages. */
  category?: string;
  pageName?: string; // The page filename (e.g., "agent-teams")
}
18
+
19
/** Configuration for MarkdownDocsScraper; all fields except baseUrl have defaults. */
export interface ScraperOptions {
  /** Root of the docs site, concatenated directly with docsPath when building URLs. */
  baseUrl: string;
  /** Path prefix for doc pages; defaults to "/docs/en". */
  docsPath?: string;
  /** Map of category name -> page names to scrape; defaults to {}. */
  categories?: Record<string, string[]>;
  /** Directory pages are saved into; defaults to "./docs". */
  outputDir?: string;
  /** Maximum number of pages fetched in parallel per batch; defaults to 5. */
  concurrency?: number;
  /** Invoked after each page settles with (completedSoFar, total); defaults to a no-op. */
  onProgress?: (current: number, total: number) => void;
}
27
+
28
/** Outcome of a scrape run. */
export interface ScraperResult {
  /** Pages that were fetched successfully. */
  downloaded: DocPage[];
  /** Pages that could not be fetched, with the failure reason. */
  failed: Array<{ url: string; error: string }>;
  /** Wall-clock time of the run in milliseconds. */
  duration: number;
}
33
+
34
+ // ============================================================================
35
+ // SCRAPER
36
+ // ============================================================================
37
+
38
+ export class MarkdownDocsScraper {
39
+ private options: Required<ScraperOptions>;
40
+
41
+ constructor(options: ScraperOptions) {
42
+ this.options = {
43
+ baseUrl: options.baseUrl,
44
+ docsPath: options.docsPath || "/docs/en",
45
+ categories: options.categories || {},
46
+ outputDir: options.outputDir || "./docs",
47
+ concurrency: options.concurrency || 5,
48
+ onProgress: options.onProgress || (() => {}),
49
+ };
50
+ }
51
+
52
+ /**
53
+ * Fetch markdown content from a URL
54
+ */
55
+ async fetchMarkdown(url: string): Promise<string | null> {
56
+ try {
57
+ const response = await fetch(url, {
58
+ headers: {
59
+ Accept: "text/markdown, text/plain",
60
+ "User-Agent": "@ebowwa/markdown-docs-scraper",
61
+ },
62
+ });
63
+
64
+ if (!response.ok) {
65
+ return null;
66
+ }
67
+
68
+ const contentType = response.headers.get("content-type") || "";
69
+ if (!contentType.includes("markdown") && !contentType.includes("text/plain")) {
70
+ // Try to parse anyway - some sites return incorrect content-type
71
+ }
72
+
73
+ return await response.text();
74
+ } catch (error) {
75
+ console.error(`Error fetching ${url}:`, error);
76
+ return null;
77
+ }
78
+ }
79
+
80
+ /**
81
+ * Extract title from markdown content
82
+ */
83
+ extractTitle(markdown: string): string {
84
+ const titleMatch = markdown.match(/^#\s+(.+)$/m);
85
+ return titleMatch ? titleMatch[1].trim() : "Untitled";
86
+ }
87
+
88
+ /**
89
+ * Sanitize filename from URL path
90
+ */
91
+ sanitizeFilename(path: string): string {
92
+ return path
93
+ .toLowerCase()
94
+ .replace(/[^a-z0-9/]+/g, "-")
95
+ .replace(/^-|-$/g, "")
96
+ .replace(/\//g, "/");
97
+ }
98
+
99
+ /**
100
+ * Build URL for a documentation page
101
+ */
102
+ buildUrl(category: string, page: string): string {
103
+ if (category) {
104
+ return `${this.options.baseUrl}${this.options.docsPath}/${category}/${page}.md`;
105
+ } else {
106
+ return `${this.options.baseUrl}${this.options.docsPath}/${page}.md`;
107
+ }
108
+ }
109
+
110
+ /**
111
+ * Download a single documentation page
112
+ */
113
+ async downloadPage(category: string, page: string): Promise<DocPage | null> {
114
+ const url = this.buildUrl(category, page);
115
+ const content = await this.fetchMarkdown(url);
116
+
117
+ if (!content) {
118
+ return null;
119
+ }
120
+
121
+ return {
122
+ url,
123
+ title: this.extractTitle(content),
124
+ content,
125
+ category,
126
+ pageName: page, // Store the page name for saving
127
+ };
128
+ }
129
+
130
+ /**
131
+ * Scrape pages discovered from llms.txt
132
+ */
133
+ async scrapeFromLlms(): Promise<ScraperResult> {
134
+ const startTime = Date.now();
135
+ const downloaded: DocPage[] = [];
136
+ const failed: Array<{ url: string; error: string }> = [];
137
+
138
+ const pages = await this.discoverPages();
139
+
140
+ if (pages.length === 0) {
141
+ console.log("No pages discovered, falling back to categories");
142
+ return this.scrape();
143
+ }
144
+
145
+ console.log(`Scraping ${pages.length} discovered pages...`);
146
+
147
+ // Process pages in batches
148
+ for (let i = 0; i < pages.length; i += this.options.concurrency) {
149
+ const batch = pages.slice(i, i + this.options.concurrency);
150
+ const results = await Promise.allSettled(
151
+ batch.map((page) => this.downloadPage(page.category, page.page))
152
+ );
153
+
154
+ results.forEach((result, index) => {
155
+ const page = batch[index];
156
+ if (result.status === "fulfilled" && result.value) {
157
+ downloaded.push(result.value);
158
+ } else {
159
+ failed.push({
160
+ url: this.buildUrl(page.category, page.page),
161
+ error: result.status === "rejected" ? (result.reason as string) : "Not found",
162
+ });
163
+ }
164
+ this.options.onProgress(downloaded.length + failed.length, pages.length);
165
+ });
166
+ }
167
+
168
+ const duration = Date.now() - startTime;
169
+
170
+ console.log(`✅ Downloaded: ${downloaded.length} pages`);
171
+ console.log(`❌ Failed: ${failed.length} pages`);
172
+ console.log(`⏱️ Duration: ${(duration / 1000).toFixed(2)}s`);
173
+
174
+ return { downloaded, failed, duration };
175
+ }
176
+
177
+ /**
178
+ * Scrape all documentation pages
179
+ */
180
+ async scrape(): Promise<ScraperResult> {
181
+ const startTime = Date.now();
182
+ const downloaded: DocPage[] = [];
183
+ const failed: Array<{ url: string; error: string }> = [];
184
+
185
+ const pages = this.getPagesToScrape();
186
+ const total = pages.length;
187
+
188
+ console.log(`Scraping ${total} pages from ${this.options.baseUrl}...`);
189
+
190
+ // Process pages in batches
191
+ for (let i = 0; i < pages.length; i += this.options.concurrency) {
192
+ const batch = pages.slice(i, i + this.options.concurrency);
193
+ const results = await Promise.allSettled(
194
+ batch.map((page) => this.downloadPage(page.category, page.page))
195
+ );
196
+
197
+ results.forEach((result, index) => {
198
+ const page = batch[index];
199
+ if (result.status === "fulfilled" && result.value) {
200
+ downloaded.push(result.value);
201
+ } else {
202
+ failed.push({
203
+ url: this.buildUrl(page.category, page.page),
204
+ error: result.status === "rejected" ? result.reason : "Not found",
205
+ });
206
+ }
207
+ this.options.onProgress(downloaded.length + failed.length, total);
208
+ });
209
+ }
210
+
211
+ const duration = Date.now() - startTime;
212
+
213
+ console.log(`✅ Downloaded: ${downloaded.length} pages`);
214
+ console.log(`❌ Failed: ${failed.length} pages`);
215
+ console.log(`⏱️ Duration: ${(duration / 1000).toFixed(2)}s`);
216
+
217
+ return { downloaded, failed, duration };
218
+ }
219
+
220
+ /**
221
+ * Save scraped pages to disk
222
+ */
223
+ async savePages(pages: DocPage[]): Promise<void> {
224
+ const fs = await import("fs/promises");
225
+ const path = await import("path");
226
+
227
+ for (const page of pages) {
228
+ // Use pageName if available, otherwise extract from URL
229
+ const nameToUse = page.pageName || page.url.split("/").pop()?.replace(".md", "") || "untitled";
230
+
231
+ const dir = page.category
232
+ ? path.join(this.options.outputDir, page.category)
233
+ : this.options.outputDir;
234
+
235
+ await fs.mkdir(dir, { recursive: true });
236
+
237
+ const filepath = path.join(dir, `${nameToUse}.md`);
238
+
239
+ const header = `<!--\nSource: ${page.url}\nDownloaded: ${new Date().toISOString()}\n-->\n\n`;
240
+ await fs.writeFile(filepath, header + page.content, "utf-8");
241
+ }
242
+ }
243
+
244
+ /**
245
+ * Get list of pages to scrape based on categories
246
+ */
247
+ private getPagesToScrape(): Array<{ category: string; page: string }> {
248
+ const pages: Array<{ category: string; page: string }> = [];
249
+
250
+ for (const [category, pageList] of Object.entries(this.options.categories)) {
251
+ for (const page of pageList) {
252
+ pages.push({ category, page });
253
+ }
254
+ }
255
+
256
+ return pages;
257
+ }
258
+
259
+ /**
260
+ * Discover pages from llms.txt index
261
+ */
262
+ async discoverPages(): Promise<Array<{ category: string; page: string }>> {
263
+ const pages: Array<{ category: string; page: string }> = [];
264
+
265
+ try {
266
+ const llmsUrl = `${this.options.baseUrl}/docs/llms.txt`;
267
+ const response = await fetch(llmsUrl, {
268
+ headers: {
269
+ Accept: "text/plain",
270
+ "User-Agent": "@ebowwa/markdown-docs-scraper",
271
+ },
272
+ });
273
+
274
+ if (!response.ok) {
275
+ console.warn(`Could not fetch llms.txt from ${llmsUrl}`);
276
+ return pages;
277
+ }
278
+
279
+ const content = await response.text();
280
+
281
+ // Parse markdown links in format: [title](https://code.claude.com/docs/en/page.md)
282
+ const linkRegex = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
283
+ let match;
284
+
285
+ while ((match = linkRegex.exec(content)) !== null) {
286
+ const url = match[2];
287
+ const pagePath = match[3]; // e.g., "agent-teams.md" or "category/page.md"
288
+
289
+ // Remove .md extension
290
+ const pageName = pagePath.replace(".md", "");
291
+
292
+ // Check if there's a category in the path
293
+ const pathParts = pageName.split("/");
294
+
295
+ if (pathParts.length === 1) {
296
+ // No category: just "page-name"
297
+ pages.push({ category: "", page: pathParts[0] });
298
+ } else if (pathParts.length === 2) {
299
+ // Has category: "category/page-name"
300
+ pages.push({ category: pathParts[0], page: pathParts[1] });
301
+ } else {
302
+ // Deeper path: join everything except last as category
303
+ const category = pathParts.slice(0, -1).join("/");
304
+ const page = pathParts[pathParts.length - 1];
305
+ pages.push({ category, page });
306
+ }
307
+ }
308
+
309
+ console.log(`Discovered ${pages.length} pages from llms.txt`);
310
+ } catch (error) {
311
+ console.error("Error discovering pages:", error);
312
+ }
313
+
314
+ return pages;
315
+ }
316
+
317
+ /**
318
+ * Discover additional pages by parsing the docs index (fallback)
319
+ */
320
+ async discoverPagesHtml(): Promise<string[]> {
321
+ const discovered: string[] = [];
322
+
323
+ try {
324
+ const indexUrl = `${this.options.baseUrl}${this.options.docsPath}`;
325
+ const response = await fetch(indexUrl, {
326
+ headers: {
327
+ Accept: "text/html",
328
+ "User-Agent": "@ebowwa/markdown-docs-scraper",
329
+ },
330
+ });
331
+
332
+ if (!response.ok) {
333
+ return discovered;
334
+ }
335
+
336
+ const html = await response.text();
337
+ const mdLinkRegex = /href="\/docs\/en\/([^"]+\.md)"/g;
338
+ let match;
339
+
340
+ while ((match = mdLinkRegex.exec(html)) !== null) {
341
+ const path = match[1];
342
+ if (!discovered.includes(path)) {
343
+ discovered.push(path);
344
+ }
345
+ }
346
+
347
+ console.log(`Discovered ${discovered.length} additional pages from HTML`);
348
+ } catch (error) {
349
+ console.error("Error discovering pages from HTML:", error);
350
+ }
351
+
352
+ return discovered;
353
+ }
354
+ }
355
+
356
+ // ============================================================================
357
+ // CONVENIENCE FUNCTION
358
+ // ============================================================================
359
+
360
+ /**
361
+ * Scrape markdown documentation with a single function call
362
+ */
363
+ export async function scrapeMarkdownDocs(
364
+ options: ScraperOptions & { useLlms?: boolean }
365
+ ): Promise<ScraperResult> {
366
+ const scraper = new MarkdownDocsScraper(options);
367
+ const result = options.useLlms
368
+ ? await scraper.scrapeFromLlms()
369
+ : await scraper.scrape();
370
+
371
+ if (options.outputDir) {
372
+ await scraper.savePages(result.downloaded);
373
+ }
374
+
375
+ return result;
376
+ }
377
+
378
+ // ============================================================================
379
+ // EXPORTS
380
+ // ============================================================================
381
+
382
+ export default MarkdownDocsScraper;