@ebowwa/markdown-docs-scraper 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +148 -0
- package/dist/cli.js +2457 -0
- package/dist/index.js +247 -0
- package/package.json +51 -0
- package/src/cli.ts +99 -0
- package/src/index.ts +382 -0
package/src/index.ts
ADDED
|
@@ -0,0 +1,382 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @ebowwa/markdown-docs-scraper
|
|
3
|
+
*
|
|
4
|
+
* Scrape and mirror markdown-based documentation sites
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
// ============================================================================
|
|
8
|
+
// TYPES
|
|
9
|
+
// ============================================================================
|
|
10
|
+
|
|
11
|
+
/** A single scraped documentation page. */
export interface DocPage {
  /** Full URL the markdown was fetched from. */
  url: string;
  /** Title taken from the first level-1 heading, or "Untitled" when none exists. */
  title: string;
  /** Raw markdown content as fetched. */
  content: string;
  /** Category (subdirectory) the page belongs to; empty string for top-level pages. */
  category?: string;
  pageName?: string; // The page filename (e.g., "agent-teams")
}
|
|
18
|
+
|
|
19
|
+
/** Configuration for MarkdownDocsScraper. */
export interface ScraperOptions {
  /** Root of the documentation site, e.g. "https://example.com" (no trailing slash). */
  baseUrl: string;
  /** Path prefix under baseUrl where docs pages live (default "/docs/en"). */
  docsPath?: string;
  /** Map of category name -> list of page names to scrape (default {}). */
  categories?: Record<string, string[]>;
  /** Directory pages are written to by savePages (default "./docs"). */
  outputDir?: string;
  /** Number of pages fetched concurrently per batch (default 5). */
  concurrency?: number;
  /** Progress callback invoked after each page settles with (completed, total). */
  onProgress?: (current: number, total: number) => void;
}
|
|
27
|
+
|
|
28
|
+
/** Outcome of a scrape run. */
export interface ScraperResult {
  /** Pages fetched and parsed successfully. */
  downloaded: DocPage[];
  /** Pages that could not be fetched, with the URL attempted and the error text. */
  failed: Array<{ url: string; error: string }>;
  /** Wall-clock duration of the run in milliseconds. */
  duration: number;
}
|
|
33
|
+
|
|
34
|
+
// ============================================================================
|
|
35
|
+
// SCRAPER
|
|
36
|
+
// ============================================================================
|
|
37
|
+
|
|
38
|
+
export class MarkdownDocsScraper {
|
|
39
|
+
private options: Required<ScraperOptions>;
|
|
40
|
+
|
|
41
|
+
constructor(options: ScraperOptions) {
|
|
42
|
+
this.options = {
|
|
43
|
+
baseUrl: options.baseUrl,
|
|
44
|
+
docsPath: options.docsPath || "/docs/en",
|
|
45
|
+
categories: options.categories || {},
|
|
46
|
+
outputDir: options.outputDir || "./docs",
|
|
47
|
+
concurrency: options.concurrency || 5,
|
|
48
|
+
onProgress: options.onProgress || (() => {}),
|
|
49
|
+
};
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
/**
|
|
53
|
+
* Fetch markdown content from a URL
|
|
54
|
+
*/
|
|
55
|
+
async fetchMarkdown(url: string): Promise<string | null> {
|
|
56
|
+
try {
|
|
57
|
+
const response = await fetch(url, {
|
|
58
|
+
headers: {
|
|
59
|
+
Accept: "text/markdown, text/plain",
|
|
60
|
+
"User-Agent": "@ebowwa/markdown-docs-scraper",
|
|
61
|
+
},
|
|
62
|
+
});
|
|
63
|
+
|
|
64
|
+
if (!response.ok) {
|
|
65
|
+
return null;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
const contentType = response.headers.get("content-type") || "";
|
|
69
|
+
if (!contentType.includes("markdown") && !contentType.includes("text/plain")) {
|
|
70
|
+
// Try to parse anyway - some sites return incorrect content-type
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
return await response.text();
|
|
74
|
+
} catch (error) {
|
|
75
|
+
console.error(`Error fetching ${url}:`, error);
|
|
76
|
+
return null;
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
/**
|
|
81
|
+
* Extract title from markdown content
|
|
82
|
+
*/
|
|
83
|
+
extractTitle(markdown: string): string {
|
|
84
|
+
const titleMatch = markdown.match(/^#\s+(.+)$/m);
|
|
85
|
+
return titleMatch ? titleMatch[1].trim() : "Untitled";
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
/**
|
|
89
|
+
* Sanitize filename from URL path
|
|
90
|
+
*/
|
|
91
|
+
sanitizeFilename(path: string): string {
|
|
92
|
+
return path
|
|
93
|
+
.toLowerCase()
|
|
94
|
+
.replace(/[^a-z0-9/]+/g, "-")
|
|
95
|
+
.replace(/^-|-$/g, "")
|
|
96
|
+
.replace(/\//g, "/");
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
/**
|
|
100
|
+
* Build URL for a documentation page
|
|
101
|
+
*/
|
|
102
|
+
buildUrl(category: string, page: string): string {
|
|
103
|
+
if (category) {
|
|
104
|
+
return `${this.options.baseUrl}${this.options.docsPath}/${category}/${page}.md`;
|
|
105
|
+
} else {
|
|
106
|
+
return `${this.options.baseUrl}${this.options.docsPath}/${page}.md`;
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
/**
|
|
111
|
+
* Download a single documentation page
|
|
112
|
+
*/
|
|
113
|
+
async downloadPage(category: string, page: string): Promise<DocPage | null> {
|
|
114
|
+
const url = this.buildUrl(category, page);
|
|
115
|
+
const content = await this.fetchMarkdown(url);
|
|
116
|
+
|
|
117
|
+
if (!content) {
|
|
118
|
+
return null;
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
return {
|
|
122
|
+
url,
|
|
123
|
+
title: this.extractTitle(content),
|
|
124
|
+
content,
|
|
125
|
+
category,
|
|
126
|
+
pageName: page, // Store the page name for saving
|
|
127
|
+
};
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
/**
|
|
131
|
+
* Scrape pages discovered from llms.txt
|
|
132
|
+
*/
|
|
133
|
+
async scrapeFromLlms(): Promise<ScraperResult> {
|
|
134
|
+
const startTime = Date.now();
|
|
135
|
+
const downloaded: DocPage[] = [];
|
|
136
|
+
const failed: Array<{ url: string; error: string }> = [];
|
|
137
|
+
|
|
138
|
+
const pages = await this.discoverPages();
|
|
139
|
+
|
|
140
|
+
if (pages.length === 0) {
|
|
141
|
+
console.log("No pages discovered, falling back to categories");
|
|
142
|
+
return this.scrape();
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
console.log(`Scraping ${pages.length} discovered pages...`);
|
|
146
|
+
|
|
147
|
+
// Process pages in batches
|
|
148
|
+
for (let i = 0; i < pages.length; i += this.options.concurrency) {
|
|
149
|
+
const batch = pages.slice(i, i + this.options.concurrency);
|
|
150
|
+
const results = await Promise.allSettled(
|
|
151
|
+
batch.map((page) => this.downloadPage(page.category, page.page))
|
|
152
|
+
);
|
|
153
|
+
|
|
154
|
+
results.forEach((result, index) => {
|
|
155
|
+
const page = batch[index];
|
|
156
|
+
if (result.status === "fulfilled" && result.value) {
|
|
157
|
+
downloaded.push(result.value);
|
|
158
|
+
} else {
|
|
159
|
+
failed.push({
|
|
160
|
+
url: this.buildUrl(page.category, page.page),
|
|
161
|
+
error: result.status === "rejected" ? (result.reason as string) : "Not found",
|
|
162
|
+
});
|
|
163
|
+
}
|
|
164
|
+
this.options.onProgress(downloaded.length + failed.length, pages.length);
|
|
165
|
+
});
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
const duration = Date.now() - startTime;
|
|
169
|
+
|
|
170
|
+
console.log(`✅ Downloaded: ${downloaded.length} pages`);
|
|
171
|
+
console.log(`❌ Failed: ${failed.length} pages`);
|
|
172
|
+
console.log(`⏱️ Duration: ${(duration / 1000).toFixed(2)}s`);
|
|
173
|
+
|
|
174
|
+
return { downloaded, failed, duration };
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
/**
|
|
178
|
+
* Scrape all documentation pages
|
|
179
|
+
*/
|
|
180
|
+
async scrape(): Promise<ScraperResult> {
|
|
181
|
+
const startTime = Date.now();
|
|
182
|
+
const downloaded: DocPage[] = [];
|
|
183
|
+
const failed: Array<{ url: string; error: string }> = [];
|
|
184
|
+
|
|
185
|
+
const pages = this.getPagesToScrape();
|
|
186
|
+
const total = pages.length;
|
|
187
|
+
|
|
188
|
+
console.log(`Scraping ${total} pages from ${this.options.baseUrl}...`);
|
|
189
|
+
|
|
190
|
+
// Process pages in batches
|
|
191
|
+
for (let i = 0; i < pages.length; i += this.options.concurrency) {
|
|
192
|
+
const batch = pages.slice(i, i + this.options.concurrency);
|
|
193
|
+
const results = await Promise.allSettled(
|
|
194
|
+
batch.map((page) => this.downloadPage(page.category, page.page))
|
|
195
|
+
);
|
|
196
|
+
|
|
197
|
+
results.forEach((result, index) => {
|
|
198
|
+
const page = batch[index];
|
|
199
|
+
if (result.status === "fulfilled" && result.value) {
|
|
200
|
+
downloaded.push(result.value);
|
|
201
|
+
} else {
|
|
202
|
+
failed.push({
|
|
203
|
+
url: this.buildUrl(page.category, page.page),
|
|
204
|
+
error: result.status === "rejected" ? result.reason : "Not found",
|
|
205
|
+
});
|
|
206
|
+
}
|
|
207
|
+
this.options.onProgress(downloaded.length + failed.length, total);
|
|
208
|
+
});
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
const duration = Date.now() - startTime;
|
|
212
|
+
|
|
213
|
+
console.log(`✅ Downloaded: ${downloaded.length} pages`);
|
|
214
|
+
console.log(`❌ Failed: ${failed.length} pages`);
|
|
215
|
+
console.log(`⏱️ Duration: ${(duration / 1000).toFixed(2)}s`);
|
|
216
|
+
|
|
217
|
+
return { downloaded, failed, duration };
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
/**
|
|
221
|
+
* Save scraped pages to disk
|
|
222
|
+
*/
|
|
223
|
+
async savePages(pages: DocPage[]): Promise<void> {
|
|
224
|
+
const fs = await import("fs/promises");
|
|
225
|
+
const path = await import("path");
|
|
226
|
+
|
|
227
|
+
for (const page of pages) {
|
|
228
|
+
// Use pageName if available, otherwise extract from URL
|
|
229
|
+
const nameToUse = page.pageName || page.url.split("/").pop()?.replace(".md", "") || "untitled";
|
|
230
|
+
|
|
231
|
+
const dir = page.category
|
|
232
|
+
? path.join(this.options.outputDir, page.category)
|
|
233
|
+
: this.options.outputDir;
|
|
234
|
+
|
|
235
|
+
await fs.mkdir(dir, { recursive: true });
|
|
236
|
+
|
|
237
|
+
const filepath = path.join(dir, `${nameToUse}.md`);
|
|
238
|
+
|
|
239
|
+
const header = `<!--\nSource: ${page.url}\nDownloaded: ${new Date().toISOString()}\n-->\n\n`;
|
|
240
|
+
await fs.writeFile(filepath, header + page.content, "utf-8");
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
/**
|
|
245
|
+
* Get list of pages to scrape based on categories
|
|
246
|
+
*/
|
|
247
|
+
private getPagesToScrape(): Array<{ category: string; page: string }> {
|
|
248
|
+
const pages: Array<{ category: string; page: string }> = [];
|
|
249
|
+
|
|
250
|
+
for (const [category, pageList] of Object.entries(this.options.categories)) {
|
|
251
|
+
for (const page of pageList) {
|
|
252
|
+
pages.push({ category, page });
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
return pages;
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
/**
|
|
260
|
+
* Discover pages from llms.txt index
|
|
261
|
+
*/
|
|
262
|
+
async discoverPages(): Promise<Array<{ category: string; page: string }>> {
|
|
263
|
+
const pages: Array<{ category: string; page: string }> = [];
|
|
264
|
+
|
|
265
|
+
try {
|
|
266
|
+
const llmsUrl = `${this.options.baseUrl}/docs/llms.txt`;
|
|
267
|
+
const response = await fetch(llmsUrl, {
|
|
268
|
+
headers: {
|
|
269
|
+
Accept: "text/plain",
|
|
270
|
+
"User-Agent": "@ebowwa/markdown-docs-scraper",
|
|
271
|
+
},
|
|
272
|
+
});
|
|
273
|
+
|
|
274
|
+
if (!response.ok) {
|
|
275
|
+
console.warn(`Could not fetch llms.txt from ${llmsUrl}`);
|
|
276
|
+
return pages;
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
const content = await response.text();
|
|
280
|
+
|
|
281
|
+
// Parse markdown links in format: [title](https://code.claude.com/docs/en/page.md)
|
|
282
|
+
const linkRegex = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
|
|
283
|
+
let match;
|
|
284
|
+
|
|
285
|
+
while ((match = linkRegex.exec(content)) !== null) {
|
|
286
|
+
const url = match[2];
|
|
287
|
+
const pagePath = match[3]; // e.g., "agent-teams.md" or "category/page.md"
|
|
288
|
+
|
|
289
|
+
// Remove .md extension
|
|
290
|
+
const pageName = pagePath.replace(".md", "");
|
|
291
|
+
|
|
292
|
+
// Check if there's a category in the path
|
|
293
|
+
const pathParts = pageName.split("/");
|
|
294
|
+
|
|
295
|
+
if (pathParts.length === 1) {
|
|
296
|
+
// No category: just "page-name"
|
|
297
|
+
pages.push({ category: "", page: pathParts[0] });
|
|
298
|
+
} else if (pathParts.length === 2) {
|
|
299
|
+
// Has category: "category/page-name"
|
|
300
|
+
pages.push({ category: pathParts[0], page: pathParts[1] });
|
|
301
|
+
} else {
|
|
302
|
+
// Deeper path: join everything except last as category
|
|
303
|
+
const category = pathParts.slice(0, -1).join("/");
|
|
304
|
+
const page = pathParts[pathParts.length - 1];
|
|
305
|
+
pages.push({ category, page });
|
|
306
|
+
}
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
console.log(`Discovered ${pages.length} pages from llms.txt`);
|
|
310
|
+
} catch (error) {
|
|
311
|
+
console.error("Error discovering pages:", error);
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
return pages;
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
/**
|
|
318
|
+
* Discover additional pages by parsing the docs index (fallback)
|
|
319
|
+
*/
|
|
320
|
+
async discoverPagesHtml(): Promise<string[]> {
|
|
321
|
+
const discovered: string[] = [];
|
|
322
|
+
|
|
323
|
+
try {
|
|
324
|
+
const indexUrl = `${this.options.baseUrl}${this.options.docsPath}`;
|
|
325
|
+
const response = await fetch(indexUrl, {
|
|
326
|
+
headers: {
|
|
327
|
+
Accept: "text/html",
|
|
328
|
+
"User-Agent": "@ebowwa/markdown-docs-scraper",
|
|
329
|
+
},
|
|
330
|
+
});
|
|
331
|
+
|
|
332
|
+
if (!response.ok) {
|
|
333
|
+
return discovered;
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
const html = await response.text();
|
|
337
|
+
const mdLinkRegex = /href="\/docs\/en\/([^"]+\.md)"/g;
|
|
338
|
+
let match;
|
|
339
|
+
|
|
340
|
+
while ((match = mdLinkRegex.exec(html)) !== null) {
|
|
341
|
+
const path = match[1];
|
|
342
|
+
if (!discovered.includes(path)) {
|
|
343
|
+
discovered.push(path);
|
|
344
|
+
}
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
console.log(`Discovered ${discovered.length} additional pages from HTML`);
|
|
348
|
+
} catch (error) {
|
|
349
|
+
console.error("Error discovering pages from HTML:", error);
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
return discovered;
|
|
353
|
+
}
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
// ============================================================================
|
|
357
|
+
// CONVENIENCE FUNCTION
|
|
358
|
+
// ============================================================================
|
|
359
|
+
|
|
360
|
+
/**
|
|
361
|
+
* Scrape markdown documentation with a single function call
|
|
362
|
+
*/
|
|
363
|
+
export async function scrapeMarkdownDocs(
|
|
364
|
+
options: ScraperOptions & { useLlms?: boolean }
|
|
365
|
+
): Promise<ScraperResult> {
|
|
366
|
+
const scraper = new MarkdownDocsScraper(options);
|
|
367
|
+
const result = options.useLlms
|
|
368
|
+
? await scraper.scrapeFromLlms()
|
|
369
|
+
: await scraper.scrape();
|
|
370
|
+
|
|
371
|
+
if (options.outputDir) {
|
|
372
|
+
await scraper.savePages(result.downloaded);
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
return result;
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
// ============================================================================
|
|
379
|
+
// EXPORTS
|
|
380
|
+
// ============================================================================
|
|
381
|
+
|
|
382
|
+
// Default export for convenience; all symbols are also available as named exports.
export default MarkdownDocsScraper;
|