@ebowwa/markdown-docs-scraper 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +148 -0
- package/dist/cli.js +2457 -0
- package/dist/index.js +247 -0
- package/package.json +51 -0
- package/src/cli.ts +99 -0
- package/src/index.ts +382 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
// Bun bundler runtime preamble: CommonJS -> ESM interop helpers (generated code).
import { createRequire } from "node:module";
var __create = Object.create;
var __getProtoOf = Object.getPrototypeOf;
var __defProp = Object.defineProperty;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;
// Wrap a CommonJS module so it can be consumed as an ES module namespace:
// unless the module already declares __esModule (or isNodeMode is set), the
// whole CJS exports object is attached as the `default` export, and every own
// property of the module is re-exposed as a live, enumerable getter.
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  const to = isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target;
  for (let key of __getOwnPropNames(mod))
    if (!__hasOwnProp.call(to, key))
      __defProp(to, key, {
        get: () => mod[key],
        enumerable: true
      });
  return to;
};
// require() shim usable from ESM; resolves module specifiers relative to this file.
var __require = /* @__PURE__ */ createRequire(import.meta.url);
|
|
19
|
+
|
|
20
|
+
// src/index.ts
|
|
21
|
+
// src/index.ts
/**
 * Scrapes a markdown-based documentation site — one that serves raw `.md`
 * files alongside its HTML pages — and can mirror the pages to disk.
 *
 * Page lists come either from an explicit `categories` map (scrape()) or
 * from the site's llms.txt index (scrapeFromLlms()).
 */
class MarkdownDocsScraper {
  options;

  constructor(options) {
    // Normalize once so every later access can assume a complete options object.
    this.options = {
      baseUrl: options.baseUrl,
      docsPath: options.docsPath || "/docs/en",
      categories: options.categories || {},
      outputDir: options.outputDir || "./docs",
      concurrency: options.concurrency || 5,
      onProgress: options.onProgress || (() => {})
    };
  }

  /**
   * Fetch one markdown document. Returns the body text, or null on any HTTP
   * or network failure (errors are logged, never thrown).
   */
  async fetchMarkdown(url) {
    try {
      const response = await fetch(url, {
        headers: {
          Accept: "text/markdown, text/plain",
          "User-Agent": "@ebowwa/markdown-docs-scraper"
        }
      });
      if (!response.ok) {
        return null;
      }
      // NOTE: the content-type is deliberately not enforced — some sites serve
      // markdown as text/html. (A previous version read the header and then
      // tested it against an empty block; that dead code is removed.)
      return await response.text();
    } catch (error) {
      console.error(`Error fetching ${url}:`, error);
      return null;
    }
  }

  /** First level-1 heading of the document, or "Untitled" when none exists. */
  extractTitle(markdown) {
    const titleMatch = markdown.match(/^#\s+(.+)$/m);
    return titleMatch ? titleMatch[1].trim() : "Untitled";
  }

  /**
   * Turn an arbitrary path into a safe lowercase dash-separated filename,
   * preserving "/" as a directory separator.
   * BUGFIX: dropped a trailing `.replace(/\//g, "/")` that replaced slashes
   * with slashes (a no-op).
   */
  sanitizeFilename(path) {
    return path.toLowerCase().replace(/[^a-z0-9/]+/g, "-").replace(/^-|-$/g, "");
  }

  /** Absolute URL of the raw markdown for a page; category may be "". */
  buildUrl(category, page) {
    if (category) {
      return `${this.options.baseUrl}${this.options.docsPath}/${category}/${page}.md`;
    } else {
      return `${this.options.baseUrl}${this.options.docsPath}/${page}.md`;
    }
  }

  /** Download one page and wrap it with metadata; null when it could not be fetched. */
  async downloadPage(category, page) {
    const url = this.buildUrl(category, page);
    const content = await this.fetchMarkdown(url);
    if (!content) {
      return null;
    }
    return {
      url,
      title: this.extractTitle(content),
      content,
      category,
      pageName: page
    };
  }

  /**
   * Scrape using the page list advertised in the site's llms.txt.
   * Falls back to the configured categories when nothing is discovered.
   */
  async scrapeFromLlms() {
    const startTime = Date.now();
    const pages = await this.discoverPages();
    if (pages.length === 0) {
      console.log("No pages discovered, falling back to categories");
      return this.scrape();
    }
    console.log(`Scraping ${pages.length} discovered pages...`);
    return this._collectPages(pages, startTime);
  }

  /** Scrape the pages listed explicitly in `options.categories`. */
  async scrape() {
    const startTime = Date.now();
    const pages = this.getPagesToScrape();
    console.log(`Scraping ${pages.length} pages from ${this.options.baseUrl}...`);
    return this._collectPages(pages, startTime);
  }

  /**
   * Shared batch driver for scrape()/scrapeFromLlms(): downloads `pages` in
   * groups of `concurrency`, reporting progress after each settled page.
   * (This loop was previously duplicated verbatim in both callers.)
   * `startTime` is supplied by the caller so reported duration includes any
   * discovery work, matching the original behavior.
   */
  async _collectPages(pages, startTime) {
    const downloaded = [];
    const failed = [];
    const total = pages.length;
    for (let i = 0; i < total; i += this.options.concurrency) {
      const batch = pages.slice(i, i + this.options.concurrency);
      const results = await Promise.allSettled(batch.map((page) => this.downloadPage(page.category, page.page)));
      results.forEach((result, index) => {
        const page = batch[index];
        if (result.status === "fulfilled" && result.value) {
          downloaded.push(result.value);
        } else {
          failed.push({
            url: this.buildUrl(page.category, page.page),
            error: result.status === "rejected" ? result.reason : "Not found"
          });
        }
        this.options.onProgress(downloaded.length + failed.length, total);
      });
    }
    const duration = Date.now() - startTime;
    console.log(`✅ Downloaded: ${downloaded.length} pages`);
    console.log(`❌ Failed: ${failed.length} pages`);
    console.log(`⏱️ Duration: ${(duration / 1000).toFixed(2)}s`);
    return { downloaded, failed, duration };
  }

  /** Write each downloaded page under outputDir/<category>/<page>.md with a provenance header. */
  async savePages(pages) {
    const fs = await import("fs/promises");
    const path = await import("path");
    for (const page of pages) {
      // BUGFIX: anchor the .md strip to the end of the name (".md" could
      // otherwise be removed from the middle of a filename).
      const nameToUse = page.pageName || page.url.split("/").pop()?.replace(/\.md$/, "") || "untitled";
      const dir = page.category ? path.join(this.options.outputDir, page.category) : this.options.outputDir;
      await fs.mkdir(dir, { recursive: true });
      const filepath = path.join(dir, `${nameToUse}.md`);
      const header = `<!--
Source: ${page.url}
Downloaded: ${new Date().toISOString()}
-->

`;
      await fs.writeFile(filepath, header + page.content, "utf-8");
    }
  }

  /** Flatten options.categories into a [{category, page}] work list. */
  getPagesToScrape() {
    const pages = [];
    for (const [category, pageList] of Object.entries(this.options.categories)) {
      for (const page of pageList) {
        pages.push({ category, page });
      }
    }
    return pages;
  }

  /**
   * Discover pages by parsing markdown links out of the site's llms.txt.
   * Returns [{category, page}] (category "" for top-level pages, nested
   * categories joined with "/"); returns [] on any failure.
   */
  async discoverPages() {
    const pages = [];
    try {
      // NOTE: llms.txt is conventionally served from /docs/llms.txt regardless
      // of docsPath — TODO confirm against other target sites.
      const llmsUrl = `${this.options.baseUrl}/docs/llms.txt`;
      const response = await fetch(llmsUrl, {
        headers: {
          Accept: "text/plain",
          "User-Agent": "@ebowwa/markdown-docs-scraper"
        }
      });
      if (!response.ok) {
        console.warn(`Could not fetch llms.txt from ${llmsUrl}`);
        return pages;
      }
      const content = await response.text();
      // BUGFIX: the link pattern previously hard-coded "/docs/en"; build it
      // from options.docsPath (regex-escaped) so non-default paths work.
      const escapedDocsPath = this.options.docsPath.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
      const linkRegex = new RegExp(`\\[([^\\]]+)\\]\\((https?://[^\\s)]+${escapedDocsPath}/([^)]+\\.md))\\)`, "g");
      let match;
      while ((match = linkRegex.exec(content)) !== null) {
        const pagePath = match[3];
        const pageName = pagePath.replace(/\.md$/, "");
        const pathParts = pageName.split("/");
        if (pathParts.length === 1) {
          pages.push({ category: "", page: pathParts[0] });
        } else {
          // Everything but the last segment is the (possibly nested) category.
          const category = pathParts.slice(0, -1).join("/");
          pages.push({ category, page: pathParts[pathParts.length - 1] });
        }
      }
      console.log(`Discovered ${pages.length} pages from llms.txt`);
    } catch (error) {
      console.error("Error discovering pages:", error);
    }
    return pages;
  }

  /**
   * Fallback discovery: scan the docs index HTML for hrefs pointing at raw
   * .md files. Returns de-duplicated paths relative to docsPath.
   */
  async discoverPagesHtml() {
    const discovered = [];
    try {
      const indexUrl = `${this.options.baseUrl}${this.options.docsPath}`;
      const response = await fetch(indexUrl, {
        headers: {
          Accept: "text/html",
          "User-Agent": "@ebowwa/markdown-docs-scraper"
        }
      });
      if (!response.ok) {
        return discovered;
      }
      const html = await response.text();
      // BUGFIX: previously hard-coded "/docs/en"; honor options.docsPath.
      const escapedDocsPath = this.options.docsPath.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
      const mdLinkRegex = new RegExp(`href="${escapedDocsPath}/([^"]+\\.md)"`, "g");
      let match;
      while ((match = mdLinkRegex.exec(html)) !== null) {
        const path = match[1];
        if (!discovered.includes(path)) {
          discovered.push(path);
        }
      }
      console.log(`Discovered ${discovered.length} additional pages from HTML`);
    } catch (error) {
      console.error("Error discovering pages from HTML:", error);
    }
    return discovered;
  }
}
|
|
234
|
+
/**
 * Convenience wrapper: build a scraper, run it, and — when an output
 * directory is configured — persist the downloaded pages to disk.
 * Set `options.useLlms` to drive the page list from the site's llms.txt.
 */
async function scrapeMarkdownDocs(options) {
  const scraper = new MarkdownDocsScraper(options);
  let result;
  if (options.useLlms) {
    result = await scraper.scrapeFromLlms();
  } else {
    result = await scraper.scrape();
  }
  if (options.outputDir) {
    await scraper.savePages(result.downloaded);
  }
  return result;
}
|
|
242
|
+
// Public API: the class and the helper as named exports, with the class
// doubling as the module's default export.
var src_default = MarkdownDocsScraper;
export {
  scrapeMarkdownDocs,
  src_default as default,
  MarkdownDocsScraper
};
|
package/package.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@ebowwa/markdown-docs-scraper",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Scrape and mirror markdown-based documentation sites",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./dist/index.js",
|
|
7
|
+
"types": "./dist/index.d.ts",
|
|
8
|
+
"bin": {
|
|
9
|
+
"markdown-docs-scraper": "./dist/cli.js"
|
|
10
|
+
},
|
|
11
|
+
"exports": {
  ".": {
    "types": "./dist/index.d.ts",
    "import": "./dist/index.js"
  },
  "./cli": {
    "types": "./dist/cli.d.ts",
    "import": "./dist/cli.js"
  }
},
|
|
21
|
+
"scripts": {
|
|
22
|
+
"build": "bun build src/index.ts --outdir dist --target node && bun build src/cli.ts --outdir dist --target node",
|
|
23
|
+
"dev": "bun run src/cli.ts",
|
|
24
|
+
"test": "bun test",
|
|
25
|
+
"prepublishOnly": "bun run build"
|
|
26
|
+
},
|
|
27
|
+
"dependencies": {
|
|
28
|
+
"commander": "^12.0.0"
|
|
29
|
+
},
|
|
30
|
+
"devDependencies": {
|
|
31
|
+
"@types/node": "^20.11.0",
|
|
32
|
+
"typescript": "^5.3.3"
|
|
33
|
+
},
|
|
34
|
+
"engines": {
|
|
35
|
+
"node": ">=20.0.0"
|
|
36
|
+
},
|
|
37
|
+
"keywords": [
|
|
38
|
+
"markdown",
|
|
39
|
+
"documentation",
|
|
40
|
+
"scraper",
|
|
41
|
+
"mirror",
|
|
42
|
+
"cli"
|
|
43
|
+
],
|
|
44
|
+
"author": "Ebowwa",
|
|
45
|
+
"license": "MIT",
|
|
46
|
+
"repository": {
|
|
47
|
+
"type": "git",
|
|
48
|
+
"url": "https://github.com/ebowwa/codespaces",
|
|
49
|
+
"directory": "packages/src/markdown-docs-scraper"
|
|
50
|
+
}
|
|
51
|
+
}
|
package/src/cli.ts
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* CLI for @ebowwa/markdown-docs-scraper
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { program } from "commander";
|
|
7
|
+
import { scrapeMarkdownDocs, MarkdownDocsScraper, type ScraperOptions } from "./index.js";
|
|
8
|
+
|
|
9
|
+
program
|
|
10
|
+
.name("markdown-docs-scraper")
|
|
11
|
+
.description("Scrape and mirror markdown-based documentation sites")
|
|
12
|
+
.version("1.0.0");
|
|
13
|
+
|
|
14
|
+
program
|
|
15
|
+
.command("scrape")
|
|
16
|
+
.description("Scrape documentation from a URL")
|
|
17
|
+
.requiredOption("-u, --url <url>", "Base URL of the documentation site")
|
|
18
|
+
.option("-o, --output <dir>", "Output directory", "./docs")
|
|
19
|
+
.option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en")
|
|
20
|
+
.option("-c, --concurrency <num>", "Concurrency level", "5")
|
|
21
|
+
.option("--discover", "Discover pages before scraping", false)
|
|
22
|
+
.action(async (options) => {
|
|
23
|
+
const scraperOptions: ScraperOptions = {
|
|
24
|
+
baseUrl: options.url,
|
|
25
|
+
docsPath: options.docsPath,
|
|
26
|
+
outputDir: options.output,
|
|
27
|
+
concurrency: parseInt(options.concurrency),
|
|
28
|
+
};
|
|
29
|
+
|
|
30
|
+
console.log(`🔍 Scraping ${options.url}...`);
|
|
31
|
+
console.log(`📁 Output: ${options.output}`);
|
|
32
|
+
console.log();
|
|
33
|
+
|
|
34
|
+
const result = await scrapeMarkdownDocs(scraperOptions);
|
|
35
|
+
|
|
36
|
+
console.log();
|
|
37
|
+
console.log("Summary:");
|
|
38
|
+
console.log(` Downloaded: ${result.downloaded.length}`);
|
|
39
|
+
console.log(` Failed: ${result.failed.length}`);
|
|
40
|
+
console.log(` Duration: ${(result.duration / 1000).toFixed(2)}s`);
|
|
41
|
+
|
|
42
|
+
if (result.failed.length > 0) {
|
|
43
|
+
console.log();
|
|
44
|
+
console.log("Failed pages:");
|
|
45
|
+
result.failed.slice(0, 10).forEach((f) => {
|
|
46
|
+
console.log(` ❌ ${f.url}: ${f.error}`);
|
|
47
|
+
});
|
|
48
|
+
if (result.failed.length > 10) {
|
|
49
|
+
console.log(` ... and ${result.failed.length - 10} more`);
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
});
|
|
53
|
+
|
|
54
|
+
program
|
|
55
|
+
.command("discover")
|
|
56
|
+
.description("Discover all available documentation pages")
|
|
57
|
+
.requiredOption("-u, --url <url>", "Base URL of the documentation site")
|
|
58
|
+
.option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en")
|
|
59
|
+
.action(async (options) => {
|
|
60
|
+
const scraper = new MarkdownDocsScraper({
|
|
61
|
+
baseUrl: options.url,
|
|
62
|
+
docsPath: options.docsPath,
|
|
63
|
+
});
|
|
64
|
+
|
|
65
|
+
console.log(`🔍 Discovering pages from ${options.url}...`);
|
|
66
|
+
const pages = await scraper.discoverPages();
|
|
67
|
+
|
|
68
|
+
console.log(`\nFound ${pages.length} pages:\n`);
|
|
69
|
+
pages.forEach((page) => {
|
|
70
|
+
console.log(` - ${page}`);
|
|
71
|
+
});
|
|
72
|
+
});
|
|
73
|
+
|
|
74
|
+
program
|
|
75
|
+
.command("anthropic")
|
|
76
|
+
.description("Quick scrape of Anthropic Claude Code docs (uses llms.txt)")
|
|
77
|
+
.option("-o, --output <dir>", "Output directory", "./docs")
|
|
78
|
+
.action(async (options) => {
|
|
79
|
+
const scraperOptions: ScraperOptions = {
|
|
80
|
+
baseUrl: "https://code.claude.com",
|
|
81
|
+
docsPath: "/docs/en",
|
|
82
|
+
outputDir: options.output,
|
|
83
|
+
};
|
|
84
|
+
|
|
85
|
+
console.log("🔍 Scraping Anthropic Claude Code documentation...");
|
|
86
|
+
console.log("📋 Using llms.txt index for complete page list");
|
|
87
|
+
console.log(`📁 Output: ${options.output}`);
|
|
88
|
+
console.log();
|
|
89
|
+
|
|
90
|
+
const result = await scrapeMarkdownDocs({ ...scraperOptions, useLlms: true });
|
|
91
|
+
|
|
92
|
+
console.log();
|
|
93
|
+
console.log("Summary:");
|
|
94
|
+
console.log(` Downloaded: ${result.downloaded.length}`);
|
|
95
|
+
console.log(` Failed: ${result.failed.length}`);
|
|
96
|
+
console.log(` Duration: ${(result.duration / 1000).toFixed(2)}s`);
|
|
97
|
+
});
|
|
98
|
+
|
|
99
|
+
program.parse();
|