@ebowwa/markdown-docs-scraper 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -0
- package/dist/cli.js +18 -13
- package/dist/index.js +142 -97
- package/package.json +1 -1
- package/src/cli.ts +10 -1
- package/src/index.ts +215 -150
package/README.md
CHANGED
|
@@ -52,6 +52,26 @@ Options:
|
|
|
52
52
|
-o, --output <dir> Output directory (default: "./docs")
|
|
53
53
|
--docs-path <path> Docs path (default: "/docs/en")
|
|
54
54
|
-c, --concurrency <num> Concurrency level (default: "5")
|
|
55
|
+
--llms-paths <paths> Comma-separated llms.txt paths (default: "/llms.txt,/docs/llms.txt")
|
|
56
|
+
--no-subdomain Disable docs/doc subdomain fallback
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### llms.txt Discovery
|
|
60
|
+
|
|
61
|
+
The scraper automatically tries multiple paths to find `llms.txt`:
|
|
62
|
+
|
|
63
|
+
1. **Configured paths** (default: `/llms.txt`, `/docs/llms.txt`)
|
|
64
|
+
2. **Docs subdomain** (e.g., `https://docs.example.com/llms.txt`)
|
|
65
|
+
3. **Doc subdomain** (e.g., `https://doc.example.com/llms.txt`)
|
|
66
|
+
|
|
67
|
+
Example with custom paths:
|
|
68
|
+
```bash
|
|
69
|
+
markdown-docs-scraper scrape -u https://example.com --llms-paths "/llms.txt,/api/llms.txt"
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Disable subdomain fallback:
|
|
73
|
+
```bash
|
|
74
|
+
markdown-docs-scraper scrape -u https://example.com --no-subdomain
|
|
55
75
|
```
|
|
56
76
|
|
|
57
77
|
## Programmatic Usage
|
|
@@ -103,6 +123,8 @@ interface ScraperOptions {
|
|
|
103
123
|
outputDir?: string; // Output directory (default: "./docs")
|
|
104
124
|
concurrency?: number; // Concurrent downloads (default: 5)
|
|
105
125
|
onProgress?: (current: number, total: number) => void;
|
|
126
|
+
llmsPaths?: string[]; // llms.txt paths to try (default: ["/llms.txt", "/docs/llms.txt"])
|
|
127
|
+
tryDocsSubdomain?: boolean; // Also try docs/doc subdomains (default: true)
|
|
106
128
|
}
|
|
107
129
|
```
|
|
108
130
|
|
package/dist/cli.js
CHANGED
|
@@ -20,7 +20,7 @@ var __toESM = (mod, isNodeMode, target) => {
|
|
|
20
20
|
var __commonJS = (cb, mod) => () => (mod || cb((mod = { exports: {} }).exports, mod), mod.exports);
|
|
21
21
|
var __require = /* @__PURE__ */ createRequire(import.meta.url);
|
|
22
22
|
|
|
23
|
-
// node_modules/commander/lib/error.js
|
|
23
|
+
// ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/error.js
|
|
24
24
|
var require_error = __commonJS((exports) => {
|
|
25
25
|
class CommanderError extends Error {
|
|
26
26
|
constructor(exitCode, code, message) {
|
|
@@ -44,7 +44,7 @@ var require_error = __commonJS((exports) => {
|
|
|
44
44
|
exports.InvalidArgumentError = InvalidArgumentError;
|
|
45
45
|
});
|
|
46
46
|
|
|
47
|
-
// node_modules/commander/lib/argument.js
|
|
47
|
+
// ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/argument.js
|
|
48
48
|
var require_argument = __commonJS((exports) => {
|
|
49
49
|
var { InvalidArgumentError } = require_error();
|
|
50
50
|
|
|
@@ -123,7 +123,7 @@ var require_argument = __commonJS((exports) => {
|
|
|
123
123
|
exports.humanReadableArgName = humanReadableArgName;
|
|
124
124
|
});
|
|
125
125
|
|
|
126
|
-
// node_modules/commander/lib/help.js
|
|
126
|
+
// ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/help.js
|
|
127
127
|
var require_help = __commonJS((exports) => {
|
|
128
128
|
var { humanReadableArgName } = require_argument();
|
|
129
129
|
|
|
@@ -372,7 +372,7 @@ var require_help = __commonJS((exports) => {
|
|
|
372
372
|
exports.Help = Help;
|
|
373
373
|
});
|
|
374
374
|
|
|
375
|
-
// node_modules/commander/lib/option.js
|
|
375
|
+
// ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/option.js
|
|
376
376
|
var require_option = __commonJS((exports) => {
|
|
377
377
|
var { InvalidArgumentError } = require_error();
|
|
378
378
|
|
|
@@ -523,7 +523,7 @@ var require_option = __commonJS((exports) => {
|
|
|
523
523
|
exports.DualOptions = DualOptions;
|
|
524
524
|
});
|
|
525
525
|
|
|
526
|
-
// node_modules/commander/lib/suggestSimilar.js
|
|
526
|
+
// ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/suggestSimilar.js
|
|
527
527
|
var require_suggestSimilar = __commonJS((exports) => {
|
|
528
528
|
var maxDistance = 3;
|
|
529
529
|
function editDistance(a, b) {
|
|
@@ -596,7 +596,7 @@ var require_suggestSimilar = __commonJS((exports) => {
|
|
|
596
596
|
exports.suggestSimilar = suggestSimilar;
|
|
597
597
|
});
|
|
598
598
|
|
|
599
|
-
// node_modules/commander/lib/command.js
|
|
599
|
+
// ../../node_modules/.bun/commander@12.1.0/node_modules/commander/lib/command.js
|
|
600
600
|
var require_command = __commonJS((exports) => {
|
|
601
601
|
var EventEmitter = __require("node:events").EventEmitter;
|
|
602
602
|
var childProcess = __require("node:child_process");
|
|
@@ -1839,7 +1839,7 @@ Expecting one of '${allowedValues.join("', '")}'`);
|
|
|
1839
1839
|
exports.Command = Command;
|
|
1840
1840
|
});
|
|
1841
1841
|
|
|
1842
|
-
// node_modules/commander/index.js
|
|
1842
|
+
// ../../node_modules/.bun/commander@12.1.0/node_modules/commander/index.js
|
|
1843
1843
|
var require_commander = __commonJS((exports) => {
|
|
1844
1844
|
var { Argument } = require_argument();
|
|
1845
1845
|
var { Command } = require_command();
|
|
@@ -2377,7 +2377,7 @@ Downloaded: `).concat(new Date().toISOString(), `
|
|
|
2377
2377
|
exports.default = MarkdownDocsScraper;
|
|
2378
2378
|
});
|
|
2379
2379
|
|
|
2380
|
-
// node_modules/commander/esm.mjs
|
|
2380
|
+
// ../../node_modules/.bun/commander@12.1.0/node_modules/commander/esm.mjs
|
|
2381
2381
|
var import__ = __toESM(require_commander(), 1);
|
|
2382
2382
|
var {
|
|
2383
2383
|
program,
|
|
@@ -2396,12 +2396,14 @@ var {
|
|
|
2396
2396
|
// src/cli.ts
|
|
2397
2397
|
var import__2 = __toESM(require_src(), 1);
|
|
2398
2398
|
program.name("markdown-docs-scraper").description("Scrape and mirror markdown-based documentation sites").version("1.0.0");
|
|
2399
|
-
program.command("scrape").description("Scrape documentation from a URL").requiredOption("-u, --url <url>", "Base URL of the documentation site").option("-o, --output <dir>", "Output directory", "./docs").option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en").option("-c, --concurrency <num>", "Concurrency level", "5").option("--discover", "Discover pages before scraping", false).action(async (options) => {
|
|
2399
|
+
program.command("scrape").description("Scrape documentation from a URL").requiredOption("-u, --url <url>", "Base URL of the documentation site").option("-o, --output <dir>", "Output directory", "./docs").option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en").option("-c, --concurrency <num>", "Concurrency level", "5").option("--discover", "Discover pages before scraping", false).option("--llms-paths <paths>", "Comma-separated llms.txt paths to try", "/llms.txt,/docs/llms.txt").option("--no-subdomain", "Disable docs/doc subdomain fallback", false).action(async (options) => {
|
|
2400
2400
|
const scraperOptions = {
|
|
2401
2401
|
baseUrl: options.url,
|
|
2402
2402
|
docsPath: options.docsPath,
|
|
2403
2403
|
outputDir: options.output,
|
|
2404
|
-
concurrency: parseInt(options.concurrency)
|
|
2404
|
+
concurrency: parseInt(options.concurrency),
|
|
2405
|
+
llmsPaths: options.llmsPaths.split(","),
|
|
2406
|
+
tryDocsSubdomain: !options.noSubdomain
|
|
2405
2407
|
};
|
|
2406
2408
|
console.log(`\uD83D\uDD0D Scraping ${options.url}...`);
|
|
2407
2409
|
console.log(`\uD83D\uDCC1 Output: ${options.output}`);
|
|
@@ -2423,10 +2425,12 @@ program.command("scrape").description("Scrape documentation from a URL").require
|
|
|
2423
2425
|
}
|
|
2424
2426
|
}
|
|
2425
2427
|
});
|
|
2426
|
-
program.command("discover").description("Discover all available documentation pages").requiredOption("-u, --url <url>", "Base URL of the documentation site").option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en").action(async (options) => {
|
|
2428
|
+
program.command("discover").description("Discover all available documentation pages").requiredOption("-u, --url <url>", "Base URL of the documentation site").option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en").option("--llms-paths <paths>", "Comma-separated llms.txt paths to try", "/llms.txt,/docs/llms.txt").option("--no-subdomain", "Disable docs/doc subdomain fallback", false).action(async (options) => {
|
|
2427
2429
|
const scraper = new import__2.MarkdownDocsScraper({
|
|
2428
2430
|
baseUrl: options.url,
|
|
2429
|
-
docsPath: options.docsPath
|
|
2431
|
+
docsPath: options.docsPath,
|
|
2432
|
+
llmsPaths: options.llmsPaths.split(","),
|
|
2433
|
+
tryDocsSubdomain: !options.noSubdomain
|
|
2430
2434
|
});
|
|
2431
2435
|
console.log(`\uD83D\uDD0D Discovering pages from ${options.url}...`);
|
|
2432
2436
|
const pages = await scraper.discoverPages();
|
|
@@ -2434,7 +2438,8 @@ program.command("discover").description("Discover all available documentation pa
|
|
|
2434
2438
|
Found ${pages.length} pages:
|
|
2435
2439
|
`);
|
|
2436
2440
|
pages.forEach((page) => {
|
|
2437
|
-
|
|
2441
|
+
const path = page.category ? `${page.category}/${page.page}` : page.page;
|
|
2442
|
+
console.log(` - ${path}`);
|
|
2438
2443
|
});
|
|
2439
2444
|
});
|
|
2440
2445
|
program.command("anthropic").description("Quick scrape of Anthropic Claude Code docs (uses llms.txt)").option("-o, --output <dir>", "Output directory", "./docs").action(async (options) => {
|
package/dist/index.js
CHANGED
|
@@ -18,6 +18,43 @@ var __toESM = (mod, isNodeMode, target) => {
|
|
|
18
18
|
var __require = /* @__PURE__ */ createRequire(import.meta.url);
|
|
19
19
|
|
|
20
20
|
// src/index.ts
|
|
21
|
+
var GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/([^)]+\.md))\)/g;
|
|
22
|
+
function extractTitle(markdown) {
|
|
23
|
+
const titleMatch = markdown.match(/^#\s+(.+)$/m);
|
|
24
|
+
return titleMatch ? titleMatch[1].trim() : "Untitled";
|
|
25
|
+
}
|
|
26
|
+
function parsePagePath(pagePath) {
|
|
27
|
+
const pageName = pagePath.replace(".md", "");
|
|
28
|
+
const pathParts = pageName.split("/");
|
|
29
|
+
if (pathParts.length === 1) {
|
|
30
|
+
return { category: "", page: pathParts[0] };
|
|
31
|
+
} else if (pathParts.length === 2) {
|
|
32
|
+
return { category: pathParts[0], page: pathParts[1] };
|
|
33
|
+
} else {
|
|
34
|
+
return {
|
|
35
|
+
category: pathParts.slice(0, -1).join("/"),
|
|
36
|
+
page: pathParts[pathParts.length - 1]
|
|
37
|
+
};
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
async function fetchMarkdown(url, userAgent = "@ebowwa/markdown-docs-scraper") {
|
|
41
|
+
try {
|
|
42
|
+
const response = await fetch(url, {
|
|
43
|
+
headers: {
|
|
44
|
+
Accept: "text/markdown, text/plain",
|
|
45
|
+
"User-Agent": userAgent
|
|
46
|
+
}
|
|
47
|
+
});
|
|
48
|
+
if (!response.ok) {
|
|
49
|
+
return null;
|
|
50
|
+
}
|
|
51
|
+
return await response.text();
|
|
52
|
+
} catch (error) {
|
|
53
|
+
console.error(`Error fetching ${url}:`, error);
|
|
54
|
+
return null;
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
|
|
21
58
|
class MarkdownDocsScraper {
|
|
22
59
|
options;
|
|
23
60
|
constructor(options) {
|
|
@@ -27,56 +64,99 @@ class MarkdownDocsScraper {
|
|
|
27
64
|
categories: options.categories || {},
|
|
28
65
|
outputDir: options.outputDir || "./docs",
|
|
29
66
|
concurrency: options.concurrency || 5,
|
|
30
|
-
onProgress: options.onProgress || (() => {})
|
|
67
|
+
onProgress: options.onProgress || (() => {}),
|
|
68
|
+
llmsPaths: options.llmsPaths || ["/llms.txt", "/docs/llms.txt"],
|
|
69
|
+
tryDocsSubdomain: options.tryDocsSubdomain ?? true,
|
|
70
|
+
linkPattern: options.linkPattern || GENERIC_LINK_PATTERN
|
|
31
71
|
};
|
|
32
72
|
}
|
|
33
|
-
async fetchMarkdown(url) {
|
|
34
|
-
try {
|
|
35
|
-
const response = await fetch(url, {
|
|
36
|
-
headers: {
|
|
37
|
-
Accept: "text/markdown, text/plain",
|
|
38
|
-
"User-Agent": "@ebowwa/markdown-docs-scraper"
|
|
39
|
-
}
|
|
40
|
-
});
|
|
41
|
-
if (!response.ok) {
|
|
42
|
-
return null;
|
|
43
|
-
}
|
|
44
|
-
const contentType = response.headers.get("content-type") || "";
|
|
45
|
-
if (!contentType.includes("markdown") && !contentType.includes("text/plain")) {}
|
|
46
|
-
return await response.text();
|
|
47
|
-
} catch (error) {
|
|
48
|
-
console.error(`Error fetching ${url}:`, error);
|
|
49
|
-
return null;
|
|
50
|
-
}
|
|
51
|
-
}
|
|
52
|
-
extractTitle(markdown) {
|
|
53
|
-
const titleMatch = markdown.match(/^#\s+(.+)$/m);
|
|
54
|
-
return titleMatch ? titleMatch[1].trim() : "Untitled";
|
|
55
|
-
}
|
|
56
|
-
sanitizeFilename(path) {
|
|
57
|
-
return path.toLowerCase().replace(/[^a-z0-9/]+/g, "-").replace(/^-|-$/g, "").replace(/\//g, "/");
|
|
58
|
-
}
|
|
59
73
|
buildUrl(category, page) {
|
|
60
74
|
if (category) {
|
|
61
75
|
return `${this.options.baseUrl}${this.options.docsPath}/${category}/${page}.md`;
|
|
62
|
-
} else {
|
|
76
|
+
} else if (this.options.docsPath) {
|
|
63
77
|
return `${this.options.baseUrl}${this.options.docsPath}/${page}.md`;
|
|
78
|
+
} else {
|
|
79
|
+
return `${this.options.baseUrl}/${page}.md`;
|
|
64
80
|
}
|
|
65
81
|
}
|
|
66
82
|
async downloadPage(category, page) {
|
|
67
83
|
const url = this.buildUrl(category, page);
|
|
68
|
-
const content = await this.fetchMarkdown(url);
|
|
84
|
+
const content = await fetchMarkdown(url);
|
|
69
85
|
if (!content) {
|
|
70
86
|
return null;
|
|
71
87
|
}
|
|
72
88
|
return {
|
|
73
89
|
url,
|
|
74
|
-
title: this.extractTitle(content),
|
|
90
|
+
title: extractTitle(content),
|
|
75
91
|
content,
|
|
76
92
|
category,
|
|
77
93
|
pageName: page
|
|
78
94
|
};
|
|
79
95
|
}
|
|
96
|
+
getLlmsUrls() {
|
|
97
|
+
const urls = [];
|
|
98
|
+
const baseUrl = this.options.baseUrl;
|
|
99
|
+
for (const path of this.options.llmsPaths) {
|
|
100
|
+
urls.push(`${baseUrl}${path}`);
|
|
101
|
+
}
|
|
102
|
+
if (this.options.tryDocsSubdomain) {
|
|
103
|
+
try {
|
|
104
|
+
const url = new URL(baseUrl);
|
|
105
|
+
const hostname = url.hostname;
|
|
106
|
+
if (!hostname.startsWith("docs.") && !hostname.startsWith("doc.")) {
|
|
107
|
+
const docsDomain = hostname.replace(/^www\./, "");
|
|
108
|
+
urls.push(`${url.protocol}//docs.${docsDomain}/llms.txt`);
|
|
109
|
+
urls.push(`${url.protocol}//docs.${docsDomain}/docs/llms.txt`);
|
|
110
|
+
}
|
|
111
|
+
} catch {}
|
|
112
|
+
}
|
|
113
|
+
return urls;
|
|
114
|
+
}
|
|
115
|
+
async fetchLlmsTxt() {
|
|
116
|
+
const urls = this.getLlmsUrls();
|
|
117
|
+
for (const llmsUrl of urls) {
|
|
118
|
+
try {
|
|
119
|
+
const response = await fetch(llmsUrl, {
|
|
120
|
+
headers: {
|
|
121
|
+
Accept: "text/plain",
|
|
122
|
+
"User-Agent": "@ebowwa/markdown-docs-scraper"
|
|
123
|
+
}
|
|
124
|
+
});
|
|
125
|
+
if (response.ok) {
|
|
126
|
+
const content = await response.text();
|
|
127
|
+
console.log(`Found llms.txt at ${llmsUrl}`);
|
|
128
|
+
return { content, url: llmsUrl };
|
|
129
|
+
}
|
|
130
|
+
} catch (error) {
|
|
131
|
+
continue;
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
return null;
|
|
135
|
+
}
|
|
136
|
+
async discoverPages() {
|
|
137
|
+
const pages = [];
|
|
138
|
+
try {
|
|
139
|
+
const llmsResult = await this.fetchLlmsTxt();
|
|
140
|
+
if (!llmsResult) {
|
|
141
|
+
const attemptedUrls = this.getLlmsUrls();
|
|
142
|
+
console.warn(`Could not fetch llms.txt from any of: ${attemptedUrls.join(", ")}`);
|
|
143
|
+
return pages;
|
|
144
|
+
}
|
|
145
|
+
const { content } = llmsResult;
|
|
146
|
+
const regex = new RegExp(this.options.linkPattern.source, this.options.linkPattern.flags);
|
|
147
|
+
let match;
|
|
148
|
+
while ((match = regex.exec(content)) !== null) {
|
|
149
|
+
const url = match[2];
|
|
150
|
+
const pagePath = match[3];
|
|
151
|
+
const { category, page } = parsePagePath(pagePath);
|
|
152
|
+
pages.push({ category, page });
|
|
153
|
+
}
|
|
154
|
+
console.log(`Discovered ${pages.length} pages from llms.txt`);
|
|
155
|
+
} catch (error) {
|
|
156
|
+
console.error("Error discovering pages:", error);
|
|
157
|
+
}
|
|
158
|
+
return pages;
|
|
159
|
+
}
|
|
80
160
|
async scrapeFromLlms() {
|
|
81
161
|
const startTime = Date.now();
|
|
82
162
|
const downloaded = [];
|
|
@@ -164,72 +244,6 @@ Downloaded: ${new Date().toISOString()}
|
|
|
164
244
|
}
|
|
165
245
|
return pages;
|
|
166
246
|
}
|
|
167
|
-
async discoverPages() {
|
|
168
|
-
const pages = [];
|
|
169
|
-
try {
|
|
170
|
-
const llmsUrl = `${this.options.baseUrl}/docs/llms.txt`;
|
|
171
|
-
const response = await fetch(llmsUrl, {
|
|
172
|
-
headers: {
|
|
173
|
-
Accept: "text/plain",
|
|
174
|
-
"User-Agent": "@ebowwa/markdown-docs-scraper"
|
|
175
|
-
}
|
|
176
|
-
});
|
|
177
|
-
if (!response.ok) {
|
|
178
|
-
console.warn(`Could not fetch llms.txt from ${llmsUrl}`);
|
|
179
|
-
return pages;
|
|
180
|
-
}
|
|
181
|
-
const content = await response.text();
|
|
182
|
-
const linkRegex = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
|
|
183
|
-
let match;
|
|
184
|
-
while ((match = linkRegex.exec(content)) !== null) {
|
|
185
|
-
const url = match[2];
|
|
186
|
-
const pagePath = match[3];
|
|
187
|
-
const pageName = pagePath.replace(".md", "");
|
|
188
|
-
const pathParts = pageName.split("/");
|
|
189
|
-
if (pathParts.length === 1) {
|
|
190
|
-
pages.push({ category: "", page: pathParts[0] });
|
|
191
|
-
} else if (pathParts.length === 2) {
|
|
192
|
-
pages.push({ category: pathParts[0], page: pathParts[1] });
|
|
193
|
-
} else {
|
|
194
|
-
const category = pathParts.slice(0, -1).join("/");
|
|
195
|
-
const page = pathParts[pathParts.length - 1];
|
|
196
|
-
pages.push({ category, page });
|
|
197
|
-
}
|
|
198
|
-
}
|
|
199
|
-
console.log(`Discovered ${pages.length} pages from llms.txt`);
|
|
200
|
-
} catch (error) {
|
|
201
|
-
console.error("Error discovering pages:", error);
|
|
202
|
-
}
|
|
203
|
-
return pages;
|
|
204
|
-
}
|
|
205
|
-
async discoverPagesHtml() {
|
|
206
|
-
const discovered = [];
|
|
207
|
-
try {
|
|
208
|
-
const indexUrl = `${this.options.baseUrl}${this.options.docsPath}`;
|
|
209
|
-
const response = await fetch(indexUrl, {
|
|
210
|
-
headers: {
|
|
211
|
-
Accept: "text/html",
|
|
212
|
-
"User-Agent": "@ebowwa/markdown-docs-scraper"
|
|
213
|
-
}
|
|
214
|
-
});
|
|
215
|
-
if (!response.ok) {
|
|
216
|
-
return discovered;
|
|
217
|
-
}
|
|
218
|
-
const html = await response.text();
|
|
219
|
-
const mdLinkRegex = /href="\/docs\/en\/([^"]+\.md)"/g;
|
|
220
|
-
let match;
|
|
221
|
-
while ((match = mdLinkRegex.exec(html)) !== null) {
|
|
222
|
-
const path = match[1];
|
|
223
|
-
if (!discovered.includes(path)) {
|
|
224
|
-
discovered.push(path);
|
|
225
|
-
}
|
|
226
|
-
}
|
|
227
|
-
console.log(`Discovered ${discovered.length} additional pages from HTML`);
|
|
228
|
-
} catch (error) {
|
|
229
|
-
console.error("Error discovering pages from HTML:", error);
|
|
230
|
-
}
|
|
231
|
-
return discovered;
|
|
232
|
-
}
|
|
233
247
|
}
|
|
234
248
|
async function scrapeMarkdownDocs(options) {
|
|
235
249
|
const scraper = new MarkdownDocsScraper(options);
|
|
@@ -239,9 +253,40 @@ async function scrapeMarkdownDocs(options) {
|
|
|
239
253
|
}
|
|
240
254
|
return result;
|
|
241
255
|
}
|
|
256
|
+
var CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
|
|
257
|
+
var GENERIC_PATTERN = GENERIC_LINK_PATTERN;
|
|
258
|
+
function claudeCodeOptions(outputDir) {
|
|
259
|
+
return {
|
|
260
|
+
baseUrl: "https://code.claude.com",
|
|
261
|
+
docsPath: "/docs/en",
|
|
262
|
+
llmsPaths: ["/docs/llms.txt"],
|
|
263
|
+
linkPattern: CLAUDE_CODE_PATTERN,
|
|
264
|
+
outputDir,
|
|
265
|
+
concurrency: 10,
|
|
266
|
+
tryDocsSubdomain: false
|
|
267
|
+
};
|
|
268
|
+
}
|
|
269
|
+
function polymarketOptions(outputDir) {
|
|
270
|
+
return {
|
|
271
|
+
baseUrl: "https://docs.polymarket.com",
|
|
272
|
+
docsPath: "",
|
|
273
|
+
llmsPaths: ["/llms.txt"],
|
|
274
|
+
linkPattern: GENERIC_PATTERN,
|
|
275
|
+
outputDir,
|
|
276
|
+
concurrency: 10,
|
|
277
|
+
tryDocsSubdomain: false
|
|
278
|
+
};
|
|
279
|
+
}
|
|
242
280
|
var src_default = MarkdownDocsScraper;
|
|
243
281
|
export {
|
|
244
282
|
scrapeMarkdownDocs,
|
|
283
|
+
polymarketOptions,
|
|
284
|
+
parsePagePath,
|
|
285
|
+
fetchMarkdown,
|
|
286
|
+
extractTitle,
|
|
245
287
|
src_default as default,
|
|
246
|
-
|
|
288
|
+
claudeCodeOptions,
|
|
289
|
+
MarkdownDocsScraper,
|
|
290
|
+
GENERIC_PATTERN,
|
|
291
|
+
CLAUDE_CODE_PATTERN
|
|
247
292
|
};
|
package/package.json
CHANGED
package/src/cli.ts
CHANGED
|
@@ -19,12 +19,16 @@ program
|
|
|
19
19
|
.option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en")
|
|
20
20
|
.option("-c, --concurrency <num>", "Concurrency level", "5")
|
|
21
21
|
.option("--discover", "Discover pages before scraping", false)
|
|
22
|
+
.option("--llms-paths <paths>", "Comma-separated llms.txt paths to try", "/llms.txt,/docs/llms.txt")
|
|
23
|
+
.option("--no-subdomain", "Disable docs/doc subdomain fallback", false)
|
|
22
24
|
.action(async (options) => {
|
|
23
25
|
const scraperOptions: ScraperOptions = {
|
|
24
26
|
baseUrl: options.url,
|
|
25
27
|
docsPath: options.docsPath,
|
|
26
28
|
outputDir: options.output,
|
|
27
29
|
concurrency: parseInt(options.concurrency),
|
|
30
|
+
llmsPaths: options.llmsPaths.split(","),
|
|
31
|
+
tryDocsSubdomain: !options.noSubdomain,
|
|
28
32
|
};
|
|
29
33
|
|
|
30
34
|
console.log(`🔍 Scraping ${options.url}...`);
|
|
@@ -56,10 +60,14 @@ program
|
|
|
56
60
|
.description("Discover all available documentation pages")
|
|
57
61
|
.requiredOption("-u, --url <url>", "Base URL of the documentation site")
|
|
58
62
|
.option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en")
|
|
63
|
+
.option("--llms-paths <paths>", "Comma-separated llms.txt paths to try", "/llms.txt,/docs/llms.txt")
|
|
64
|
+
.option("--no-subdomain", "Disable docs/doc subdomain fallback", false)
|
|
59
65
|
.action(async (options) => {
|
|
60
66
|
const scraper = new MarkdownDocsScraper({
|
|
61
67
|
baseUrl: options.url,
|
|
62
68
|
docsPath: options.docsPath,
|
|
69
|
+
llmsPaths: options.llmsPaths.split(","),
|
|
70
|
+
tryDocsSubdomain: !options.noSubdomain,
|
|
63
71
|
});
|
|
64
72
|
|
|
65
73
|
console.log(`🔍 Discovering pages from ${options.url}...`);
|
|
@@ -67,7 +75,8 @@ program
|
|
|
67
75
|
|
|
68
76
|
console.log(`\nFound ${pages.length} pages:\n`);
|
|
69
77
|
pages.forEach((page) => {
|
|
70
|
-
|
|
78
|
+
const path = page.category ? `${page.category}/${page.page}` : page.page;
|
|
79
|
+
console.log(` - ${path}`);
|
|
71
80
|
});
|
|
72
81
|
});
|
|
73
82
|
|
package/src/index.ts
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* @ebowwa/markdown-docs-scraper
|
|
3
3
|
*
|
|
4
|
-
*
|
|
4
|
+
* Composable markdown documentation scraper.
|
|
5
|
+
* - Configurable llms.txt paths with fallbacks
|
|
6
|
+
* - Custom URL patterns for different doc sites
|
|
7
|
+
* - Works with any markdown documentation site
|
|
5
8
|
*/
|
|
6
9
|
|
|
7
10
|
// ============================================================================
|
|
@@ -23,6 +26,12 @@ export interface ScraperOptions {
|
|
|
23
26
|
outputDir?: string;
|
|
24
27
|
concurrency?: number;
|
|
25
28
|
onProgress?: (current: number, total: number) => void;
|
|
29
|
+
/** Custom llms.txt paths to try (default: ["/llms.txt", "/docs/llms.txt"]) */
|
|
30
|
+
llmsPaths?: string[];
|
|
31
|
+
/** Also try docs subdomain variants (e.g., docs.example.com) */
|
|
32
|
+
tryDocsSubdomain?: boolean;
|
|
33
|
+
/** Custom regex pattern to extract pages from llms.txt (must have 3 capture groups: title, fullUrl, path) */
|
|
34
|
+
linkPattern?: RegExp;
|
|
26
35
|
}
|
|
27
36
|
|
|
28
37
|
export interface ScraperResult {
|
|
@@ -31,8 +40,66 @@ export interface ScraperResult {
|
|
|
31
40
|
duration: number;
|
|
32
41
|
}
|
|
33
42
|
|
|
43
|
+
/** Default pattern: matches /docs/en/ or /docs/ paths */
|
|
44
|
+
const DEFAULT_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/(?:en\/)?([^)]+\.md))\)/g;
|
|
45
|
+
|
|
46
|
+
/** Generic pattern: matches any .md links in llms.txt */
|
|
47
|
+
const GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/([^)]+\.md))\)/g;
|
|
48
|
+
|
|
34
49
|
// ============================================================================
|
|
35
|
-
//
|
|
50
|
+
// UTILITY FUNCTIONS (Composable)
|
|
51
|
+
// ============================================================================
|
|
52
|
+
|
|
53
|
+
/** Extract title from markdown content */
|
|
54
|
+
export function extractTitle(markdown: string): string {
|
|
55
|
+
const titleMatch = markdown.match(/^#\s+(.+)$/m);
|
|
56
|
+
return titleMatch ? titleMatch[1].trim() : "Untitled";
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
/** Parse page path into category and page name */
|
|
60
|
+
export function parsePagePath(pagePath: string): { category: string; page: string } {
|
|
61
|
+
// Remove .md extension
|
|
62
|
+
const pageName = pagePath.replace(".md", "");
|
|
63
|
+
|
|
64
|
+
// Check if there's a category in the path
|
|
65
|
+
const pathParts = pageName.split("/");
|
|
66
|
+
|
|
67
|
+
if (pathParts.length === 1) {
|
|
68
|
+
return { category: "", page: pathParts[0] };
|
|
69
|
+
} else if (pathParts.length === 2) {
|
|
70
|
+
return { category: pathParts[0], page: pathParts[1] };
|
|
71
|
+
} else {
|
|
72
|
+
// Deeper path: join everything except last as category
|
|
73
|
+
return {
|
|
74
|
+
category: pathParts.slice(0, -1).join("/"),
|
|
75
|
+
page: pathParts[pathParts.length - 1],
|
|
76
|
+
};
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
/** Fetch markdown content from URL */
|
|
81
|
+
export async function fetchMarkdown(url: string, userAgent = "@ebowwa/markdown-docs-scraper"): Promise<string | null> {
|
|
82
|
+
try {
|
|
83
|
+
const response = await fetch(url, {
|
|
84
|
+
headers: {
|
|
85
|
+
Accept: "text/markdown, text/plain",
|
|
86
|
+
"User-Agent": userAgent,
|
|
87
|
+
},
|
|
88
|
+
});
|
|
89
|
+
|
|
90
|
+
if (!response.ok) {
|
|
91
|
+
return null;
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
return await response.text();
|
|
95
|
+
} catch (error) {
|
|
96
|
+
console.error(`Error fetching ${url}:`, error);
|
|
97
|
+
return null;
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
// ============================================================================
|
|
102
|
+
// SCRAPER CLASS
|
|
36
103
|
// ============================================================================
|
|
37
104
|
|
|
38
105
|
export class MarkdownDocsScraper {
|
|
@@ -46,64 +113,23 @@ export class MarkdownDocsScraper {
|
|
|
46
113
|
outputDir: options.outputDir || "./docs",
|
|
47
114
|
concurrency: options.concurrency || 5,
|
|
48
115
|
onProgress: options.onProgress || (() => {}),
|
|
116
|
+
llmsPaths: options.llmsPaths || ["/llms.txt", "/docs/llms.txt"],
|
|
117
|
+
tryDocsSubdomain: options.tryDocsSubdomain ?? true,
|
|
118
|
+
linkPattern: options.linkPattern || GENERIC_LINK_PATTERN,
|
|
49
119
|
};
|
|
50
120
|
}
|
|
51
121
|
|
|
52
|
-
/**
|
|
53
|
-
* Fetch markdown content from a URL
|
|
54
|
-
*/
|
|
55
|
-
async fetchMarkdown(url: string): Promise<string | null> {
|
|
56
|
-
try {
|
|
57
|
-
const response = await fetch(url, {
|
|
58
|
-
headers: {
|
|
59
|
-
Accept: "text/markdown, text/plain",
|
|
60
|
-
"User-Agent": "@ebowwa/markdown-docs-scraper",
|
|
61
|
-
},
|
|
62
|
-
});
|
|
63
|
-
|
|
64
|
-
if (!response.ok) {
|
|
65
|
-
return null;
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
const contentType = response.headers.get("content-type") || "";
|
|
69
|
-
if (!contentType.includes("markdown") && !contentType.includes("text/plain")) {
|
|
70
|
-
// Try to parse anyway - some sites return incorrect content-type
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
return await response.text();
|
|
74
|
-
} catch (error) {
|
|
75
|
-
console.error(`Error fetching ${url}:`, error);
|
|
76
|
-
return null;
|
|
77
|
-
}
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
/**
|
|
81
|
-
* Extract title from markdown content
|
|
82
|
-
*/
|
|
83
|
-
extractTitle(markdown: string): string {
|
|
84
|
-
const titleMatch = markdown.match(/^#\s+(.+)$/m);
|
|
85
|
-
return titleMatch ? titleMatch[1].trim() : "Untitled";
|
|
86
|
-
}
|
|
87
|
-
|
|
88
|
-
/**
|
|
89
|
-
* Sanitize filename from URL path
|
|
90
|
-
*/
|
|
91
|
-
sanitizeFilename(path: string): string {
|
|
92
|
-
return path
|
|
93
|
-
.toLowerCase()
|
|
94
|
-
.replace(/[^a-z0-9/]+/g, "-")
|
|
95
|
-
.replace(/^-|-$/g, "")
|
|
96
|
-
.replace(/\//g, "/");
|
|
97
|
-
}
|
|
98
|
-
|
|
99
122
|
/**
|
|
100
123
|
* Build URL for a documentation page
|
|
101
124
|
*/
|
|
102
125
|
buildUrl(category: string, page: string): string {
|
|
103
126
|
if (category) {
|
|
104
127
|
return `${this.options.baseUrl}${this.options.docsPath}/${category}/${page}.md`;
|
|
105
|
-
} else {
|
|
128
|
+
} else if (this.options.docsPath) {
|
|
106
129
|
return `${this.options.baseUrl}${this.options.docsPath}/${page}.md`;
|
|
130
|
+
} else {
|
|
131
|
+
// No docsPath (like Polymarket) - direct path
|
|
132
|
+
return `${this.options.baseUrl}/${page}.md`;
|
|
107
133
|
}
|
|
108
134
|
}
|
|
109
135
|
|
|
@@ -112,7 +138,7 @@ export class MarkdownDocsScraper {
|
|
|
112
138
|
*/
|
|
113
139
|
async downloadPage(category: string, page: string): Promise<DocPage | null> {
|
|
114
140
|
const url = this.buildUrl(category, page);
|
|
115
|
-
const content = await this.fetchMarkdown(url);
|
|
141
|
+
const content = await fetchMarkdown(url);
|
|
116
142
|
|
|
117
143
|
if (!content) {
|
|
118
144
|
return null;
|
|
@@ -120,13 +146,112 @@ export class MarkdownDocsScraper {
|
|
|
120
146
|
|
|
121
147
|
return {
|
|
122
148
|
url,
|
|
123
|
-
title: this.extractTitle(content),
|
|
149
|
+
title: extractTitle(content),
|
|
124
150
|
content,
|
|
125
151
|
category,
|
|
126
|
-
pageName: page,
|
|
152
|
+
pageName: page,
|
|
127
153
|
};
|
|
128
154
|
}
|
|
129
155
|
|
|
156
|
+
/**
|
|
157
|
+
* Generate possible llms.txt URLs to try
|
|
158
|
+
*/
|
|
159
|
+
private getLlmsUrls(): string[] {
|
|
160
|
+
const urls: string[] = [];
|
|
161
|
+
const baseUrl = this.options.baseUrl;
|
|
162
|
+
|
|
163
|
+
// Try configured/custom paths first
|
|
164
|
+
for (const path of this.options.llmsPaths) {
|
|
165
|
+
urls.push(`${baseUrl}${path}`);
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
// Also try docs/doc subdomain variants if enabled
|
|
169
|
+
if (this.options.tryDocsSubdomain) {
|
|
170
|
+
try {
|
|
171
|
+
const url = new URL(baseUrl);
|
|
172
|
+
const hostname = url.hostname;
|
|
173
|
+
|
|
174
|
+
// Skip if already on docs/doc subdomain
|
|
175
|
+
if (!hostname.startsWith("docs.") && !hostname.startsWith("doc.")) {
|
|
176
|
+
// Try docs.{domain}
|
|
177
|
+
const docsDomain = hostname.replace(/^www\./, "");
|
|
178
|
+
urls.push(`${url.protocol}//docs.${docsDomain}/llms.txt`);
|
|
179
|
+
urls.push(`${url.protocol}//docs.${docsDomain}/docs/llms.txt`);
|
|
180
|
+
}
|
|
181
|
+
} catch {
|
|
182
|
+
// Invalid URL, skip subdomain variants
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
return urls;
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
/**
|
|
190
|
+
* Fetch llms.txt from multiple possible URLs with fallback
|
|
191
|
+
*/
|
|
192
|
+
private async fetchLlmsTxt(): Promise<{ content: string; url: string } | null> {
|
|
193
|
+
const urls = this.getLlmsUrls();
|
|
194
|
+
|
|
195
|
+
for (const llmsUrl of urls) {
|
|
196
|
+
try {
|
|
197
|
+
const response = await fetch(llmsUrl, {
|
|
198
|
+
headers: {
|
|
199
|
+
Accept: "text/plain",
|
|
200
|
+
"User-Agent": "@ebowwa/markdown-docs-scraper",
|
|
201
|
+
},
|
|
202
|
+
});
|
|
203
|
+
|
|
204
|
+
if (response.ok) {
|
|
205
|
+
const content = await response.text();
|
|
206
|
+
console.log(`Found llms.txt at ${llmsUrl}`);
|
|
207
|
+
return { content, url: llmsUrl };
|
|
208
|
+
}
|
|
209
|
+
} catch (error) {
|
|
210
|
+
// Try next URL
|
|
211
|
+
continue;
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
return null;
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
/**
|
|
219
|
+
* Discover pages from llms.txt index
|
|
220
|
+
*/
|
|
221
|
+
async discoverPages(): Promise<Array<{ category: string; page: string }>> {
|
|
222
|
+
const pages: Array<{ category: string; page: string }> = [];
|
|
223
|
+
|
|
224
|
+
try {
|
|
225
|
+
const llmsResult = await this.fetchLlmsTxt();
|
|
226
|
+
|
|
227
|
+
if (!llmsResult) {
|
|
228
|
+
const attemptedUrls = this.getLlmsUrls();
|
|
229
|
+
console.warn(`Could not fetch llms.txt from any of: ${attemptedUrls.join(", ")}`);
|
|
230
|
+
return pages;
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
const { content } = llmsResult;
|
|
234
|
+
|
|
235
|
+
// Use provided pattern or default
|
|
236
|
+
const regex = new RegExp(this.options.linkPattern.source, this.options.linkPattern.flags);
|
|
237
|
+
let match;
|
|
238
|
+
|
|
239
|
+
while ((match = regex.exec(content)) !== null) {
|
|
240
|
+
const url = match[2];
|
|
241
|
+
const pagePath = match[3]; // The captured path group
|
|
242
|
+
|
|
243
|
+
const { category, page } = parsePagePath(pagePath);
|
|
244
|
+
pages.push({ category, page });
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
console.log(`Discovered ${pages.length} pages from llms.txt`);
|
|
248
|
+
} catch (error) {
|
|
249
|
+
console.error("Error discovering pages:", error);
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
return pages;
|
|
253
|
+
}
|
|
254
|
+
|
|
130
255
|
/**
|
|
131
256
|
* Scrape pages discovered from llms.txt
|
|
132
257
|
*/
|
|
@@ -175,7 +300,7 @@ export class MarkdownDocsScraper {
|
|
|
175
300
|
}
|
|
176
301
|
|
|
177
302
|
/**
|
|
178
|
-
* Scrape all documentation pages
|
|
303
|
+
* Scrape all documentation pages (uses categories)
|
|
179
304
|
*/
|
|
180
305
|
async scrape(): Promise<ScraperResult> {
|
|
181
306
|
const startTime = Date.now();
|
|
@@ -255,102 +380,6 @@ export class MarkdownDocsScraper {
|
|
|
255
380
|
|
|
256
381
|
return pages;
|
|
257
382
|
}
|
|
258
|
-
|
|
259
|
-
/**
|
|
260
|
-
* Discover pages from llms.txt index
|
|
261
|
-
*/
|
|
262
|
-
async discoverPages(): Promise<Array<{ category: string; page: string }>> {
|
|
263
|
-
const pages: Array<{ category: string; page: string }> = [];
|
|
264
|
-
|
|
265
|
-
try {
|
|
266
|
-
const llmsUrl = `${this.options.baseUrl}/docs/llms.txt`;
|
|
267
|
-
const response = await fetch(llmsUrl, {
|
|
268
|
-
headers: {
|
|
269
|
-
Accept: "text/plain",
|
|
270
|
-
"User-Agent": "@ebowwa/markdown-docs-scraper",
|
|
271
|
-
},
|
|
272
|
-
});
|
|
273
|
-
|
|
274
|
-
if (!response.ok) {
|
|
275
|
-
console.warn(`Could not fetch llms.txt from ${llmsUrl}`);
|
|
276
|
-
return pages;
|
|
277
|
-
}
|
|
278
|
-
|
|
279
|
-
const content = await response.text();
|
|
280
|
-
|
|
281
|
-
// Parse markdown links in format: [title](https://code.claude.com/docs/en/page.md)
|
|
282
|
-
const linkRegex = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
|
|
283
|
-
let match;
|
|
284
|
-
|
|
285
|
-
while ((match = linkRegex.exec(content)) !== null) {
|
|
286
|
-
const url = match[2];
|
|
287
|
-
const pagePath = match[3]; // e.g., "agent-teams.md" or "category/page.md"
|
|
288
|
-
|
|
289
|
-
// Remove .md extension
|
|
290
|
-
const pageName = pagePath.replace(".md", "");
|
|
291
|
-
|
|
292
|
-
// Check if there's a category in the path
|
|
293
|
-
const pathParts = pageName.split("/");
|
|
294
|
-
|
|
295
|
-
if (pathParts.length === 1) {
|
|
296
|
-
// No category: just "page-name"
|
|
297
|
-
pages.push({ category: "", page: pathParts[0] });
|
|
298
|
-
} else if (pathParts.length === 2) {
|
|
299
|
-
// Has category: "category/page-name"
|
|
300
|
-
pages.push({ category: pathParts[0], page: pathParts[1] });
|
|
301
|
-
} else {
|
|
302
|
-
// Deeper path: join everything except last as category
|
|
303
|
-
const category = pathParts.slice(0, -1).join("/");
|
|
304
|
-
const page = pathParts[pathParts.length - 1];
|
|
305
|
-
pages.push({ category, page });
|
|
306
|
-
}
|
|
307
|
-
}
|
|
308
|
-
|
|
309
|
-
console.log(`Discovered ${pages.length} pages from llms.txt`);
|
|
310
|
-
} catch (error) {
|
|
311
|
-
console.error("Error discovering pages:", error);
|
|
312
|
-
}
|
|
313
|
-
|
|
314
|
-
return pages;
|
|
315
|
-
}
|
|
316
|
-
|
|
317
|
-
/**
|
|
318
|
-
* Discover additional pages by parsing the docs index (fallback)
|
|
319
|
-
*/
|
|
320
|
-
async discoverPagesHtml(): Promise<string[]> {
|
|
321
|
-
const discovered: string[] = [];
|
|
322
|
-
|
|
323
|
-
try {
|
|
324
|
-
const indexUrl = `${this.options.baseUrl}${this.options.docsPath}`;
|
|
325
|
-
const response = await fetch(indexUrl, {
|
|
326
|
-
headers: {
|
|
327
|
-
Accept: "text/html",
|
|
328
|
-
"User-Agent": "@ebowwa/markdown-docs-scraper",
|
|
329
|
-
},
|
|
330
|
-
});
|
|
331
|
-
|
|
332
|
-
if (!response.ok) {
|
|
333
|
-
return discovered;
|
|
334
|
-
}
|
|
335
|
-
|
|
336
|
-
const html = await response.text();
|
|
337
|
-
const mdLinkRegex = /href="\/docs\/en\/([^"]+\.md)"/g;
|
|
338
|
-
let match;
|
|
339
|
-
|
|
340
|
-
while ((match = mdLinkRegex.exec(html)) !== null) {
|
|
341
|
-
const path = match[1];
|
|
342
|
-
if (!discovered.includes(path)) {
|
|
343
|
-
discovered.push(path);
|
|
344
|
-
}
|
|
345
|
-
}
|
|
346
|
-
|
|
347
|
-
console.log(`Discovered ${discovered.length} additional pages from HTML`);
|
|
348
|
-
} catch (error) {
|
|
349
|
-
console.error("Error discovering pages from HTML:", error);
|
|
350
|
-
}
|
|
351
|
-
|
|
352
|
-
return discovered;
|
|
353
|
-
}
|
|
354
383
|
}
|
|
355
384
|
|
|
356
385
|
// ============================================================================
|
|
@@ -375,6 +404,42 @@ export async function scrapeMarkdownDocs(
|
|
|
375
404
|
return result;
|
|
376
405
|
}
|
|
377
406
|
|
|
407
|
+
// ============================================================================
|
|
408
|
+
// PRESET CONFIGURATIONS (Composable)
|
|
409
|
+
// ============================================================================
|
|
410
|
+
|
|
411
|
+
/** Pattern for Claude Code docs: /docs/en/page.md */
|
|
412
|
+
export const CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
|
|
413
|
+
|
|
414
|
+
/** Pattern for generic docs: any domain/path.md */
|
|
415
|
+
export const GENERIC_PATTERN = GENERIC_LINK_PATTERN;
|
|
416
|
+
|
|
417
|
+
/** Create scraper options for Claude Code docs */
|
|
418
|
+
export function claudeCodeOptions(outputDir: string): ScraperOptions {
|
|
419
|
+
return {
|
|
420
|
+
baseUrl: "https://code.claude.com",
|
|
421
|
+
docsPath: "/docs/en",
|
|
422
|
+
llmsPaths: ["/docs/llms.txt"],
|
|
423
|
+
linkPattern: CLAUDE_CODE_PATTERN,
|
|
424
|
+
outputDir,
|
|
425
|
+
concurrency: 10,
|
|
426
|
+
tryDocsSubdomain: false,
|
|
427
|
+
};
|
|
428
|
+
}
|
|
429
|
+
|
|
430
|
+
/** Create scraper options for Polymarket docs */
|
|
431
|
+
export function polymarketOptions(outputDir: string): ScraperOptions {
|
|
432
|
+
return {
|
|
433
|
+
baseUrl: "https://docs.polymarket.com",
|
|
434
|
+
docsPath: "",
|
|
435
|
+
llmsPaths: ["/llms.txt"],
|
|
436
|
+
linkPattern: GENERIC_PATTERN,
|
|
437
|
+
outputDir,
|
|
438
|
+
concurrency: 10,
|
|
439
|
+
tryDocsSubdomain: false,
|
|
440
|
+
};
|
|
441
|
+
}
|
|
442
|
+
|
|
378
443
|
// ============================================================================
|
|
379
444
|
// EXPORTS
|
|
380
445
|
// ============================================================================
|