@ebowwa/markdown-docs-scraper 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +104 -0
- package/dist/cli.d.ts +6 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +19 -13
- package/dist/index.d.ts +116 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +323 -105
- package/dist/scrapers/github-raw.d.ts +9 -0
- package/dist/scrapers/github-raw.d.ts.map +1 -0
- package/dist/scrapers/index.d.ts +11 -0
- package/dist/scrapers/index.d.ts.map +1 -0
- package/dist/scrapers/index.js +428 -0
- package/dist/scrapers/llms-txt.d.ts +13 -0
- package/dist/scrapers/llms-txt.d.ts.map +1 -0
- package/dist/scrapers/registry.d.ts +23 -0
- package/dist/scrapers/registry.d.ts.map +1 -0
- package/dist/scrapers/types.d.ts +57 -0
- package/dist/scrapers/types.d.ts.map +1 -0
- package/package.json +10 -2
- package/src/cli.js +160 -0
- package/src/cli.ts +12 -1
- package/src/index.js +487 -0
- package/src/index.ts +276 -158
- package/src/scrapers/github-raw.ts +154 -0
- package/src/scrapers/index.ts +16 -0
- package/src/scrapers/llms-txt.ts +101 -0
- package/src/scrapers/registry.ts +55 -0
- package/src/scrapers/types.ts +79 -0
package/dist/index.js
CHANGED
|
@@ -17,7 +17,199 @@ var __toESM = (mod, isNodeMode, target) => {
|
|
|
17
17
|
};
|
|
18
18
|
var __require = /* @__PURE__ */ createRequire(import.meta.url);
|
|
19
19
|
|
|
20
|
+
// src/scrapers/llms-txt.ts
// Matches markdown links of the form [title](https://host/docs/en/page.md),
// the layout used by Claude Code's llms.txt index.
var CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
// Generic fallback: any absolute markdown link ending in .md.
var GENERIC_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;
// Scraper that discovers pages via an llms.txt index and downloads each page,
// then normalizes the result into { downloaded, failed, duration }.
var llmsTxtScraper = {
  type: "llms-txt",
  async scrape(config) {
    const options = getScraperOptions(config);
    const result = await scrapeMarkdownDocs(options);
    const downloaded = result.downloaded.map((page) => {
      const category = page.category || "";
      const filename = `${page.pageName || "untitled"}.md`;
      // BUG FIX: was `${category}/$(unknown)` — "$(unknown)" is not template
      // literal syntax, so every categorized page got a literal
      // "category/$(unknown)" path. Join category and filename instead.
      const path = category ? `${category}/${filename}` : filename;
      return {
        success: true,
        path,
        title: page.title
      };
    });
    return {
      downloaded,
      failed: result.failed,
      duration: result.duration
    };
  }
};
|
|
45
|
+
// Builds MarkdownDocsScraper options for a source config.
// Known sources (Claude Code, Polymarket, Bun) get hard-wired llms.txt
// locations and link patterns; everything else honors the config's own
// llmsTxtPath/linkPattern or falls back to common defaults.
function getScraperOptions(config) {
  const baseOptions = {
    baseUrl: config.baseUrl,
    docsPath: config.docsPath,
    outputDir: config.outputDir,
    concurrency: 10,
    useLlms: true,
    tryDocsSubdomain: false
  };
  // Per-source overrides keyed by the config's display name.
  const knownSources = {
    "Claude Code": {
      llmsPaths: ["/docs/llms.txt"],
      linkPattern: CLAUDE_CODE_PATTERN
    },
    "Polymarket": {
      llmsPaths: ["/llms.txt"],
      linkPattern: GENERIC_PATTERN
    },
    "Bun": {
      llmsPaths: ["/docs/llms.txt", "/llms.txt"],
      linkPattern: GENERIC_PATTERN
    }
  };
  const override = knownSources[config.name];
  if (override) {
    return { ...baseOptions, ...override };
  }
  return {
    ...baseOptions,
    llmsPaths: config.llmsTxtPath ? [config.llmsTxtPath] : ["/llms.txt", "/docs/llms.txt"],
    linkPattern: config.linkPattern || GENERIC_PATTERN
  };
}
|
|
81
|
+
// src/scrapers/github-raw.ts
// Scraper that lists .md files in a GitHub repo directory via the contents
// API, then downloads each from raw.githubusercontent.com and saves it to
// config.outputDir. Throws when github.repo is missing; per-file fetch
// failures are recorded in `failed` rather than aborting the run.
var githubRawScraper = {
  type: "github-raw",
  async scrape(config) {
    const startTime = Date.now();
    const downloaded = [];
    const failed = [];
    if (!config.github?.repo) {
      throw new Error(`GitHub source "${config.name}" missing github.repo config`);
    }
    const files = await fetchGitHubMarkdownFiles(config.github.repo, config.docsPath.replace(/^\//, ""));
    for (const file of files) {
      const content = await fetchGitHubRawContent(config.github.repo, file.path);
      if (content) {
        downloaded.push({
          success: true,
          path: file.name,
          // BUG FIX: .replace(".md", "") removed the FIRST ".md" occurrence
          // anywhere in the name (corrupting names like "v1.md-notes.md");
          // anchor the pattern so only the trailing extension is stripped.
          title: extractTitle(content) || file.name.replace(/\.md$/, "")
        });
        await saveFile(config.outputDir, file.name, content);
      } else {
        failed.push({
          url: `https://raw.githubusercontent.com/${config.github.repo}/main/${file.path}`,
          error: "Failed to fetch content"
        });
      }
    }
    return {
      downloaded,
      failed,
      duration: Date.now() - startTime
    };
  }
};
|
|
115
|
+
// Lists markdown files under `path` of `repo` via the GitHub contents API.
// Returns only regular files whose name ends in ".md".
// Throws an Error on any non-2xx API response.
async function fetchGitHubMarkdownFiles(repo, path) {
  const url = `https://api.github.com/repos/${repo}/contents/${path}`;
  const response = await fetch(url, {
    headers: {
      Accept: "application/vnd.github.v3+json",
      "User-Agent": "@ebowwa/markdown-docs-scraper"
    }
  });
  if (!response.ok) {
    throw new Error(`GitHub API error: ${response.status} ${response.statusText}`);
  }
  const contents = await response.json();
  // ROBUSTNESS: the contents API returns a single object (not an array) when
  // `path` points at a file; wrap it so .filter() cannot throw.
  const items = Array.isArray(contents) ? contents : [contents];
  return items.filter((item) => item.type === "file" && item.name.endsWith(".md"));
}
|
|
129
|
+
// Fetches one file's raw content from raw.githubusercontent.com.
// `branch` defaults to "main" (previously hard-coded), so repos using a
// different default branch can now be scraped without a behavior change
// for existing callers. Returns null on HTTP error or network failure so
// the caller can record the miss instead of aborting.
async function fetchGitHubRawContent(repo, path, branch = "main") {
  const url = `https://raw.githubusercontent.com/${repo}/${branch}/${path}`;
  try {
    const response = await fetch(url, {
      headers: {
        Accept: "text/plain",
        "User-Agent": "@ebowwa/markdown-docs-scraper"
      }
    });
    if (!response.ok) {
      return null;
    }
    return await response.text();
  } catch (error) {
    console.error(`Error fetching ${url}:`, error);
    return null;
  }
}
|
|
147
|
+
// Returns the text of the first ATX level-1 heading ("# Title") in the
// markdown, trimmed; null when no such heading exists.
function extractTitle(markdown) {
  const heading = /^#\s+(.+)$/m.exec(markdown);
  if (!heading) {
    return null;
  }
  return heading[1].trim();
}
|
|
151
|
+
// Writes `content` to outputDir/filename (UTF-8), creating any missing
// parent directories first. fs/path are imported lazily to keep module
// load side-effect free.
async function saveFile(outputDir, filename, content) {
  const [fs, path] = await Promise.all([import("fs/promises"), import("path")]);
  const target = path.join(outputDir, filename);
  await fs.mkdir(path.dirname(target), { recursive: true });
  await fs.writeFile(target, content, "utf-8");
}
|
|
158
|
+
// src/scrapers/registry.ts
// Scraper registry: maps a sourceType string to its Scraper implementation.
var scrapers = new Map();

// Adds (or replaces) a scraper under its declared type.
function registerScraper(scraper) {
  scrapers.set(scraper.type, scraper);
}

// Looks up a scraper by type; undefined when none is registered.
function getScraper(type) {
  return scrapers.get(type);
}

// Dispatches config to the scraper registered for config.sourceType.
// Throws when no scraper is registered for that type.
async function scrapeSource(config) {
  const handler = scrapers.get(config.sourceType);
  if (handler === undefined) {
    throw new Error(`No scraper registered for type: ${config.sourceType}`);
  }
  return handler.scrape(config);
}

// Built-in scrapers available out of the box.
registerScraper(llmsTxtScraper);
registerScraper(githubRawScraper);
|
|
20
175
|
// src/index.ts
// Matches [title](https://host/path/page.md); captures the full URL (2) and
// the host-relative page path (3).
var GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;
// First level-1 heading of a markdown document, or "Untitled" when absent.
function extractTitle2(markdown) {
  const heading = /^#\s+(.+)$/m.exec(markdown);
  return heading === null ? "Untitled" : heading[1].trim();
}
|
|
181
|
+
// Splits a page path like "guide/intro.md" into { category, page }.
// Root-level pages get category ""; deeper paths keep the joined directory
// prefix as the category and the last segment as the page.
function parsePagePath(pagePath) {
  // BUG FIX: .replace(".md", "") removed the FIRST ".md" occurrence anywhere
  // in the path (corrupting names like "v1.md-notes.md"); anchor to the end
  // so only the trailing extension is stripped.
  const pageName = pagePath.replace(/\.md$/, "");
  const pathParts = pageName.split("/");
  if (pathParts.length === 1) {
    return { category: "", page: pathParts[0] };
  }
  // The original length-2 special case is subsumed by the general branch:
  // slice(0, -1).join("/") of two parts is just the first part.
  return {
    category: pathParts.slice(0, -1).join("/"),
    page: pathParts[pathParts.length - 1]
  };
}
|
|
195
|
+
// Downloads a markdown document from `url` with the given User-Agent.
// Resolves to the response body, or null on HTTP error / network failure
// (network errors are logged, never thrown).
async function fetchMarkdown(url, userAgent = "@ebowwa/markdown-docs-scraper") {
  const headers = {
    Accept: "text/markdown, text/plain",
    "User-Agent": userAgent
  };
  try {
    const response = await fetch(url, { headers });
    return response.ok ? await response.text() : null;
  } catch (error) {
    console.error(`Error fetching ${url}:`, error);
    return null;
  }
}
|
|
212
|
+
|
|
21
213
|
class MarkdownDocsScraper {
|
|
22
214
|
options;
|
|
23
215
|
constructor(options) {
|
|
@@ -27,56 +219,107 @@ class MarkdownDocsScraper {
|
|
|
27
219
|
categories: options.categories || {},
|
|
28
220
|
outputDir: options.outputDir || "./docs",
|
|
29
221
|
concurrency: options.concurrency || 5,
|
|
30
|
-
onProgress: options.onProgress || (() => {})
|
|
222
|
+
onProgress: options.onProgress || (() => {}),
|
|
223
|
+
llmsPaths: options.llmsPaths || ["/llms.txt", "/docs/llms.txt"],
|
|
224
|
+
tryDocsSubdomain: options.tryDocsSubdomain ?? true,
|
|
225
|
+
linkPattern: options.linkPattern || GENERIC_LINK_PATTERN,
|
|
226
|
+
useDirectUrls: options.useDirectUrls ?? true
|
|
31
227
|
};
|
|
32
228
|
}
|
|
33
|
-
async fetchMarkdown(url) {
|
|
34
|
-
try {
|
|
35
|
-
const response = await fetch(url, {
|
|
36
|
-
headers: {
|
|
37
|
-
Accept: "text/markdown, text/plain",
|
|
38
|
-
"User-Agent": "@ebowwa/markdown-docs-scraper"
|
|
39
|
-
}
|
|
40
|
-
});
|
|
41
|
-
if (!response.ok) {
|
|
42
|
-
return null;
|
|
43
|
-
}
|
|
44
|
-
const contentType = response.headers.get("content-type") || "";
|
|
45
|
-
if (!contentType.includes("markdown") && !contentType.includes("text/plain")) {}
|
|
46
|
-
return await response.text();
|
|
47
|
-
} catch (error) {
|
|
48
|
-
console.error(`Error fetching ${url}:`, error);
|
|
49
|
-
return null;
|
|
50
|
-
}
|
|
51
|
-
}
|
|
52
|
-
extractTitle(markdown) {
|
|
53
|
-
const titleMatch = markdown.match(/^#\s+(.+)$/m);
|
|
54
|
-
return titleMatch ? titleMatch[1].trim() : "Untitled";
|
|
55
|
-
}
|
|
56
|
-
sanitizeFilename(path) {
|
|
57
|
-
return path.toLowerCase().replace(/[^a-z0-9/]+/g, "-").replace(/^-|-$/g, "").replace(/\//g, "/");
|
|
58
|
-
}
|
|
59
229
|
// Builds the .md URL for a page: baseUrl + docsPath + category + page.
// When there is no category, docsPath is optional (empty docsPath means
// pages live directly under baseUrl).
buildUrl(category, page) {
  if (category) {
    return `${this.options.baseUrl}${this.options.docsPath}/${category}/${page}.md`;
  } else if (this.options.docsPath) {
    return `${this.options.baseUrl}${this.options.docsPath}/${page}.md`;
  } else {
    return `${this.options.baseUrl}/${page}.md`;
  }
}
|
|
66
|
-
// Downloads a single page. Prefers the exact URL discovered in llms.txt
// (pageInfo.fullUrl) when useDirectUrls is enabled; otherwise rebuilds the
// URL from category/page via buildUrl(). Returns null when the fetch fails,
// so callers can record the miss.
async downloadPage(pageInfo) {
  const url = this.options.useDirectUrls && pageInfo.fullUrl ? pageInfo.fullUrl : this.buildUrl(pageInfo.category, pageInfo.page);
  const content = await fetchMarkdown(url);
  if (!content) {
    return null;
  }
  return {
    url,
    title: extractTitle2(content),
    content,
    category: pageInfo.category,
    pageName: pageInfo.page
  };
}
|
|
252
|
+
// Candidate llms.txt URLs: every configured llmsPaths entry on baseUrl,
// plus — when tryDocsSubdomain is on and baseUrl is not already a
// docs./doc. host — the two well-known paths on a "docs." subdomain.
getLlmsUrls() {
  const urls = [];
  const baseUrl = this.options.baseUrl;
  for (const path of this.options.llmsPaths) {
    urls.push(`${baseUrl}${path}`);
  }
  if (this.options.tryDocsSubdomain) {
    try {
      const url = new URL(baseUrl);
      const hostname = url.hostname;
      if (!hostname.startsWith("docs.") && !hostname.startsWith("doc.")) {
        // Strip a leading "www." before prefixing "docs." to the host.
        const docsDomain = hostname.replace(/^www\./, "");
        urls.push(`${url.protocol}//docs.${docsDomain}/llms.txt`);
        urls.push(`${url.protocol}//docs.${docsDomain}/docs/llms.txt`);
      }
    } catch {} // invalid baseUrl: silently skip the subdomain candidates
  }
  return urls;
}
|
|
271
|
+
async fetchLlmsTxt() {
|
|
272
|
+
const urls = this.getLlmsUrls();
|
|
273
|
+
console.log(`DEBUG: Trying URLs: ${urls.join(", ")}`);
|
|
274
|
+
for (const llmsUrl of urls) {
|
|
275
|
+
try {
|
|
276
|
+
console.log(`DEBUG: Fetching ${llmsUrl}...`);
|
|
277
|
+
const response = await fetch(llmsUrl, {
|
|
278
|
+
headers: {
|
|
279
|
+
Accept: "text/plain",
|
|
280
|
+
"User-Agent": "@ebowwa/markdown-docs-scraper"
|
|
281
|
+
}
|
|
282
|
+
});
|
|
283
|
+
console.log(`DEBUG: Response status: ${response.status}`);
|
|
284
|
+
if (response.ok) {
|
|
285
|
+
const content = await response.text();
|
|
286
|
+
console.log(`Found llms.txt at ${llmsUrl}`);
|
|
287
|
+
return { content, url: llmsUrl };
|
|
288
|
+
}
|
|
289
|
+
} catch (error) {
|
|
290
|
+
console.log(`DEBUG: Error: ${error}`);
|
|
291
|
+
continue;
|
|
292
|
+
}
|
|
293
|
+
}
|
|
294
|
+
return null;
|
|
295
|
+
}
|
|
296
|
+
async discoverPages() {
|
|
297
|
+
const pages = [];
|
|
298
|
+
try {
|
|
299
|
+
const llmsResult = await this.fetchLlmsTxt();
|
|
300
|
+
if (!llmsResult) {
|
|
301
|
+
const attemptedUrls = this.getLlmsUrls();
|
|
302
|
+
console.warn(`Could not fetch llms.txt from any of: ${attemptedUrls.join(", ")}`);
|
|
303
|
+
return pages;
|
|
304
|
+
}
|
|
305
|
+
const { content } = llmsResult;
|
|
306
|
+
const pattern = this.options.linkPattern;
|
|
307
|
+
const regex = new RegExp(pattern.source, pattern.flags);
|
|
308
|
+
let match;
|
|
309
|
+
console.log(`DEBUG: Using pattern: ${pattern.source}`);
|
|
310
|
+
console.log(`DEBUG: Content length: ${content.length}`);
|
|
311
|
+
while ((match = regex.exec(content)) !== null) {
|
|
312
|
+
const fullUrl = match[2];
|
|
313
|
+
const pagePath = match[3];
|
|
314
|
+
const { category, page } = parsePagePath(pagePath);
|
|
315
|
+
pages.push({ category, page, fullUrl });
|
|
316
|
+
}
|
|
317
|
+
console.log(`Discovered ${pages.length} pages from llms.txt`);
|
|
318
|
+
} catch (error) {
|
|
319
|
+
console.error("Error discovering pages:", error);
|
|
320
|
+
}
|
|
321
|
+
return pages;
|
|
322
|
+
}
|
|
80
323
|
async scrapeFromLlms() {
|
|
81
324
|
const startTime = Date.now();
|
|
82
325
|
const downloaded = [];
|
|
@@ -89,14 +332,15 @@ class MarkdownDocsScraper {
|
|
|
89
332
|
console.log(`Scraping ${pages.length} discovered pages...`);
|
|
90
333
|
for (let i = 0;i < pages.length; i += this.options.concurrency) {
|
|
91
334
|
const batch = pages.slice(i, i + this.options.concurrency);
|
|
92
|
-
const results = await Promise.allSettled(batch.map((page) => this.downloadPage(page
|
|
335
|
+
const results = await Promise.allSettled(batch.map((page) => this.downloadPage(page)));
|
|
93
336
|
results.forEach((result, index) => {
|
|
94
337
|
const page = batch[index];
|
|
95
338
|
if (result.status === "fulfilled" && result.value) {
|
|
96
339
|
downloaded.push(result.value);
|
|
97
340
|
} else {
|
|
341
|
+
const url = this.options.useDirectUrls && page.fullUrl ? page.fullUrl : this.buildUrl(page.category, page.page);
|
|
98
342
|
failed.push({
|
|
99
|
-
url
|
|
343
|
+
url,
|
|
100
344
|
error: result.status === "rejected" ? result.reason : "Not found"
|
|
101
345
|
});
|
|
102
346
|
}
|
|
@@ -118,7 +362,7 @@ class MarkdownDocsScraper {
|
|
|
118
362
|
console.log(`Scraping ${total} pages from ${this.options.baseUrl}...`);
|
|
119
363
|
for (let i = 0;i < pages.length; i += this.options.concurrency) {
|
|
120
364
|
const batch = pages.slice(i, i + this.options.concurrency);
|
|
121
|
-
const results = await Promise.allSettled(batch.map((page) => this.downloadPage(page
|
|
365
|
+
const results = await Promise.allSettled(batch.map((page) => this.downloadPage({ ...page, fullUrl: "" })));
|
|
122
366
|
results.forEach((result, index) => {
|
|
123
367
|
const page = batch[index];
|
|
124
368
|
if (result.status === "fulfilled" && result.value) {
|
|
@@ -159,77 +403,11 @@ Downloaded: ${new Date().toISOString()}
|
|
|
159
403
|
const pages = [];
|
|
160
404
|
for (const [category, pageList] of Object.entries(this.options.categories)) {
|
|
161
405
|
for (const page of pageList) {
|
|
162
|
-
pages.push({ category, page });
|
|
406
|
+
pages.push({ category, page, fullUrl: "" });
|
|
163
407
|
}
|
|
164
408
|
}
|
|
165
409
|
return pages;
|
|
166
410
|
}
|
|
167
|
-
async discoverPages() {
|
|
168
|
-
const pages = [];
|
|
169
|
-
try {
|
|
170
|
-
const llmsUrl = `${this.options.baseUrl}/docs/llms.txt`;
|
|
171
|
-
const response = await fetch(llmsUrl, {
|
|
172
|
-
headers: {
|
|
173
|
-
Accept: "text/plain",
|
|
174
|
-
"User-Agent": "@ebowwa/markdown-docs-scraper"
|
|
175
|
-
}
|
|
176
|
-
});
|
|
177
|
-
if (!response.ok) {
|
|
178
|
-
console.warn(`Could not fetch llms.txt from ${llmsUrl}`);
|
|
179
|
-
return pages;
|
|
180
|
-
}
|
|
181
|
-
const content = await response.text();
|
|
182
|
-
const linkRegex = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
|
|
183
|
-
let match;
|
|
184
|
-
while ((match = linkRegex.exec(content)) !== null) {
|
|
185
|
-
const url = match[2];
|
|
186
|
-
const pagePath = match[3];
|
|
187
|
-
const pageName = pagePath.replace(".md", "");
|
|
188
|
-
const pathParts = pageName.split("/");
|
|
189
|
-
if (pathParts.length === 1) {
|
|
190
|
-
pages.push({ category: "", page: pathParts[0] });
|
|
191
|
-
} else if (pathParts.length === 2) {
|
|
192
|
-
pages.push({ category: pathParts[0], page: pathParts[1] });
|
|
193
|
-
} else {
|
|
194
|
-
const category = pathParts.slice(0, -1).join("/");
|
|
195
|
-
const page = pathParts[pathParts.length - 1];
|
|
196
|
-
pages.push({ category, page });
|
|
197
|
-
}
|
|
198
|
-
}
|
|
199
|
-
console.log(`Discovered ${pages.length} pages from llms.txt`);
|
|
200
|
-
} catch (error) {
|
|
201
|
-
console.error("Error discovering pages:", error);
|
|
202
|
-
}
|
|
203
|
-
return pages;
|
|
204
|
-
}
|
|
205
|
-
async discoverPagesHtml() {
|
|
206
|
-
const discovered = [];
|
|
207
|
-
try {
|
|
208
|
-
const indexUrl = `${this.options.baseUrl}${this.options.docsPath}`;
|
|
209
|
-
const response = await fetch(indexUrl, {
|
|
210
|
-
headers: {
|
|
211
|
-
Accept: "text/html",
|
|
212
|
-
"User-Agent": "@ebowwa/markdown-docs-scraper"
|
|
213
|
-
}
|
|
214
|
-
});
|
|
215
|
-
if (!response.ok) {
|
|
216
|
-
return discovered;
|
|
217
|
-
}
|
|
218
|
-
const html = await response.text();
|
|
219
|
-
const mdLinkRegex = /href="\/docs\/en\/([^"]+\.md)"/g;
|
|
220
|
-
let match;
|
|
221
|
-
while ((match = mdLinkRegex.exec(html)) !== null) {
|
|
222
|
-
const path = match[1];
|
|
223
|
-
if (!discovered.includes(path)) {
|
|
224
|
-
discovered.push(path);
|
|
225
|
-
}
|
|
226
|
-
}
|
|
227
|
-
console.log(`Discovered ${discovered.length} additional pages from HTML`);
|
|
228
|
-
} catch (error) {
|
|
229
|
-
console.error("Error discovering pages from HTML:", error);
|
|
230
|
-
}
|
|
231
|
-
return discovered;
|
|
232
|
-
}
|
|
233
411
|
}
|
|
234
412
|
async function scrapeMarkdownDocs(options) {
|
|
235
413
|
const scraper = new MarkdownDocsScraper(options);
|
|
@@ -239,9 +417,49 @@ async function scrapeMarkdownDocs(options) {
|
|
|
239
417
|
}
|
|
240
418
|
return result;
|
|
241
419
|
}
|
|
420
|
+
// Patterns re-declared at index level for API compatibility with the
// scrapers module exports.
var CLAUDE_CODE_PATTERN2 = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
var GENERIC_PATTERN2 = GENERIC_LINK_PATTERN;
// Preset scraper options for the Claude Code docs at code.claude.com.
function claudeCodeOptions(outputDir) {
  const opts = {
    baseUrl: "https://code.claude.com",
    docsPath: "/docs/en",
    llmsPaths: ["/docs/llms.txt"],
    linkPattern: CLAUDE_CODE_PATTERN2,
    outputDir,
    concurrency: 10,
    tryDocsSubdomain: false,
    useDirectUrls: false
  };
  return opts;
}
|
|
434
|
+
// Preset scraper options for the Polymarket docs at docs.polymarket.com.
function polymarketOptions(outputDir) {
  const opts = {
    baseUrl: "https://docs.polymarket.com",
    docsPath: "",
    llmsPaths: ["/llms.txt"],
    linkPattern: GENERIC_PATTERN2,
    outputDir,
    concurrency: 10,
    tryDocsSubdomain: false,
    useDirectUrls: true
  };
  return opts;
}
|
|
242
446
|
var src_default = MarkdownDocsScraper;
|
|
243
447
|
export {
|
|
448
|
+
scrapeSource,
|
|
244
449
|
scrapeMarkdownDocs,
|
|
450
|
+
registerScraper,
|
|
451
|
+
polymarketOptions,
|
|
452
|
+
parsePagePath,
|
|
453
|
+
llmsTxtScraper,
|
|
454
|
+
githubRawScraper,
|
|
455
|
+
getScraper,
|
|
456
|
+
fetchMarkdown,
|
|
457
|
+
extractTitle2 as extractTitle,
|
|
245
458
|
src_default as default,
|
|
246
|
-
|
|
459
|
+
claudeCodeOptions,
|
|
460
|
+
GENERIC_PATTERN as SCRAPER_GENERIC_PATTERN,
|
|
461
|
+
CLAUDE_CODE_PATTERN as SCRAPER_CLAUDE_CODE_PATTERN,
|
|
462
|
+
MarkdownDocsScraper,
|
|
463
|
+
GENERIC_PATTERN2 as GENERIC_PATTERN,
|
|
464
|
+
CLAUDE_CODE_PATTERN2 as CLAUDE_CODE_PATTERN
|
|
247
465
|
};
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GitHub Raw Scraper
|
|
3
|
+
*
|
|
4
|
+
* Downloads markdown files directly from GitHub repositories via raw content URLs.
|
|
5
|
+
* Uses GitHub API to list files, then fetches each from raw.githubusercontent.com
|
|
6
|
+
*/
|
|
7
|
+
import type { Scraper } from "./types";
|
|
8
|
+
export declare const githubRawScraper: Scraper;
|
|
9
|
+
//# sourceMappingURL=github-raw.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"github-raw.d.ts","sourceRoot":"","sources":["../../src/scrapers/github-raw.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,OAAO,EAA8C,MAAM,SAAS,CAAC;AAiBnF,eAAO,MAAM,gBAAgB,EAAE,OA6C9B,CAAC"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Scrapers Module
|
|
3
|
+
*
|
|
4
|
+
* Composable scraper architecture for multiple documentation source types.
|
|
5
|
+
* This module provides a registry-based system for different scraper implementations.
|
|
6
|
+
*/
|
|
7
|
+
export type { SourceType, SourceConfig, Scraper, ScrapeResult, DownloadResult } from "./types";
|
|
8
|
+
export { llmsTxtScraper, CLAUDE_CODE_PATTERN, GENERIC_PATTERN } from "./llms-txt";
|
|
9
|
+
export { githubRawScraper } from "./github-raw";
|
|
10
|
+
export { registerScraper, getScraper, scrapeSource } from "./registry";
|
|
11
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/scrapers/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,YAAY,EAAE,UAAU,EAAE,YAAY,EAAE,OAAO,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,SAAS,CAAC;AAG/F,OAAO,EAAE,cAAc,EAAE,mBAAmB,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAClF,OAAO,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAGhD,OAAO,EAAE,eAAe,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC"}
|