@ebowwa/markdown-docs-scraper 1.0.0 → 1.2.0
- package/README.md +104 -0
- package/dist/cli.d.ts +6 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +19 -13
- package/dist/index.d.ts +116 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +323 -105
- package/dist/scrapers/github-raw.d.ts +9 -0
- package/dist/scrapers/github-raw.d.ts.map +1 -0
- package/dist/scrapers/index.d.ts +11 -0
- package/dist/scrapers/index.d.ts.map +1 -0
- package/dist/scrapers/index.js +428 -0
- package/dist/scrapers/llms-txt.d.ts +13 -0
- package/dist/scrapers/llms-txt.d.ts.map +1 -0
- package/dist/scrapers/registry.d.ts +23 -0
- package/dist/scrapers/registry.d.ts.map +1 -0
- package/dist/scrapers/types.d.ts +57 -0
- package/dist/scrapers/types.d.ts.map +1 -0
- package/package.json +10 -2
- package/src/cli.js +160 -0
- package/src/cli.ts +12 -1
- package/src/index.js +487 -0
- package/src/index.ts +276 -158
- package/src/scrapers/github-raw.ts +154 -0
- package/src/scrapers/index.ts +16 -0
- package/src/scrapers/llms-txt.ts +101 -0
- package/src/scrapers/registry.ts +55 -0
- package/src/scrapers/types.ts +79 -0
package/src/index.ts
CHANGED
@@ -1,7 +1,11 @@
 /**
  * @ebowwa/markdown-docs-scraper
  *
- *
+ * Composable markdown documentation scraper.
+ * - Configurable llms.txt paths with fallbacks
+ * - Custom URL patterns for different doc sites
+ * - Works with any markdown documentation site
+ * - Uses full URLs from llms.txt directly
  */

 // ============================================================================
@@ -23,6 +27,14 @@ export interface ScraperOptions {
   outputDir?: string;
   concurrency?: number;
   onProgress?: (current: number, total: number) => void;
+  /** Custom llms.txt paths to try (default: ["/llms.txt", "/docs/llms.txt"]) */
+  llmsPaths?: string[];
+  /** Also try docs subdomain variants (e.g., docs.example.com) */
+  tryDocsSubdomain?: boolean;
+  /** Custom regex pattern to extract pages from llms.txt (must have 3 capture groups: title, fullUrl, path) */
+  linkPattern?: RegExp;
+  /** Use full URLs from llms.txt directly (default: true for generic pattern) */
+  useDirectUrls?: boolean;
 }

 export interface ScraperResult {
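The four new options compose: llmsPaths sets where the index is probed, tryDocsSubdomain widens the probe, linkPattern controls how entries are parsed, and useDirectUrls decides whether matched URLs are fetched verbatim. A minimal sketch of a caller combining them, assuming the package entry point exports these symbols under the package name; the site URL and paths are illustrative, not taken from the package:

import { MarkdownDocsScraper, type ScraperOptions } from "@ebowwa/markdown-docs-scraper";

const options: ScraperOptions = {
  baseUrl: "https://example.com",              // hypothetical docs site
  outputDir: "./docs/example",
  llmsPaths: ["/llms.txt", "/docs/llms.txt"],  // tried in order
  tryDocsSubdomain: true,                      // also probe docs.example.com
  // Must expose three capture groups: title, full URL, path.
  linkPattern: /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g,
  useDirectUrls: true,                         // fetch matched URLs as-is
};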
@@ -31,8 +43,73 @@ export interface ScraperResult {
   duration: number;
 }

+/** Discovered page with full URL */
+interface DiscoveredPage {
+  category: string;
+  page: string;
+  fullUrl: string; // The complete URL from llms.txt
+}
+
+/** Default pattern: matches /docs/en/ or /docs/ paths */
+const DEFAULT_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/(?:en\/)?([^)]+\.md))\)/g;
+
+/** Generic pattern: matches any .md links - captures full path after domain */
+const GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;
+
+// ============================================================================
+// UTILITY FUNCTIONS (Composable)
+// ============================================================================
+
+/** Extract title from markdown content */
+export function extractTitle(markdown: string): string {
+  const titleMatch = markdown.match(/^#\s+(.+)$/m);
+  return titleMatch ? titleMatch[1].trim() : "Untitled";
+}
+
+/** Parse page path into category and page name */
+export function parsePagePath(pagePath: string): { category: string; page: string } {
+  // Remove .md extension
+  const pageName = pagePath.replace(".md", "");
+
+  // Check if there's a category in the path
+  const pathParts = pageName.split("/");
+
+  if (pathParts.length === 1) {
+    return { category: "", page: pathParts[0] };
+  } else if (pathParts.length === 2) {
+    return { category: pathParts[0], page: pathParts[1] };
+  } else {
+    // Deeper path: join everything except last as category
+    return {
+      category: pathParts.slice(0, -1).join("/"),
+      page: pathParts[pathParts.length - 1],
+    };
+  }
+}
+
+/** Fetch markdown content from URL */
+export async function fetchMarkdown(url: string, userAgent = "@ebowwa/markdown-docs-scraper"): Promise<string | null> {
+  try {
+    const response = await fetch(url, {
+      headers: {
+        Accept: "text/markdown, text/plain",
+        "User-Agent": userAgent,
+      },
+    });
+
+    if (!response.ok) {
+      return null;
+    }
+
+    return await response.text();
+  } catch (error) {
+    console.error(`Error fetching ${url}:`, error);
+    return null;
+  }
+}
+
 // ============================================================================
-// SCRAPER
+// SCRAPER CLASS
 // ============================================================================

 export class MarkdownDocsScraper {
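Since extractTitle and parsePagePath are now exported, the path-splitting rules can be exercised in isolation. Expected results, read directly off the branches above (the input paths are made up for illustration):

import { extractTitle, parsePagePath } from "@ebowwa/markdown-docs-scraper";

parsePagePath("quickstart.md");    // { category: "", page: "quickstart" }
parsePagePath("api/orders.md");    // { category: "api", page: "orders" }
parsePagePath("api/v2/orders.md"); // { category: "api/v2", page: "orders" }

extractTitle("# Getting Started\n\nBody text"); // "Getting Started"
extractTitle("no heading here");                // "Untitled"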
@@ -46,85 +123,154 @@ export class MarkdownDocsScraper {
       outputDir: options.outputDir || "./docs",
       concurrency: options.concurrency || 5,
       onProgress: options.onProgress || (() => {}),
+      llmsPaths: options.llmsPaths || ["/llms.txt", "/docs/llms.txt"],
+      tryDocsSubdomain: options.tryDocsSubdomain ?? true,
+      linkPattern: options.linkPattern || GENERIC_LINK_PATTERN,
+      useDirectUrls: options.useDirectUrls ?? true,
     };
   }

   /**
-   *
+   * Build URL for a documentation page (fallback when no direct URL)
    */
-
-
-
-
-
-
-
-
+  buildUrl(category: string, page: string): string {
+    if (category) {
+      return `${this.options.baseUrl}${this.options.docsPath}/${category}/${page}.md`;
+    } else if (this.options.docsPath) {
+      return `${this.options.baseUrl}${this.options.docsPath}/${page}.md`;
+    } else {
+      return `${this.options.baseUrl}/${page}.md`;
+    }
+  }

-
-
-
+  /**
+   * Download a page using either direct URL or built URL
+   */
+  async downloadPage(pageInfo: DiscoveredPage): Promise<DocPage | null> {
+    // Use direct URL if available and useDirectUrls is enabled
+    const url = (this.options.useDirectUrls && pageInfo.fullUrl)
+      ? pageInfo.fullUrl
+      : this.buildUrl(pageInfo.category, pageInfo.page);

-
-      if (!contentType.includes("markdown") && !contentType.includes("text/plain")) {
-        // Try to parse anyway - some sites return incorrect content-type
-      }
+    const content = await fetchMarkdown(url);

-
-    } catch (error) {
-      console.error(`Error fetching ${url}:`, error);
+    if (!content) {
       return null;
     }
-  }

-
-
-
-
-
-
+    return {
+      url,
+      title: extractTitle(content),
+      content,
+      category: pageInfo.category,
+      pageName: pageInfo.page,
+    };
   }

   /**
-   *
+   * Generate possible llms.txt URLs to try
    */
-
-
-
-
-
-
+  private getLlmsUrls(): string[] {
+    const urls: string[] = [];
+    const baseUrl = this.options.baseUrl;
+
+    // Try configured/custom paths first
+    for (const path of this.options.llmsPaths) {
+      urls.push(`${baseUrl}${path}`);
+    }
+
+    // Also try docs/doc subdomain variants if enabled
+    if (this.options.tryDocsSubdomain) {
+      try {
+        const url = new URL(baseUrl);
+        const hostname = url.hostname;
+
+        // Skip if already on docs/doc subdomain
+        if (!hostname.startsWith("docs.") && !hostname.startsWith("doc.")) {
+          const docsDomain = hostname.replace(/^www\./, "");
+          urls.push(`${url.protocol}//docs.${docsDomain}/llms.txt`);
+          urls.push(`${url.protocol}//docs.${docsDomain}/docs/llms.txt`);
+        }
+      } catch {
+        // Invalid URL, skip subdomain variants
+      }
+    }
+
+    return urls;
   }

   /**
-   *
+   * Fetch llms.txt from multiple possible URLs with fallback
    */
-
-
-
-
-
+  private async fetchLlmsTxt(): Promise<{ content: string; url: string } | null> {
+    const urls = this.getLlmsUrls();
+    console.log(`DEBUG: Trying URLs: ${urls.join(", ")}`);
+
+    for (const llmsUrl of urls) {
+      try {
+        console.log(`DEBUG: Fetching ${llmsUrl}...`);
+        const response = await fetch(llmsUrl, {
+          headers: {
+            Accept: "text/plain",
+            "User-Agent": "@ebowwa/markdown-docs-scraper",
+          },
+        });
+
+        console.log(`DEBUG: Response status: ${response.status}`);
+        if (response.ok) {
+          const content = await response.text();
+          console.log(`Found llms.txt at ${llmsUrl}`);
+          return { content, url: llmsUrl };
+        }
+      } catch (error) {
+        console.log(`DEBUG: Error: ${error}`);
+        continue;
+      }
     }
+
+    return null;
   }

   /**
-   *
+   * Discover pages from llms.txt index
    */
-  async
-    const
-    const content = await this.fetchMarkdown(url);
+  async discoverPages(): Promise<DiscoveredPage[]> {
+    const pages: DiscoveredPage[] = [];

-
-
+    try {
+      const llmsResult = await this.fetchLlmsTxt();
+
+      if (!llmsResult) {
+        const attemptedUrls = this.getLlmsUrls();
+        console.warn(`Could not fetch llms.txt from any of: ${attemptedUrls.join(", ")}`);
+        return pages;
+      }
+
+      const { content } = llmsResult;
+
+      // Use provided pattern or default
+      const pattern = this.options.linkPattern;
+      const regex = new RegExp(pattern.source, pattern.flags);
+      let match;
+
+      // Debug: log pattern being used
+      console.log(`DEBUG: Using pattern: ${pattern.source}`);
+      console.log(`DEBUG: Content length: ${content.length}`);
+
+      while ((match = regex.exec(content)) !== null) {
+        const fullUrl = match[2]; // The full URL from llms.txt
+        const pagePath = match[3]; // The captured path group
+
+        const { category, page } = parsePagePath(pagePath);
+        pages.push({ category, page, fullUrl });
+      }
+
+      console.log(`Discovered ${pages.length} pages from llms.txt`);
+    } catch (error) {
+      console.error("Error discovering pages:", error);
     }

-    return
-      url,
-      title: this.extractTitle(content),
-      content,
-      category,
-      pageName: page, // Store the page name for saving
-    };
+    return pages;
   }

   /**
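The effect of the new discovery logic is easiest to see on a concrete host. For a hypothetical baseUrl of "https://www.example.com" with the defaults, getLlmsUrls() as written above yields four candidates, which fetchLlmsTxt() then tries in order:

const scraper = new MarkdownDocsScraper({
  baseUrl: "https://www.example.com", // illustrative host
  outputDir: "./docs",
});

// getLlmsUrls() is private; as written above it would produce:
// [
//   "https://www.example.com/llms.txt",
//   "https://www.example.com/docs/llms.txt",
//   "https://docs.example.com/llms.txt",      // "www." stripped, "docs." prefixed
//   "https://docs.example.com/docs/llms.txt",
// ]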
@@ -148,7 +294,7 @@ export class MarkdownDocsScraper {
     for (let i = 0; i < pages.length; i += this.options.concurrency) {
       const batch = pages.slice(i, i + this.options.concurrency);
       const results = await Promise.allSettled(
-        batch.map((page) => this.downloadPage(page
+        batch.map((page) => this.downloadPage(page))
       );

       results.forEach((result, index) => {
@@ -156,8 +302,11 @@ export class MarkdownDocsScraper {
         if (result.status === "fulfilled" && result.value) {
           downloaded.push(result.value);
         } else {
+          const url = (this.options.useDirectUrls && page.fullUrl)
+            ? page.fullUrl
+            : this.buildUrl(page.category, page.page);
           failed.push({
-            url
+            url,
             error: result.status === "rejected" ? (result.reason as string) : "Not found",
           });
         }
@@ -175,7 +324,7 @@ export class MarkdownDocsScraper {
   }

   /**
-   * Scrape all documentation pages
+   * Scrape all documentation pages (uses categories)
    */
   async scrape(): Promise<ScraperResult> {
     const startTime = Date.now();
@@ -191,7 +340,7 @@ export class MarkdownDocsScraper {
     for (let i = 0; i < pages.length; i += this.options.concurrency) {
       const batch = pages.slice(i, i + this.options.concurrency);
       const results = await Promise.allSettled(
-        batch.map((page) => this.downloadPage(page
+        batch.map((page) => this.downloadPage({ ...page, fullUrl: "" }))
       );

       results.forEach((result, index) => {
@@ -225,7 +374,6 @@ export class MarkdownDocsScraper {
     const path = await import("path");

     for (const page of pages) {
-      // Use pageName if available, otherwise extract from URL
       const nameToUse = page.pageName || page.url.split("/").pop()?.replace(".md", "") || "untitled";

       const dir = page.category
@@ -244,113 +392,17 @@ export class MarkdownDocsScraper {
   /**
    * Get list of pages to scrape based on categories
    */
-  private getPagesToScrape():
-    const pages:
+  private getPagesToScrape(): DiscoveredPage[] {
+    const pages: DiscoveredPage[] = [];

     for (const [category, pageList] of Object.entries(this.options.categories)) {
       for (const page of pageList) {
-        pages.push({ category, page });
+        pages.push({ category, page, fullUrl: "" });
       }
     }

     return pages;
   }
-
-  /**
-   * Discover pages from llms.txt index
-   */
-  async discoverPages(): Promise<Array<{ category: string; page: string }>> {
-    const pages: Array<{ category: string; page: string }> = [];
-
-    try {
-      const llmsUrl = `${this.options.baseUrl}/docs/llms.txt`;
-      const response = await fetch(llmsUrl, {
-        headers: {
-          Accept: "text/plain",
-          "User-Agent": "@ebowwa/markdown-docs-scraper",
-        },
-      });
-
-      if (!response.ok) {
-        console.warn(`Could not fetch llms.txt from ${llmsUrl}`);
-        return pages;
-      }
-
-      const content = await response.text();
-
-      // Parse markdown links in format: [title](https://code.claude.com/docs/en/page.md)
-      const linkRegex = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
-      let match;
-
-      while ((match = linkRegex.exec(content)) !== null) {
-        const url = match[2];
-        const pagePath = match[3]; // e.g., "agent-teams.md" or "category/page.md"
-
-        // Remove .md extension
-        const pageName = pagePath.replace(".md", "");
-
-        // Check if there's a category in the path
-        const pathParts = pageName.split("/");
-
-        if (pathParts.length === 1) {
-          // No category: just "page-name"
-          pages.push({ category: "", page: pathParts[0] });
-        } else if (pathParts.length === 2) {
-          // Has category: "category/page-name"
-          pages.push({ category: pathParts[0], page: pathParts[1] });
-        } else {
-          // Deeper path: join everything except last as category
-          const category = pathParts.slice(0, -1).join("/");
-          const page = pathParts[pathParts.length - 1];
-          pages.push({ category, page });
-        }
-      }
-
-      console.log(`Discovered ${pages.length} pages from llms.txt`);
-    } catch (error) {
-      console.error("Error discovering pages:", error);
-    }
-
-    return pages;
-  }
-
-  /**
-   * Discover additional pages by parsing the docs index (fallback)
-   */
-  async discoverPagesHtml(): Promise<string[]> {
-    const discovered: string[] = [];
-
-    try {
-      const indexUrl = `${this.options.baseUrl}${this.options.docsPath}`;
-      const response = await fetch(indexUrl, {
-        headers: {
-          Accept: "text/html",
-          "User-Agent": "@ebowwa/markdown-docs-scraper",
-        },
-      });
-
-      if (!response.ok) {
-        return discovered;
-      }
-
-      const html = await response.text();
-      const mdLinkRegex = /href="\/docs\/en\/([^"]+\.md)"/g;
-      let match;
-
-      while ((match = mdLinkRegex.exec(html)) !== null) {
-        const path = match[1];
-        if (!discovered.includes(path)) {
-          discovered.push(path);
-        }
-      }
-
-      console.log(`Discovered ${discovered.length} additional pages from HTML`);
-    } catch (error) {
-      console.error("Error discovering pages from HTML:", error);
-    }
-
-    return discovered;
-  }
 }

 // ============================================================================
@@ -375,6 +427,72 @@ export async function scrapeMarkdownDocs(
   return result;
 }

+// ============================================================================
+// PRESET CONFIGURATIONS (Composable)
+// ============================================================================
+
+/** Pattern for Claude Code docs: /docs/en/page.md */
+export const CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
+
+/** Pattern for generic docs: any domain/path.md */
+export const GENERIC_PATTERN = GENERIC_LINK_PATTERN;
+
+/** Create scraper options for Claude Code docs */
+export function claudeCodeOptions(outputDir: string): ScraperOptions {
+  return {
+    baseUrl: "https://code.claude.com",
+    docsPath: "/docs/en",
+    llmsPaths: ["/docs/llms.txt"],
+    linkPattern: CLAUDE_CODE_PATTERN,
+    outputDir,
+    concurrency: 10,
+    tryDocsSubdomain: false,
+    useDirectUrls: false, // Claude Code can use built URLs
+  };
+}
+
+/** Create scraper options for Polymarket docs */
+export function polymarketOptions(outputDir: string): ScraperOptions {
+  return {
+    baseUrl: "https://docs.polymarket.com",
+    docsPath: "",
+    llmsPaths: ["/llms.txt"],
+    linkPattern: GENERIC_PATTERN,
+    outputDir,
+    concurrency: 10,
+    tryDocsSubdomain: false,
+    useDirectUrls: true, // Polymarket needs direct URLs
+  };
+}
+
+// ============================================================================
+// SCRAPERS MODULE
+// ============================================================================
+
+/**
+ * Re-export scrapers module for composable scraper architecture.
+ * This provides a registry-based system for different scraper implementations.
+ */
+export {
+  // Types
+  type SourceType,
+  type SourceConfig,
+  type Scraper,
+  type ScrapeResult as ScraperModuleResult,
+  type DownloadResult,
+
+  // Scrapers
+  llmsTxtScraper,
+  githubRawScraper,
+  CLAUDE_CODE_PATTERN as SCRAPER_CLAUDE_CODE_PATTERN,
+  GENERIC_PATTERN as SCRAPER_GENERIC_PATTERN,
+
+  // Registry
+  registerScraper,
+  getScraper,
+  scrapeSource,
+} from "./scrapers/index";
+
 // ============================================================================
 // EXPORTS
 // ============================================================================
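A minimal end-to-end sketch using the presets, assuming the published entry point exposes these symbols under the package name (DocPage's exact shape is defined in an unchanged part of the file not shown in this diff):

import { MarkdownDocsScraper, polymarketOptions } from "@ebowwa/markdown-docs-scraper";

async function main() {
  const scraper = new MarkdownDocsScraper(polymarketOptions("./docs/polymarket"));

  // Discover pages from llms.txt, then fetch the first one directly.
  const pages = await scraper.discoverPages();
  console.log(`Discovered ${pages.length} pages`);

  if (pages[0]) {
    const page = await scraper.downloadPage(pages[0]);
    console.log(page?.title, "<-", page?.url);
  }
}

main().catch(console.error);

Note the split visible in the presets: claudeCodeOptions sets useDirectUrls: false, so URLs are rebuilt from baseUrl/docsPath, while polymarketOptions keeps the full URLs found in llms.txt.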
package/src/scrapers/github-raw.ts
ADDED

@@ -0,0 +1,154 @@
+/**
+ * GitHub Raw Scraper
+ *
+ * Downloads markdown files directly from GitHub repositories via raw content URLs.
+ * Uses GitHub API to list files, then fetches each from raw.githubusercontent.com
+ */
+
+import type { Scraper, SourceConfig, ScrapeResult, DownloadResult } from "./types";
+
+// ============================================================================
+// GITHUB API TYPES
+// ============================================================================
+
+interface GitHubContent {
+  name: string;
+  path: string;
+  download_url: string;
+  type: string;
+}
+
+// ============================================================================
+// GITHUB RAW SCRAPER
+// ============================================================================
+
+export const githubRawScraper: Scraper = {
+  type: "github-raw",
+
+  async scrape(config: SourceConfig): Promise<ScrapeResult> {
+    const startTime = Date.now();
+    const downloaded: DownloadResult[] = [];
+    const failed: Array<{ url: string; error: string }> = [];
+
+    if (!config.github?.repo) {
+      throw new Error(`GitHub source "${config.name}" missing github.repo config`);
+    }
+
+    // Get list of markdown files from GitHub API
+    const files = await fetchGitHubMarkdownFiles(
+      config.github.repo,
+      config.docsPath.replace(/^\//, "")
+    );
+
+    // Download each file
+    for (const file of files) {
+      const content = await fetchGitHubRawContent(config.github.repo, file.path);
+
+      if (content) {
+        downloaded.push({
+          success: true,
+          path: file.name,
+          title: extractTitle(content) || file.name.replace(".md", ""),
+        });
+
+        // Save the file
+        await saveFile(config.outputDir, file.name, content);
+      } else {
+        failed.push({
+          url: `https://raw.githubusercontent.com/${config.github.repo}/main/${file.path}`,
+          error: "Failed to fetch content",
+        });
+      }
+    }
+
+    return {
+      downloaded,
+      failed,
+      duration: Date.now() - startTime,
+    };
+  },
+};
+
+// ============================================================================
+// GITHUB API FUNCTIONS
+// ============================================================================
+
+/**
+ * Fetch list of markdown files from GitHub repo directory
+ */
+async function fetchGitHubMarkdownFiles(
+  repo: string,
+  path: string
+): Promise<GitHubContent[]> {
+  const url = `https://api.github.com/repos/${repo}/contents/${path}`;
+
+  const response = await fetch(url, {
+    headers: {
+      Accept: "application/vnd.github.v3+json",
+      "User-Agent": "@ebowwa/markdown-docs-scraper",
+    },
+  });
+
+  if (!response.ok) {
+    throw new Error(`GitHub API error: ${response.status} ${response.statusText}`);
+  }
+
+  const contents: GitHubContent[] = await response.json();
+
+  // Filter for markdown files only
+  return contents.filter(
+    (item) => item.type === "file" && item.name.endsWith(".md")
+  );
+}
+
+/**
+ * Download markdown content from GitHub raw URL
+ */
+async function fetchGitHubRawContent(
+  repo: string,
+  path: string
+): Promise<string | null> {
+  const url = `https://raw.githubusercontent.com/${repo}/main/${path}`;
+
+  try {
+    const response = await fetch(url, {
+      headers: {
+        Accept: "text/plain",
+        "User-Agent": "@ebowwa/markdown-docs-scraper",
+      },
+    });
+
+    if (!response.ok) {
+      return null;
+    }
+
+    return await response.text();
+  } catch (error) {
+    console.error(`Error fetching ${url}:`, error);
+    return null;
+  }
+}
+
+/**
+ * Extract title from markdown content
+ */
+function extractTitle(markdown: string): string | null {
+  const titleMatch = markdown.match(/^#\s+(.+)$/m);
+  return titleMatch ? titleMatch[1].trim() : null;
+}
+
+/**
+ * Save file to disk
+ */
+async function saveFile(
+  outputDir: string,
+  filename: string,
+  content: string
+): Promise<void> {
+  const fs = await import("fs/promises");
+  const path = await import("path");
+
+  const outputPath = path.join(outputDir, filename);
+  await fs.mkdir(path.dirname(outputPath), { recursive: true });
+  await fs.writeFile(outputPath, content, "utf-8");
+}