@ebowwa/markdown-docs-scraper 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +82 -0
- package/dist/cli.d.ts +6 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +1 -0
- package/dist/index.d.ts +128 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +219 -26
- package/dist/scrapers/github-raw.d.ts +9 -0
- package/dist/scrapers/github-raw.d.ts.map +1 -0
- package/dist/scrapers/index.d.ts +11 -0
- package/dist/scrapers/index.d.ts.map +1 -0
- package/dist/scrapers/index.js +448 -0
- package/dist/scrapers/llms-txt.d.ts +13 -0
- package/dist/scrapers/llms-txt.d.ts.map +1 -0
- package/dist/scrapers/registry.d.ts +23 -0
- package/dist/scrapers/registry.d.ts.map +1 -0
- package/dist/scrapers/types.d.ts +57 -0
- package/dist/scrapers/types.d.ts.map +1 -0
- package/package.json +10 -2
- package/src/cli.js +160 -0
- package/src/cli.ts +2 -0
- package/src/index.js +487 -0
- package/src/index.ts +115 -28
- package/src/scrapers/github-raw.ts +154 -0
- package/src/scrapers/index.ts +16 -0
- package/src/scrapers/llms-txt.ts +101 -0
- package/src/scrapers/registry.ts +55 -0
- package/src/scrapers/types.ts +79 -0
package/src/index.ts
CHANGED
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
* - Configurable llms.txt paths with fallbacks
|
|
6
6
|
* - Custom URL patterns for different doc sites
|
|
7
7
|
* - Works with any markdown documentation site
|
|
8
|
+
* - Uses full URLs from llms.txt directly
|
|
8
9
|
*/
|
|
9
10
|
|
|
10
11
|
// ============================================================================
|
|
@@ -32,6 +33,8 @@ export interface ScraperOptions {
|
|
|
32
33
|
tryDocsSubdomain?: boolean;
|
|
33
34
|
/** Custom regex pattern to extract pages from llms.txt (must have 3 capture groups: title, fullUrl, path) */
|
|
34
35
|
linkPattern?: RegExp;
|
|
36
|
+
/** Fetch each page from the full URL listed in llms.txt instead of rebuilding it with buildUrl (default: true) */
|
|
37
|
+
useDirectUrls?: boolean;
|
|
35
38
|
}
|
|
36
39
|
|
|
37
40
|
export interface ScraperResult {
|
|
@@ -40,11 +43,18 @@ export interface ScraperResult {
|
|
|
40
43
|
duration: number;
|
|
41
44
|
}
|
|
42
45
|
|
|
46
|
+
/**
 * A single documentation page queued for download.
 *
 * Entries come either from parsing llms.txt, where `fullUrl` holds the link
 * target found in the index, or from the configured category lists, where
 * `fullUrl` is left empty.
 */
interface DiscoveredPage {
  /** Category of the page; handed to `buildUrl` when no direct URL is used. */
  category: string;
  /** Page name; stored as `pageName` on the downloaded page. */
  page: string;
  /** Complete URL taken from llms.txt, or "" when the URL has to be built instead. */
  fullUrl: string;
}
|
|
52
|
+
|
|
43
53
|
/** Default pattern: matches /docs/en/ or /docs/ paths */
|
|
44
54
|
const DEFAULT_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/(?:en\/)?([^)]+\.md))\)/g;
|
|
45
55
|
|
|
46
|
-
/** Generic pattern: matches any .md links
|
|
47
|
-
const GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[
|
|
56
|
+
/** Generic pattern: matches any .md links - captures full path after domain */
|
|
57
|
+
const GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;
|
|
48
58
|
|
|
49
59
|
// ============================================================================
|
|
50
60
|
// UTILITY FUNCTIONS (Composable)
|
|
@@ -116,11 +126,12 @@ export class MarkdownDocsScraper {
|
|
|
116
126
|
llmsPaths: options.llmsPaths || ["/llms.txt", "/docs/llms.txt"],
|
|
117
127
|
tryDocsSubdomain: options.tryDocsSubdomain ?? true,
|
|
118
128
|
linkPattern: options.linkPattern || GENERIC_LINK_PATTERN,
|
|
129
|
+
useDirectUrls: options.useDirectUrls ?? true,
|
|
119
130
|
};
|
|
120
131
|
}
|
|
121
132
|
|
|
122
133
|
/**
|
|
123
|
-
* Build URL for a documentation page
|
|
134
|
+
* Build URL for a documentation page (fallback when no direct URL)
|
|
124
135
|
*/
|
|
125
136
|
buildUrl(category: string, page: string): string {
|
|
126
137
|
if (category) {
|
|
@@ -128,16 +139,19 @@ export class MarkdownDocsScraper {
|
|
|
128
139
|
} else if (this.options.docsPath) {
|
|
129
140
|
return `${this.options.baseUrl}${this.options.docsPath}/${page}.md`;
|
|
130
141
|
} else {
|
|
131
|
-
// No docsPath (like Polymarket) - direct path
|
|
132
142
|
return `${this.options.baseUrl}/${page}.md`;
|
|
133
143
|
}
|
|
134
144
|
}
|
|
135
145
|
|
|
136
146
|
/**
|
|
137
|
-
* Download a
|
|
147
|
+
* Download a page using either direct URL or built URL
|
|
138
148
|
*/
|
|
139
|
-
async downloadPage(
|
|
140
|
-
|
|
149
|
+
async downloadPage(pageInfo: DiscoveredPage): Promise<DocPage | null> {
|
|
150
|
+
// Use direct URL if available and useDirectUrls is enabled
|
|
151
|
+
const url = (this.options.useDirectUrls && pageInfo.fullUrl)
|
|
152
|
+
? pageInfo.fullUrl
|
|
153
|
+
: this.buildUrl(pageInfo.category, pageInfo.page);
|
|
154
|
+
|
|
141
155
|
const content = await fetchMarkdown(url);
|
|
142
156
|
|
|
143
157
|
if (!content) {
|
|
@@ -148,8 +162,8 @@ export class MarkdownDocsScraper {
|
|
|
148
162
|
url,
|
|
149
163
|
title: extractTitle(content),
|
|
150
164
|
content,
|
|
151
|
-
category,
|
|
152
|
-
pageName: page,
|
|
165
|
+
category: pageInfo.category,
|
|
166
|
+
pageName: pageInfo.page,
|
|
153
167
|
};
|
|
154
168
|
}
|
|
155
169
|
|
|
@@ -173,7 +187,6 @@ export class MarkdownDocsScraper {
|
|
|
173
187
|
|
|
174
188
|
// Skip if already on docs/doc subdomain
|
|
175
189
|
if (!hostname.startsWith("docs.") && !hostname.startsWith("doc.")) {
|
|
176
|
-
// Try docs.{domain}
|
|
177
190
|
const docsDomain = hostname.replace(/^www\./, "");
|
|
178
191
|
urls.push(`${url.protocol}//docs.${docsDomain}/llms.txt`);
|
|
179
192
|
urls.push(`${url.protocol}//docs.${docsDomain}/docs/llms.txt`);
|
|
@@ -191,9 +204,11 @@ export class MarkdownDocsScraper {
|
|
|
191
204
|
*/
|
|
192
205
|
private async fetchLlmsTxt(): Promise<{ content: string; url: string } | null> {
|
|
193
206
|
const urls = this.getLlmsUrls();
|
|
207
|
+
console.log(`DEBUG: Trying URLs: ${urls.join(", ")}`);
|
|
194
208
|
|
|
195
209
|
for (const llmsUrl of urls) {
|
|
196
210
|
try {
|
|
211
|
+
console.log(`DEBUG: Fetching ${llmsUrl}...`);
|
|
197
212
|
const response = await fetch(llmsUrl, {
|
|
198
213
|
headers: {
|
|
199
214
|
Accept: "text/plain",
|
|
@@ -201,13 +216,14 @@ export class MarkdownDocsScraper {
|
|
|
201
216
|
},
|
|
202
217
|
});
|
|
203
218
|
|
|
219
|
+
console.log(`DEBUG: Response status: ${response.status}`);
|
|
204
220
|
if (response.ok) {
|
|
205
221
|
const content = await response.text();
|
|
206
222
|
console.log(`Found llms.txt at ${llmsUrl}`);
|
|
207
223
|
return { content, url: llmsUrl };
|
|
208
224
|
}
|
|
209
225
|
} catch (error) {
|
|
210
|
-
|
|
226
|
+
console.log(`DEBUG: Error: ${error}`);
|
|
211
227
|
continue;
|
|
212
228
|
}
|
|
213
229
|
}
|
|
@@ -218,8 +234,8 @@ export class MarkdownDocsScraper {
|
|
|
218
234
|
/**
|
|
219
235
|
* Discover pages from llms.txt index
|
|
220
236
|
*/
|
|
221
|
-
async discoverPages(): Promise<
|
|
222
|
-
const pages:
|
|
237
|
+
async discoverPages(): Promise<DiscoveredPage[]> {
|
|
238
|
+
const pages: DiscoveredPage[] = [];
|
|
223
239
|
|
|
224
240
|
try {
|
|
225
241
|
const llmsResult = await this.fetchLlmsTxt();
|
|
@@ -233,15 +249,20 @@ export class MarkdownDocsScraper {
|
|
|
233
249
|
const { content } = llmsResult;
|
|
234
250
|
|
|
235
251
|
// Use provided pattern or default
|
|
236
|
-
const
|
|
252
|
+
const pattern = this.options.linkPattern;
|
|
253
|
+
const regex = new RegExp(pattern.source, pattern.flags);
|
|
237
254
|
let match;
|
|
238
255
|
|
|
256
|
+
// Debug: log pattern being used
|
|
257
|
+
console.log(`DEBUG: Using pattern: ${pattern.source}`);
|
|
258
|
+
console.log(`DEBUG: Content length: ${content.length}`);
|
|
259
|
+
|
|
239
260
|
while ((match = regex.exec(content)) !== null) {
|
|
240
|
-
const
|
|
261
|
+
const fullUrl = match[2]; // The full URL from llms.txt
|
|
241
262
|
const pagePath = match[3]; // The captured path group
|
|
242
263
|
|
|
243
264
|
const { category, page } = parsePagePath(pagePath);
|
|
244
|
-
pages.push({ category, page });
|
|
265
|
+
pages.push({ category, page, fullUrl });
|
|
245
266
|
}
|
|
246
267
|
|
|
247
268
|
console.log(`Discovered ${pages.length} pages from llms.txt`);
|
|
@@ -273,7 +294,7 @@ export class MarkdownDocsScraper {
|
|
|
273
294
|
for (let i = 0; i < pages.length; i += this.options.concurrency) {
|
|
274
295
|
const batch = pages.slice(i, i + this.options.concurrency);
|
|
275
296
|
const results = await Promise.allSettled(
|
|
276
|
-
batch.map((page) => this.downloadPage(page
|
|
297
|
+
batch.map((page) => this.downloadPage(page))
|
|
277
298
|
);
|
|
278
299
|
|
|
279
300
|
results.forEach((result, index) => {
|
|
@@ -281,8 +302,11 @@ export class MarkdownDocsScraper {
|
|
|
281
302
|
if (result.status === "fulfilled" && result.value) {
|
|
282
303
|
downloaded.push(result.value);
|
|
283
304
|
} else {
|
|
305
|
+
const url = (this.options.useDirectUrls && page.fullUrl)
|
|
306
|
+
? page.fullUrl
|
|
307
|
+
: this.buildUrl(page.category, page.page);
|
|
284
308
|
failed.push({
|
|
285
|
-
url
|
|
309
|
+
url,
|
|
286
310
|
error: result.status === "rejected" ? (result.reason as string) : "Not found",
|
|
287
311
|
});
|
|
288
312
|
}
|
|
@@ -316,7 +340,7 @@ export class MarkdownDocsScraper {
|
|
|
316
340
|
for (let i = 0; i < pages.length; i += this.options.concurrency) {
|
|
317
341
|
const batch = pages.slice(i, i + this.options.concurrency);
|
|
318
342
|
const results = await Promise.allSettled(
|
|
319
|
-
batch.map((page) => this.downloadPage(page
|
|
343
|
+
batch.map((page) => this.downloadPage({ ...page, fullUrl: "" }))
|
|
320
344
|
);
|
|
321
345
|
|
|
322
346
|
results.forEach((result, index) => {
|
|
@@ -343,14 +367,25 @@ export class MarkdownDocsScraper {
|
|
|
343
367
|
}
|
|
344
368
|
|
|
345
369
|
/**
|
|
346
|
-
*
|
|
370
|
+
* Extract body content from a file (strips header comment)
|
|
371
|
+
*/
|
|
372
|
+
private extractBody(content: string): string {
  // Header block written by savePages: an HTML comment carrying the source
  // URL and the download timestamp, followed by one blank line.
  const headerPattern = /^<!--\nSource: [^\n]+\nDownloaded: [^\n]+\n-->\n\n/;
  const header = headerPattern.exec(content);

  // No leading header: hand the text back untouched.
  if (header === null) {
    return content;
  }

  // The pattern is anchored at the start, so the body is everything after the match.
  return content.slice(header[0].length);
}
|
|
377
|
+
|
|
378
|
+
/**
|
|
379
|
+
* Save scraped pages to disk (only writes if content changed)
|
|
347
380
|
*/
|
|
348
|
-
async savePages(pages: DocPage[]): Promise<
|
|
381
|
+
async savePages(pages: DocPage[]): Promise<{ updated: number; skipped: number }> {
|
|
349
382
|
const fs = await import("fs/promises");
|
|
350
383
|
const path = await import("path");
|
|
351
384
|
|
|
385
|
+
let updated = 0;
|
|
386
|
+
let skipped = 0;
|
|
387
|
+
|
|
352
388
|
for (const page of pages) {
|
|
353
|
-
// Use pageName if available, otherwise extract from URL
|
|
354
389
|
const nameToUse = page.pageName || page.url.split("/").pop()?.replace(".md", "") || "untitled";
|
|
355
390
|
|
|
356
391
|
const dir = page.category
|
|
@@ -361,20 +396,38 @@ export class MarkdownDocsScraper {
|
|
|
361
396
|
|
|
362
397
|
const filepath = path.join(dir, `${nameToUse}.md`);
|
|
363
398
|
|
|
399
|
+
// Check if file exists and compare content
|
|
400
|
+
try {
|
|
401
|
+
const existingContent = await fs.readFile(filepath, "utf-8");
|
|
402
|
+
const existingBody = this.extractBody(existingContent);
|
|
403
|
+
|
|
404
|
+
// Skip if content unchanged
|
|
405
|
+
if (existingBody === page.content) {
|
|
406
|
+
skipped++;
|
|
407
|
+
continue;
|
|
408
|
+
}
|
|
409
|
+
} catch {
|
|
410
|
+
// File doesn't exist, will create it
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
// Content changed or new file - write it
|
|
364
414
|
const header = `<!--\nSource: ${page.url}\nDownloaded: ${new Date().toISOString()}\n-->\n\n`;
|
|
365
415
|
await fs.writeFile(filepath, header + page.content, "utf-8");
|
|
416
|
+
updated++;
|
|
366
417
|
}
|
|
418
|
+
|
|
419
|
+
return { updated, skipped };
|
|
367
420
|
}
|
|
368
421
|
|
|
369
422
|
/**
|
|
370
423
|
* Get list of pages to scrape based on categories
|
|
371
424
|
*/
|
|
372
|
-
private getPagesToScrape():
|
|
373
|
-
const pages:
|
|
425
|
+
private getPagesToScrape(): DiscoveredPage[] {
|
|
426
|
+
const pages: DiscoveredPage[] = [];
|
|
374
427
|
|
|
375
428
|
for (const [category, pageList] of Object.entries(this.options.categories)) {
|
|
376
429
|
for (const page of pageList) {
|
|
377
|
-
pages.push({ category, page });
|
|
430
|
+
pages.push({ category, page, fullUrl: "" });
|
|
378
431
|
}
|
|
379
432
|
}
|
|
380
433
|
|
|
@@ -391,17 +444,21 @@ export class MarkdownDocsScraper {
|
|
|
391
444
|
*/
|
|
392
445
|
export async function scrapeMarkdownDocs(
|
|
393
446
|
options: ScraperOptions & { useLlms?: boolean }
|
|
394
|
-
): Promise<ScraperResult> {
|
|
447
|
+
): Promise<ScraperResult & { saveStats?: { updated: number; skipped: number } }> {
|
|
395
448
|
const scraper = new MarkdownDocsScraper(options);
|
|
396
449
|
const result = options.useLlms
|
|
397
450
|
? await scraper.scrapeFromLlms()
|
|
398
451
|
: await scraper.scrape();
|
|
399
452
|
|
|
453
|
+
let saveStats;
|
|
400
454
|
if (options.outputDir) {
|
|
401
|
-
await scraper.savePages(result.downloaded);
|
|
455
|
+
saveStats = await scraper.savePages(result.downloaded);
|
|
456
|
+
if (saveStats.updated > 0 || saveStats.skipped > 0) {
|
|
457
|
+
console.log(` Saved: ${saveStats.updated} updated, ${saveStats.skipped} unchanged`);
|
|
458
|
+
}
|
|
402
459
|
}
|
|
403
460
|
|
|
404
|
-
return result;
|
|
461
|
+
return { ...result, saveStats };
|
|
405
462
|
}
|
|
406
463
|
|
|
407
464
|
// ============================================================================
|
|
@@ -424,6 +481,7 @@ export function claudeCodeOptions(outputDir: string): ScraperOptions {
|
|
|
424
481
|
outputDir,
|
|
425
482
|
concurrency: 10,
|
|
426
483
|
tryDocsSubdomain: false,
|
|
484
|
+
useDirectUrls: false, // Claude Code can use built URLs
|
|
427
485
|
};
|
|
428
486
|
}
|
|
429
487
|
|
|
@@ -437,9 +495,38 @@ export function polymarketOptions(outputDir: string): ScraperOptions {
|
|
|
437
495
|
outputDir,
|
|
438
496
|
concurrency: 10,
|
|
439
497
|
tryDocsSubdomain: false,
|
|
498
|
+
useDirectUrls: true, // Polymarket needs direct URLs
|
|
440
499
|
};
|
|
441
500
|
}
|
|
442
501
|
|
|
502
|
+
// ============================================================================
|
|
503
|
+
// SCRAPERS MODULE
|
|
504
|
+
// ============================================================================
|
|
505
|
+
|
|
506
|
+
/**
|
|
507
|
+
* Re-export scrapers module for composable scraper architecture.
|
|
508
|
+
* This provides a registry-based system for different scraper implementations.
|
|
509
|
+
*/
|
|
510
|
+
export {
|
|
511
|
+
// Types
|
|
512
|
+
type SourceType,
|
|
513
|
+
type SourceConfig,
|
|
514
|
+
type Scraper,
|
|
515
|
+
type ScrapeResult as ScraperModuleResult,
|
|
516
|
+
type DownloadResult,
|
|
517
|
+
|
|
518
|
+
// Scrapers
|
|
519
|
+
llmsTxtScraper,
|
|
520
|
+
githubRawScraper,
|
|
521
|
+
CLAUDE_CODE_PATTERN as SCRAPER_CLAUDE_CODE_PATTERN,
|
|
522
|
+
GENERIC_PATTERN as SCRAPER_GENERIC_PATTERN,
|
|
523
|
+
|
|
524
|
+
// Registry
|
|
525
|
+
registerScraper,
|
|
526
|
+
getScraper,
|
|
527
|
+
scrapeSource,
|
|
528
|
+
} from "./scrapers/index";
|
|
529
|
+
|
|
443
530
|
// ============================================================================
|
|
444
531
|
// EXPORTS
|
|
445
532
|
// ============================================================================
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GitHub Raw Scraper
|
|
3
|
+
*
|
|
4
|
+
* Downloads markdown files directly from GitHub repositories via raw content URLs.
|
|
5
|
+
* Uses GitHub API to list files, then fetches each from raw.githubusercontent.com
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { Scraper, SourceConfig, ScrapeResult, DownloadResult } from "./types";
|
|
9
|
+
|
|
10
|
+
// ============================================================================
|
|
11
|
+
// GITHUB API TYPES
|
|
12
|
+
// ============================================================================
|
|
13
|
+
|
|
14
|
+
/**
 * One entry of a GitHub "repository contents" API response.
 * Only the fields this module reads or may need are declared.
 */
interface GitHubContent {
  /** File or directory name without any parent path. */
  name: string;
  /** Path of the entry relative to the repository root. */
  path: string;
  /** Direct download link; the API reports null for entries that have none (e.g. directories). */
  download_url: string | null;
  /** Entry kind as reported by the API; this module keeps only "file" entries. */
  type: string;
}
|
|
20
|
+
|
|
21
|
+
// ============================================================================
|
|
22
|
+
// GITHUB RAW SCRAPER
|
|
23
|
+
// ============================================================================
|
|
24
|
+
|
|
25
|
+
/**
 * Scraper for markdown files that live in a GitHub repository.
 *
 * Lists the `.md` files in `config.docsPath` through the GitHub API, downloads
 * each one from raw.githubusercontent.com, and writes it into
 * `config.outputDir` under its bare file name.
 */
export const githubRawScraper: Scraper = {
  type: "github-raw",

  /**
   * Download every markdown file of the configured repository directory.
   *
   * @param config - Source description; `config.github.repo` is required
   * @returns Downloaded and failed files plus the elapsed time in milliseconds
   * @throws Error when `config.github.repo` is missing; errors from listing
   *   the directory or writing a file are not caught here and propagate
   */
  async scrape(config: SourceConfig): Promise<ScrapeResult> {
    const startTime = Date.now();
    const downloaded: DownloadResult[] = [];
    const failed: Array<{ url: string; error: string }> = [];

    if (!config.github?.repo) {
      throw new Error(`GitHub source "${config.name}" missing github.repo config`);
    }
    const repo = config.github.repo;

    // Get list of markdown files from GitHub API (path is sent without its leading slash)
    const files = await fetchGitHubMarkdownFiles(
      repo,
      config.docsPath.replace(/^\//, "")
    );

    // Download each file, one at a time, in listing order
    for (const file of files) {
      const content = await fetchGitHubRawContent(repo, file.path);

      if (content) {
        downloaded.push({
          success: true,
          path: file.name,
          // Fall back to the file name minus its trailing ".md" extension only;
          // a plain string replace would drop the first ".md" anywhere in the name.
          title: extractTitle(content) || file.name.replace(/\.md$/, ""),
        });

        // Save the file
        await saveFile(config.outputDir, file.name, content);
      } else {
        // Empty or missing content is reported with the raw URL that was requested
        failed.push({
          url: `https://raw.githubusercontent.com/${repo}/main/${file.path}`,
          error: "Failed to fetch content",
        });
      }
    }

    return {
      downloaded,
      failed,
      duration: Date.now() - startTime,
    };
  },
};
|
|
71
|
+
|
|
72
|
+
// ============================================================================
|
|
73
|
+
// GITHUB API FUNCTIONS
|
|
74
|
+
// ============================================================================
|
|
75
|
+
|
|
76
|
+
/**
 * List the markdown files found at one path of a GitHub repository.
 *
 * Calls the GitHub contents API and keeps only entries whose `type` is
 * "file" and whose name ends in ".md". Subdirectories are not descended into.
 *
 * @param repo - Repository as it appears in the API URL after `/repos/`
 * @param path - Path inside the repository (the caller passes it without a leading slash)
 * @returns The matching content entries; empty when nothing matches
 * @throws Error when the API responds with a non-OK status
 */
async function fetchGitHubMarkdownFiles(
  repo: string,
  path: string
): Promise<GitHubContent[]> {
  const url = `https://api.github.com/repos/${repo}/contents/${path}`;

  const response = await fetch(url, {
    headers: {
      Accept: "application/vnd.github.v3+json",
      "User-Agent": "@ebowwa/markdown-docs-scraper",
    },
  });

  if (!response.ok) {
    throw new Error(`GitHub API error: ${response.status} ${response.statusText}`);
  }

  // The contents endpoint answers with an array for a directory but with a
  // single object when `path` names a file, so normalize before filtering
  // instead of calling `.filter` on whatever came back.
  const payload: unknown = await response.json();
  const entries: unknown[] = Array.isArray(payload) ? payload : [payload];

  // Keep markdown files only; anything that is not an entry-shaped object is dropped.
  const isMarkdownFile = (item: unknown): item is GitHubContent => {
    if (typeof item !== "object" || item === null) {
      return false;
    }
    const entry = item as Partial<GitHubContent>;
    return (
      entry.type === "file" &&
      typeof entry.path === "string" &&
      typeof entry.name === "string" &&
      entry.name.endsWith(".md")
    );
  };

  return entries.filter(isMarkdownFile);
}
|
|
103
|
+
|
|
104
|
+
/**
 * Download one file from raw.githubusercontent.com.
 *
 * @param repo - Repository as it appears in the raw URL
 * @param path - File path inside the repository; assumed not to be percent-encoded yet
 * @param ref - Branch, tag or commit to read from; defaults to "main", the
 *   value that used to be hard-coded
 * @returns The response body as text, or null when the response is not OK or
 *   the request throws (the error is logged)
 */
async function fetchGitHubRawContent(
  repo: string,
  path: string,
  ref: string = "main"
): Promise<string | null> {
  // Encode each segment separately: "/" separators survive, while "#", "?"
  // and similar characters in a file name can no longer be parsed as a URL
  // fragment or query string and silently cut the path short.
  const encodedPath = path.split("/").map(encodeURIComponent).join("/");
  const url = `https://raw.githubusercontent.com/${repo}/${ref}/${encodedPath}`;

  try {
    const response = await fetch(url, {
      headers: {
        Accept: "text/plain",
        "User-Agent": "@ebowwa/markdown-docs-scraper",
      },
    });

    if (!response.ok) {
      return null;
    }

    return await response.text();
  } catch (error) {
    // Best-effort download: report the problem and let the caller record a failure
    console.error(`Error fetching ${url}:`, error);
    return null;
  }
}
|
|
131
|
+
|
|
132
|
+
/**
 * Find the first level-one heading ("# ...") in a markdown document.
 *
 * @param markdown - Markdown source text
 * @returns The heading text with surrounding whitespace trimmed, or null when
 *   no such heading is found
 */
function extractTitle(markdown: string): string | null {
  const heading = /^#\s+(.+)$/m.exec(markdown);

  if (heading === null) {
    return null;
  }

  return heading[1].trim();
}
|
|
139
|
+
|
|
140
|
+
/**
 * Write text content to `<outputDir>/<filename>` as UTF-8, creating any
 * missing parent directories first.
 *
 * @param outputDir - Directory the file is placed under
 * @param filename - File name (may contain sub-directories) relative to `outputDir`
 * @param content - Text to write
 */
async function saveFile(
  outputDir: string,
  filename: string,
  content: string
): Promise<void> {
  const { mkdir, writeFile } = await import("fs/promises");
  const { dirname, join } = await import("path");

  const target = join(outputDir, filename);
  const parentDir = dirname(target);

  await mkdir(parentDir, { recursive: true });
  await writeFile(target, content, "utf-8");
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Scrapers Module
|
|
3
|
+
*
|
|
4
|
+
* Composable scraper architecture for multiple documentation source types.
|
|
5
|
+
* This module provides a registry-based system for different scraper implementations.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// Types
|
|
9
|
+
export type { SourceType, SourceConfig, Scraper, ScrapeResult, DownloadResult } from "./types";
|
|
10
|
+
|
|
11
|
+
// Scrapers
|
|
12
|
+
export { llmsTxtScraper, CLAUDE_CODE_PATTERN, GENERIC_PATTERN } from "./llms-txt";
|
|
13
|
+
export { githubRawScraper } from "./github-raw";
|
|
14
|
+
|
|
15
|
+
// Registry
|
|
16
|
+
export { registerScraper, getScraper, scrapeSource } from "./registry";
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLMS-TXT Scraper
|
|
3
|
+
*
|
|
4
|
+
* Scrapes documentation sites that provide llms.txt index files.
|
|
5
|
+
* Uses the core MarkdownDocsScraper under the hood.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { scrapeMarkdownDocs, type DocPage } from "../index";
|
|
9
|
+
import type { Scraper, SourceConfig, ScrapeResult, DownloadResult } from "./types";
|
|
10
|
+
|
|
11
|
+
// ============================================================================
|
|
12
|
+
// URL PATTERNS
|
|
13
|
+
// ============================================================================
|
|
14
|
+
|
|
15
|
+
/**
 * Link pattern for Claude Code docs: markdown links whose URL contains
 * `/docs/en/` and ends in `.md`.
 *
 * Capture groups: 1 = link title, 2 = full URL, 3 = path after `/docs/en/`.
 * The pattern carries the `g` flag, so a shared instance keeps `lastIndex`
 * state between `exec` calls.
 */
export const CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;

/**
 * Link pattern for generic doc sites: markdown links to any `.md` URL.
 *
 * Capture groups: 1 = link title, 2 = full URL, 3 = path after the host.
 * The pattern carries the `g` flag, so a shared instance keeps `lastIndex`
 * state between `exec` calls.
 */
export const GENERIC_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;
|
|
20
|
+
|
|
21
|
+
// ============================================================================
|
|
22
|
+
// LLMS-TXT SCRAPER
|
|
23
|
+
// ============================================================================
|
|
24
|
+
|
|
25
|
+
export const llmsTxtScraper: Scraper = {
  type: "llms-txt",

  /**
   * Run the core llms.txt scrape for `config` and translate its pages into
   * the scraper-module result shape.
   *
   * @param config - Source description used to derive the scraper options
   * @returns One `DownloadResult` per downloaded page, plus the failures and
   *   duration reported by the core scraper
   */
  async scrape(config: SourceConfig): Promise<ScrapeResult> {
    const { downloaded: pages, failed, duration } = await scrapeMarkdownDocs(
      getScraperOptions(config)
    );

    // A page maps to "<category>/<pageName>.md", or just "<pageName>.md"
    // when it has no category; pages without a name become "untitled.md".
    const toDownloadResult = (page: DocPage): DownloadResult => {
      const filename = `${page.pageName || "untitled"}.md`;
      return {
        success: true,
        path: page.category ? `${page.category}/${filename}` : filename,
        title: page.title,
      };
    };

    return {
      downloaded: pages.map((page: DocPage) => toDownloadResult(page)),
      failed,
      duration,
    };
  },
};
|
|
52
|
+
|
|
53
|
+
// ============================================================================
|
|
54
|
+
// OPTIONS BUILDER
|
|
55
|
+
// ============================================================================
|
|
56
|
+
|
|
57
|
+
/**
 * Build the core-scraper options for a source.
 *
 * Every source shares the same base settings. The sources named
 * "Claude Code", "Polymarket" and "Bun" get fixed llms.txt paths and link
 * patterns; any other source uses `config.llmsTxtPath` / `config.linkPattern`
 * when set and falls back to the common paths and the generic pattern.
 */
function getScraperOptions(config: SourceConfig) {
  let llmsPaths: string[];
  let linkPattern = GENERIC_PATTERN;

  switch (config.name) {
    case "Claude Code":
      llmsPaths = ["/docs/llms.txt"];
      linkPattern = CLAUDE_CODE_PATTERN;
      break;

    case "Polymarket":
      llmsPaths = ["/llms.txt"];
      break;

    case "Bun":
      llmsPaths = ["/docs/llms.txt", "/llms.txt"];
      break;

    default:
      // Default: use provided llmsTxtPath or try common paths
      llmsPaths = config.llmsTxtPath ? [config.llmsTxtPath] : ["/llms.txt", "/docs/llms.txt"];
      linkPattern = config.linkPattern || GENERIC_PATTERN;
      break;
  }

  return {
    baseUrl: config.baseUrl,
    docsPath: config.docsPath,
    outputDir: config.outputDir,
    concurrency: 10,
    useLlms: true,
    tryDocsSubdomain: false,
    llmsPaths,
    linkPattern,
  };
}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Scraper Registry
|
|
3
|
+
*
|
|
4
|
+
* Maps source types to scraper implementations.
|
|
5
|
+
* Allows registering new scrapers and looking them up by type.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { Scraper, SourceType, SourceConfig, ScrapeResult } from "./types";
|
|
9
|
+
import { llmsTxtScraper } from "./llms-txt";
|
|
10
|
+
import { githubRawScraper } from "./github-raw";
|
|
11
|
+
|
|
12
|
+
// ============================================================================
|
|
13
|
+
// SCRAPER REGISTRY
|
|
14
|
+
// ============================================================================
|
|
15
|
+
|
|
16
|
+
/** Lookup table from source type to the scraper that handles it. */
const scrapers: Map<SourceType, Scraper> = new Map();

/**
 * Add a scraper to the registry under its own `type`.
 * Registering a second scraper for the same type replaces the first.
 */
export function registerScraper(scraper: Scraper): void {
  const { type } = scraper;
  scrapers.set(type, scraper);
}

/**
 * Look up the scraper registered for a source type.
 *
 * @returns The scraper, or undefined when none is registered for `type`
 */
export function getScraper(type: SourceType): Scraper | undefined {
  const registered = scrapers.get(type);
  return registered;
}

/**
 * Run the scraper registered for `config.sourceType`.
 *
 * @throws Error when no scraper is registered for that type
 */
export async function scrapeSource(config: SourceConfig): Promise<ScrapeResult> {
  const { sourceType } = config;
  const scraper = scrapers.get(sourceType);

  if (scraper) {
    return scraper.scrape(config);
  }

  throw new Error(`No scraper registered for type: ${sourceType}`);
}
|
|
45
|
+
|
|
46
|
+
// ============================================================================
|
|
47
|
+
// DEFAULT REGISTRATIONS
|
|
48
|
+
// ============================================================================
|
|
49
|
+
|
|
50
|
+
// The built-in scrapers are registered as soon as this module is loaded
for (const builtIn of [llmsTxtScraper, githubRawScraper]) {
  registerScraper(builtIn);
}
|
|
53
|
+
|
|
54
|
+
// Export scrapers for direct access if needed
|
|
55
|
+
export { llmsTxtScraper, githubRawScraper };
|