@ebowwa/markdown-docs-scraper 1.1.0 → 1.2.0
This diff shows the changes between publicly released versions of this package as they appear in the supported public registries. It is provided for informational purposes only.
- package/README.md +82 -0
- package/dist/cli.d.ts +6 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +1 -0
- package/dist/index.d.ts +116 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +197 -24
- package/dist/scrapers/github-raw.d.ts +9 -0
- package/dist/scrapers/github-raw.d.ts.map +1 -0
- package/dist/scrapers/index.d.ts +11 -0
- package/dist/scrapers/index.d.ts.map +1 -0
- package/dist/scrapers/index.js +428 -0
- package/dist/scrapers/llms-txt.d.ts +13 -0
- package/dist/scrapers/llms-txt.d.ts.map +1 -0
- package/dist/scrapers/registry.d.ts +23 -0
- package/dist/scrapers/registry.d.ts.map +1 -0
- package/dist/scrapers/types.d.ts +57 -0
- package/dist/scrapers/types.d.ts.map +1 -0
- package/package.json +10 -2
- package/src/cli.js +160 -0
- package/src/cli.ts +2 -0
- package/src/index.js +487 -0
- package/src/index.ts +76 -23
- package/src/scrapers/github-raw.ts +154 -0
- package/src/scrapers/index.ts +16 -0
- package/src/scrapers/llms-txt.ts +101 -0
- package/src/scrapers/registry.ts +55 -0
- package/src/scrapers/types.ts +79 -0
package/src/index.ts
CHANGED

@@ -5,6 +5,7 @@
  * - Configurable llms.txt paths with fallbacks
  * - Custom URL patterns for different doc sites
  * - Works with any markdown documentation site
+ * - Uses full URLs from llms.txt directly
  */

 // ============================================================================
@@ -32,6 +33,8 @@ export interface ScraperOptions {
   tryDocsSubdomain?: boolean;
   /** Custom regex pattern to extract pages from llms.txt (must have 3 capture groups: title, fullUrl, path) */
   linkPattern?: RegExp;
+  /** Use full URLs from llms.txt directly (default: true for generic pattern) */
+  useDirectUrls?: boolean;
 }

 export interface ScraperResult {
@@ -40,11 +43,18 @@ export interface ScraperResult {
   duration: number;
 }

+/** Discovered page with full URL */
+interface DiscoveredPage {
+  category: string;
+  page: string;
+  fullUrl: string; // The complete URL from llms.txt
+}
+
 /** Default pattern: matches /docs/en/ or /docs/ paths */
 const DEFAULT_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/(?:en\/)?([^)]+\.md))\)/g;

-/** Generic pattern: matches any .md links
-const GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[
+/** Generic pattern: matches any .md links - captures full path after domain */
+const GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;

 // ============================================================================
 // UTILITY FUNCTIONS (Composable)
@@ -116,11 +126,12 @@ export class MarkdownDocsScraper {
       llmsPaths: options.llmsPaths || ["/llms.txt", "/docs/llms.txt"],
       tryDocsSubdomain: options.tryDocsSubdomain ?? true,
       linkPattern: options.linkPattern || GENERIC_LINK_PATTERN,
+      useDirectUrls: options.useDirectUrls ?? true,
     };
   }

   /**
-   * Build URL for a documentation page
+   * Build URL for a documentation page (fallback when no direct URL)
    */
   buildUrl(category: string, page: string): string {
     if (category) {
@@ -128,16 +139,19 @@
     } else if (this.options.docsPath) {
       return `${this.options.baseUrl}${this.options.docsPath}/${page}.md`;
     } else {
-      // No docsPath (like Polymarket) - direct path
       return `${this.options.baseUrl}/${page}.md`;
     }
   }

   /**
-   * Download a
+   * Download a page using either direct URL or built URL
    */
-  async downloadPage(
-
+  async downloadPage(pageInfo: DiscoveredPage): Promise<DocPage | null> {
+    // Use direct URL if available and useDirectUrls is enabled
+    const url = (this.options.useDirectUrls && pageInfo.fullUrl)
+      ? pageInfo.fullUrl
+      : this.buildUrl(pageInfo.category, pageInfo.page);
+
     const content = await fetchMarkdown(url);

     if (!content) {
@@ -148,8 +162,8 @@
       url,
       title: extractTitle(content),
       content,
-      category,
-      pageName: page,
+      category: pageInfo.category,
+      pageName: pageInfo.page,
     };
   }

@@ -173,7 +187,6 @@

     // Skip if already on docs/doc subdomain
     if (!hostname.startsWith("docs.") && !hostname.startsWith("doc.")) {
-      // Try docs.{domain}
       const docsDomain = hostname.replace(/^www\./, "");
       urls.push(`${url.protocol}//docs.${docsDomain}/llms.txt`);
       urls.push(`${url.protocol}//docs.${docsDomain}/docs/llms.txt`);
@@ -191,9 +204,11 @@
    */
   private async fetchLlmsTxt(): Promise<{ content: string; url: string } | null> {
     const urls = this.getLlmsUrls();
+    console.log(`DEBUG: Trying URLs: ${urls.join(", ")}`);

     for (const llmsUrl of urls) {
       try {
+        console.log(`DEBUG: Fetching ${llmsUrl}...`);
         const response = await fetch(llmsUrl, {
           headers: {
             Accept: "text/plain",
@@ -201,13 +216,14 @@
           },
         });

+        console.log(`DEBUG: Response status: ${response.status}`);
         if (response.ok) {
           const content = await response.text();
           console.log(`Found llms.txt at ${llmsUrl}`);
           return { content, url: llmsUrl };
         }
       } catch (error) {
-
+        console.log(`DEBUG: Error: ${error}`);
         continue;
       }
     }
@@ -218,8 +234,8 @@
   /**
    * Discover pages from llms.txt index
    */
-  async discoverPages(): Promise<
-    const pages:
+  async discoverPages(): Promise<DiscoveredPage[]> {
+    const pages: DiscoveredPage[] = [];

     try {
       const llmsResult = await this.fetchLlmsTxt();
@@ -233,15 +249,20 @@
       const { content } = llmsResult;

       // Use provided pattern or default
-      const
+      const pattern = this.options.linkPattern;
+      const regex = new RegExp(pattern.source, pattern.flags);
       let match;

+      // Debug: log pattern being used
+      console.log(`DEBUG: Using pattern: ${pattern.source}`);
+      console.log(`DEBUG: Content length: ${content.length}`);
+
       while ((match = regex.exec(content)) !== null) {
-        const
+        const fullUrl = match[2]; // The full URL from llms.txt
         const pagePath = match[3]; // The captured path group

         const { category, page } = parsePagePath(pagePath);
-        pages.push({ category, page });
+        pages.push({ category, page, fullUrl });
       }

       console.log(`Discovered ${pages.length} pages from llms.txt`);
@@ -273,7 +294,7 @@
     for (let i = 0; i < pages.length; i += this.options.concurrency) {
       const batch = pages.slice(i, i + this.options.concurrency);
       const results = await Promise.allSettled(
-        batch.map((page) => this.downloadPage(page
+        batch.map((page) => this.downloadPage(page))
       );

       results.forEach((result, index) => {
@@ -281,8 +302,11 @@
         if (result.status === "fulfilled" && result.value) {
           downloaded.push(result.value);
         } else {
+          const url = (this.options.useDirectUrls && page.fullUrl)
+            ? page.fullUrl
+            : this.buildUrl(page.category, page.page);
           failed.push({
-            url
+            url,
             error: result.status === "rejected" ? (result.reason as string) : "Not found",
           });
         }
@@ -316,7 +340,7 @@
     for (let i = 0; i < pages.length; i += this.options.concurrency) {
       const batch = pages.slice(i, i + this.options.concurrency);
       const results = await Promise.allSettled(
-        batch.map((page) => this.downloadPage(page
+        batch.map((page) => this.downloadPage({ ...page, fullUrl: "" }))
       );

       results.forEach((result, index) => {
@@ -350,7 +374,6 @@
     const path = await import("path");

     for (const page of pages) {
-      // Use pageName if available, otherwise extract from URL
       const nameToUse = page.pageName || page.url.split("/").pop()?.replace(".md", "") || "untitled";

       const dir = page.category
@@ -369,12 +392,12 @@
   /**
    * Get list of pages to scrape based on categories
    */
-  private getPagesToScrape():
-    const pages:
+  private getPagesToScrape(): DiscoveredPage[] {
+    const pages: DiscoveredPage[] = [];

     for (const [category, pageList] of Object.entries(this.options.categories)) {
       for (const page of pageList) {
-        pages.push({ category, page });
+        pages.push({ category, page, fullUrl: "" });
       }
     }

@@ -424,6 +447,7 @@ export function claudeCodeOptions(outputDir: string): ScraperOptions {
     outputDir,
     concurrency: 10,
     tryDocsSubdomain: false,
+    useDirectUrls: false, // Claude Code can use built URLs
   };
 }

@@ -437,9 +461,38 @@ export function polymarketOptions(outputDir: string): ScraperOptions {
     outputDir,
     concurrency: 10,
     tryDocsSubdomain: false,
+    useDirectUrls: true, // Polymarket needs direct URLs
   };
 }

+// ============================================================================
+// SCRAPERS MODULE
+// ============================================================================
+
+/**
+ * Re-export scrapers module for composable scraper architecture.
+ * This provides a registry-based system for different scraper implementations.
+ */
+export {
+  // Types
+  type SourceType,
+  type SourceConfig,
+  type Scraper,
+  type ScrapeResult as ScraperModuleResult,
+  type DownloadResult,
+
+  // Scrapers
+  llmsTxtScraper,
+  githubRawScraper,
+  CLAUDE_CODE_PATTERN as SCRAPER_CLAUDE_CODE_PATTERN,
+  GENERIC_PATTERN as SCRAPER_GENERIC_PATTERN,
+
+  // Registry
+  registerScraper,
+  getScraper,
+  scrapeSource,
+} from "./scrapers/index";
+
 // ============================================================================
 // EXPORTS
 // ============================================================================
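The net effect of the index.ts changes: page discovery now carries the full URL from llms.txt in `DiscoveredPage.fullUrl`, and `downloadPage` prefers that URL whenever `useDirectUrls` is enabled (the default), falling back to `buildUrl` otherwise. A minimal sketch of exercising the new option through `scrapeMarkdownDocs`, assuming that function is exposed from the package root (the scrapers module imports it from "../index"); the site and output path below are placeholders:

```ts
import { scrapeMarkdownDocs } from "@ebowwa/markdown-docs-scraper";

// Placeholder site and output directory; any site serving an llms.txt index of .md links fits.
const result = await scrapeMarkdownDocs({
  baseUrl: "https://docs.example.com",
  outputDir: "./downloads/example",
  useLlms: true,            // discover pages from llms.txt instead of a hard-coded category list
  llmsPaths: ["/llms.txt"],
  useDirectUrls: true,      // new in 1.2.0: fetch each page from the full URL listed in llms.txt
});

console.log(`downloaded ${result.downloaded.length}, failed ${result.failed.length}, ${result.duration}ms`);
```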
package/src/scrapers/github-raw.ts
ADDED

@@ -0,0 +1,154 @@
+/**
+ * GitHub Raw Scraper
+ *
+ * Downloads markdown files directly from GitHub repositories via raw content URLs.
+ * Uses GitHub API to list files, then fetches each from raw.githubusercontent.com
+ */
+
+import type { Scraper, SourceConfig, ScrapeResult, DownloadResult } from "./types";
+
+// ============================================================================
+// GITHUB API TYPES
+// ============================================================================
+
+interface GitHubContent {
+  name: string;
+  path: string;
+  download_url: string;
+  type: string;
+}
+
+// ============================================================================
+// GITHUB RAW SCRAPER
+// ============================================================================
+
+export const githubRawScraper: Scraper = {
+  type: "github-raw",
+
+  async scrape(config: SourceConfig): Promise<ScrapeResult> {
+    const startTime = Date.now();
+    const downloaded: DownloadResult[] = [];
+    const failed: Array<{ url: string; error: string }> = [];
+
+    if (!config.github?.repo) {
+      throw new Error(`GitHub source "${config.name}" missing github.repo config`);
+    }
+
+    // Get list of markdown files from GitHub API
+    const files = await fetchGitHubMarkdownFiles(
+      config.github.repo,
+      config.docsPath.replace(/^\//, "")
+    );
+
+    // Download each file
+    for (const file of files) {
+      const content = await fetchGitHubRawContent(config.github.repo, file.path);
+
+      if (content) {
+        downloaded.push({
+          success: true,
+          path: file.name,
+          title: extractTitle(content) || file.name.replace(".md", ""),
+        });
+
+        // Save the file
+        await saveFile(config.outputDir, file.name, content);
+      } else {
+        failed.push({
+          url: `https://raw.githubusercontent.com/${config.github.repo}/main/${file.path}`,
+          error: "Failed to fetch content",
+        });
+      }
+    }
+
+    return {
+      downloaded,
+      failed,
+      duration: Date.now() - startTime,
+    };
+  },
+};
+
+// ============================================================================
+// GITHUB API FUNCTIONS
+// ============================================================================
+
+/**
+ * Fetch list of markdown files from GitHub repo directory
+ */
+async function fetchGitHubMarkdownFiles(
+  repo: string,
+  path: string
+): Promise<GitHubContent[]> {
+  const url = `https://api.github.com/repos/${repo}/contents/${path}`;
+
+  const response = await fetch(url, {
+    headers: {
+      Accept: "application/vnd.github.v3+json",
+      "User-Agent": "@ebowwa/markdown-docs-scraper",
+    },
+  });
+
+  if (!response.ok) {
+    throw new Error(`GitHub API error: ${response.status} ${response.statusText}`);
+  }
+
+  const contents: GitHubContent[] = await response.json();
+
+  // Filter for markdown files only
+  return contents.filter(
+    (item) => item.type === "file" && item.name.endsWith(".md")
+  );
+}
+
+/**
+ * Download markdown content from GitHub raw URL
+ */
+async function fetchGitHubRawContent(
+  repo: string,
+  path: string
+): Promise<string | null> {
+  const url = `https://raw.githubusercontent.com/${repo}/main/${path}`;
+
+  try {
+    const response = await fetch(url, {
+      headers: {
+        Accept: "text/plain",
+        "User-Agent": "@ebowwa/markdown-docs-scraper",
+      },
+    });
+
+    if (!response.ok) {
+      return null;
+    }
+
+    return await response.text();
+  } catch (error) {
+    console.error(`Error fetching ${url}:`, error);
+    return null;
+  }
+}
+
+/**
+ * Extract title from markdown content
+ */
+function extractTitle(markdown: string): string | null {
+  const titleMatch = markdown.match(/^#\s+(.+)$/m);
+  return titleMatch ? titleMatch[1].trim() : null;
+}
+
+/**
+ * Save file to disk
+ */
+async function saveFile(
+  outputDir: string,
+  filename: string,
+  content: string
+): Promise<void> {
+  const fs = await import("fs/promises");
+  const path = await import("path");
+
+  const outputPath = path.join(outputDir, filename);
+  await fs.mkdir(path.dirname(outputPath), { recursive: true });
+  await fs.writeFile(outputPath, content, "utf-8");
+}
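github-raw.ts is new in 1.2.0: it lists one repository directory through the GitHub contents API and mirrors each `.md` file it finds from raw.githubusercontent.com. A sketch of calling it directly, using the `SourceConfig` shape defined in scrapers/types.ts further down; the repository and directories here are placeholders:

```ts
import { githubRawScraper, type SourceConfig } from "@ebowwa/markdown-docs-scraper";

const config: SourceConfig = {
  name: "Example Project",                        // hypothetical source
  sourceType: "github-raw",
  baseUrl: "https://github.com/example/project",
  docsPath: "/docs",                              // directory listed via api.github.com/repos/{repo}/contents
  outputDir: "./downloads/example",
  reportDir: "./reports",
  github: { repo: "example/project", includeCommits: false, includeReleases: false, includePRs: false },
};

const { downloaded, failed } = await githubRawScraper.scrape(config);
console.log(`mirrored ${downloaded.length} markdown files, ${failed.length} failures`);
```

Note that the raw URLs are hard-coded to the `main` branch and the directory listing is not recursive, so docs on a different default branch or in nested folders fall outside this scraper as published.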
package/src/scrapers/index.ts
ADDED

@@ -0,0 +1,16 @@
+/**
+ * Scrapers Module
+ *
+ * Composable scraper architecture for multiple documentation source types.
+ * This module provides a registry-based system for different scraper implementations.
+ */
+
+// Types
+export type { SourceType, SourceConfig, Scraper, ScrapeResult, DownloadResult } from "./types";
+
+// Scrapers
+export { llmsTxtScraper, CLAUDE_CODE_PATTERN, GENERIC_PATTERN } from "./llms-txt";
+export { githubRawScraper } from "./github-raw";
+
+// Registry
+export { registerScraper, getScraper, scrapeSource } from "./registry";
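These names reach consumers twice: directly from this barrel file, and again through the root index.ts re-export shown earlier, where some are renamed, presumably to keep them distinct from the core module's own `ScraperResult` and link patterns. A quick illustration of the root-level aliases (assuming the package main entry is the index.ts shown above):

```ts
// Root re-exports rename the scrapers-module identifiers:
//   ScrapeResult        -> ScraperModuleResult
//   CLAUDE_CODE_PATTERN -> SCRAPER_CLAUDE_CODE_PATTERN
//   GENERIC_PATTERN     -> SCRAPER_GENERIC_PATTERN
import {
  type ScraperModuleResult,
  SCRAPER_GENERIC_PATTERN,
} from "@ebowwa/markdown-docs-scraper";
```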
package/src/scrapers/llms-txt.ts
ADDED

@@ -0,0 +1,101 @@
+/**
+ * LLMS-TXT Scraper
+ *
+ * Scrapes documentation sites that provide llms.txt index files.
+ * Uses the core MarkdownDocsScraper under the hood.
+ */
+
+import { scrapeMarkdownDocs, type DocPage } from "../index";
+import type { Scraper, SourceConfig, ScrapeResult, DownloadResult } from "./types";
+
+// ============================================================================
+// URL PATTERNS
+// ============================================================================
+
+/** Pattern for Claude Code docs: /docs/en/page.md */
+export const CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
+
+/** Pattern for generic docs: any domain/path.md */
+export const GENERIC_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;
+
+// ============================================================================
+// LLMS-TXT SCRAPER
+// ============================================================================
+
+export const llmsTxtScraper: Scraper = {
+  type: "llms-txt",
+
+  async scrape(config: SourceConfig): Promise<ScrapeResult> {
+    const options = getScraperOptions(config);
+    const result = await scrapeMarkdownDocs(options);
+
+    // Convert DocPage[] to DownloadResult[]
+    const downloaded: DownloadResult[] = result.downloaded.map((page: DocPage) => {
+      const category = page.category || "";
+      const filename = `${page.pageName || "untitled"}.md`;
+      const path = category ? `${category}/${filename}` : filename;
+
+      return {
+        success: true,
+        path,
+        title: page.title,
+      };
+    });
+
+    return {
+      downloaded,
+      failed: result.failed,
+      duration: result.duration,
+    };
+  },
+};
+
+// ============================================================================
+// OPTIONS BUILDER
+// ============================================================================
+
+/**
+ * Get scraper options based on source configuration
+ */
+function getScraperOptions(config: SourceConfig) {
+  const baseOptions = {
+    baseUrl: config.baseUrl,
+    docsPath: config.docsPath,
+    outputDir: config.outputDir,
+    concurrency: 10,
+    useLlms: true,
+    tryDocsSubdomain: false,
+  };
+
+  // Source-specific options
+  if (config.name === "Claude Code") {
+    return {
+      ...baseOptions,
+      llmsPaths: ["/docs/llms.txt"],
+      linkPattern: CLAUDE_CODE_PATTERN,
+    };
+  }
+
+  if (config.name === "Polymarket") {
+    return {
+      ...baseOptions,
+      llmsPaths: ["/llms.txt"],
+      linkPattern: GENERIC_PATTERN,
+    };
+  }
+
+  if (config.name === "Bun") {
+    return {
+      ...baseOptions,
+      llmsPaths: ["/docs/llms.txt", "/llms.txt"],
+      linkPattern: GENERIC_PATTERN,
+    };
+  }
+
+  // Default: use provided llmsTxtPath or try common paths
+  return {
+    ...baseOptions,
+    llmsPaths: config.llmsTxtPath ? [config.llmsTxtPath] : ["/llms.txt", "/docs/llms.txt"],
+    linkPattern: config.linkPattern || GENERIC_PATTERN,
+  };
+}
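`getScraperOptions` special-cases three sources by display name ("Claude Code", "Polymarket", "Bun") and otherwise falls back to whatever `llmsTxtPath` and `linkPattern` the config supplies. A sketch of the generic branch, with placeholder values:

```ts
import { llmsTxtScraper, type SourceConfig } from "@ebowwa/markdown-docs-scraper";

// Any name other than the three special-cased ones takes the default branch,
// which honours llmsTxtPath and linkPattern from the config.
const config: SourceConfig = {
  name: "Example Docs",
  sourceType: "llms-txt",
  baseUrl: "https://example.dev",
  docsPath: "/docs",
  outputDir: "./downloads/example-docs",
  reportDir: "./reports",
  llmsTxtPath: "/llms.txt",
};

const result = await llmsTxtScraper.scrape(config);
```

Worth flagging as a design choice: keying behaviour off the human-readable `name` means renaming a source in config silently changes which llms.txt paths and link pattern are used.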
package/src/scrapers/registry.ts
ADDED

@@ -0,0 +1,55 @@
+/**
+ * Scraper Registry
+ *
+ * Maps source types to scraper implementations.
+ * Allows registering new scrapers and looking them up by type.
+ */
+
+import type { Scraper, SourceType, SourceConfig, ScrapeResult } from "./types";
+import { llmsTxtScraper } from "./llms-txt";
+import { githubRawScraper } from "./github-raw";
+
+// ============================================================================
+// SCRAPER REGISTRY
+// ============================================================================
+
+/** Registry of all available scrapers keyed by type */
+const scrapers: Map<SourceType, Scraper> = new Map();
+
+/**
+ * Register a scraper implementation
+ */
+export function registerScraper(scraper: Scraper): void {
+  scrapers.set(scraper.type, scraper);
+}
+
+/**
+ * Get a scraper by type
+ */
+export function getScraper(type: SourceType): Scraper | undefined {
+  return scrapers.get(type);
+}
+
+/**
+ * Scrape a source using the appropriate scraper
+ */
+export async function scrapeSource(config: SourceConfig): Promise<ScrapeResult> {
+  const scraper = scrapers.get(config.sourceType);
+
+  if (!scraper) {
+    throw new Error(`No scraper registered for type: ${config.sourceType}`);
+  }
+
+  return scraper.scrape(config);
+}
+
+// ============================================================================
+// DEFAULT REGISTRATIONS
+// ============================================================================
+
+// Register built-in scrapers
+registerScraper(llmsTxtScraper);
+registerScraper(githubRawScraper);
+
+// Export scrapers for direct access if needed
+export { llmsTxtScraper, githubRawScraper };
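The registry is what ties the source types together: the built-in scrapers register themselves when this module is imported, and `scrapeSource` dispatches on `config.sourceType`. A sketch of driving a mixed list of sources through it (all sources here are placeholders):

```ts
import { scrapeSource, type SourceConfig } from "@ebowwa/markdown-docs-scraper";

const sources: SourceConfig[] = [
  {
    name: "Example Docs",
    sourceType: "llms-txt",
    baseUrl: "https://example.dev",
    docsPath: "/docs",
    outputDir: "./downloads/example-docs",
    reportDir: "./reports",
  },
  {
    name: "Example Repo",
    sourceType: "github-raw",
    baseUrl: "https://github.com/example/project",
    docsPath: "/docs",
    outputDir: "./downloads/example-repo",
    reportDir: "./reports",
    github: { repo: "example/project", includeCommits: false, includeReleases: false, includePRs: false },
  },
];

for (const source of sources) {
  const { downloaded, failed } = await scrapeSource(source);
  console.log(`${source.name}: ${downloaded.length} downloaded, ${failed.length} failed`);
}
```

Because the registry is a plain Map keyed by the closed `SourceType` union, `registerScraper` can swap out a built-in implementation, but adding a genuinely new type also means widening the union in types.ts below.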
package/src/scrapers/types.ts
ADDED

@@ -0,0 +1,79 @@
+/**
+ * Scraper Types
+ *
+ * Core types for the composable scraper architecture.
+ * These types define the interface that all scrapers must implement.
+ */
+
+// ============================================================================
+// SOURCE TYPES
+// ============================================================================
+
+/** Supported documentation source types */
+export type SourceType = "llms-txt" | "github-raw";
+
+// ============================================================================
+// SCRAPER INTERFACE
+// ============================================================================
+
+/** Result from scraping a source */
+export interface ScrapeResult {
+  downloaded: DownloadResult[];
+  failed: Array<{ url: string; error: string }>;
+  duration?: number;
+}
+
+/** Individual download result */
+export interface DownloadResult {
+  success: boolean;
+  path: string;
+  title?: string;
+}
+
+/** Scraper interface - all scrapers must implement this */
+export interface Scraper {
+  /** Source type identifier */
+  type: SourceType;
+
+  /** Scrape documentation from a source */
+  scrape(config: SourceConfig): Promise<ScrapeResult>;
+}
+
+// ============================================================================
+// SOURCE CONFIG
+// ============================================================================
+
+/** Source configuration */
+export interface SourceConfig {
+  /** Display name */
+  name: string;
+
+  /** Source type - determines which scraper to use */
+  sourceType: SourceType;
+
+  /** Base URL for the documentation */
+  baseUrl: string;
+
+  /** Path to docs (e.g., /docs, /docs/en) */
+  docsPath: string;
+
+  /** Output directory for downloaded docs */
+  outputDir: string;
+
+  /** Output directory for daily reports */
+  reportDir: string;
+
+  /** llms.txt path (for llms-txt sources) */
+  llmsTxtPath?: string;
+
+  /** Custom link pattern for llms.txt parsing */
+  linkPattern?: RegExp;
+
+  /** GitHub config (for github-raw sources or GitHub API data) */
+  github?: {
+    repo: string;
+    includeCommits: boolean;
+    includeReleases: boolean;
+    includePRs: boolean;
+  };
+}