@ebowwa/markdown-docs-scraper 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +82 -0
- package/dist/cli.d.ts +6 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +1 -0
- package/dist/index.d.ts +128 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +219 -26
- package/dist/scrapers/github-raw.d.ts +9 -0
- package/dist/scrapers/github-raw.d.ts.map +1 -0
- package/dist/scrapers/index.d.ts +11 -0
- package/dist/scrapers/index.d.ts.map +1 -0
- package/dist/scrapers/index.js +448 -0
- package/dist/scrapers/llms-txt.d.ts +13 -0
- package/dist/scrapers/llms-txt.d.ts.map +1 -0
- package/dist/scrapers/registry.d.ts +23 -0
- package/dist/scrapers/registry.d.ts.map +1 -0
- package/dist/scrapers/types.d.ts +57 -0
- package/dist/scrapers/types.d.ts.map +1 -0
- package/package.json +10 -2
- package/src/cli.js +160 -0
- package/src/cli.ts +2 -0
- package/src/index.js +487 -0
- package/src/index.ts +115 -28
- package/src/scrapers/github-raw.ts +154 -0
- package/src/scrapers/index.ts +16 -0
- package/src/scrapers/llms-txt.ts +101 -0
- package/src/scrapers/registry.ts +55 -0
- package/src/scrapers/types.ts +79 -0
package/README.md
CHANGED
|
@@ -161,6 +161,88 @@ Downloaded: 2026-02-06T00:00:00.000Z
|
|
|
161
161
|
Original markdown content...
|
|
162
162
|
```
|
|
163
163
|
|
|
164
|
+
## Composable Scrapers Module
|
|
165
|
+
|
|
166
|
+
The package includes a composable scraper architecture for multiple documentation source types.
|
|
167
|
+
|
|
168
|
+
### Usage
|
|
169
|
+
|
|
170
|
+
```typescript
|
|
171
|
+
import {
|
|
172
|
+
scrapeSource,
|
|
173
|
+
registerScraper,
|
|
174
|
+
llmsTxtScraper,
|
|
175
|
+
githubRawScraper,
|
|
176
|
+
type SourceConfig,
|
|
177
|
+
} from "@ebowwa/markdown-docs-scraper/scrapers";
|
|
178
|
+
|
|
179
|
+
// Configure a source
|
|
180
|
+
const config: SourceConfig = {
|
|
181
|
+
name: "My Docs",
|
|
182
|
+
sourceType: "llms-txt",
|
|
183
|
+
baseUrl: "https://docs.example.com",
|
|
184
|
+
docsPath: "/docs",
|
|
185
|
+
outputDir: "./docs/my-docs",
|
|
186
|
+
reportDir: "./reports/my-docs",
|
|
187
|
+
};
|
|
188
|
+
|
|
189
|
+
// Scrape using the registry (auto-selects scraper by sourceType)
|
|
190
|
+
const result = await scrapeSource(config);
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
### Built-in Scrapers
|
|
194
|
+
|
|
195
|
+
- **llms-txt**: Scrapes docs sites with llms.txt index files
|
|
196
|
+
- **github-raw**: Downloads markdown directly from GitHub repos
|
|
197
|
+
|
|
198
|
+
### Custom Scrapers
|
|
199
|
+
|
|
200
|
+
```typescript
|
|
201
|
+
import { registerScraper, type Scraper, type SourceType } from "@ebowwa/markdown-docs-scraper/scrapers";
|
|
202
|
+
|
|
203
|
+
const myScraper: Scraper = {
|
|
204
|
+
type: "my-type" as SourceType,
|
|
205
|
+
async scrape(config) {
|
|
206
|
+
// Custom scraping logic
|
|
207
|
+
return {
|
|
208
|
+
downloaded: [],
|
|
209
|
+
failed: [],
|
|
210
|
+
duration: 0,
|
|
211
|
+
};
|
|
212
|
+
},
|
|
213
|
+
};
|
|
214
|
+
|
|
215
|
+
registerScraper(myScraper);
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
### Types
|
|
219
|
+
|
|
220
|
+
```typescript
|
|
221
|
+
type SourceType = "llms-txt" | "github-raw";
|
|
222
|
+
|
|
223
|
+
interface SourceConfig {
|
|
224
|
+
name: string;
|
|
225
|
+
sourceType: SourceType;
|
|
226
|
+
baseUrl: string;
|
|
227
|
+
docsPath: string;
|
|
228
|
+
outputDir: string;
|
|
229
|
+
reportDir: string;
|
|
230
|
+
llmsTxtPath?: string;
|
|
231
|
+
linkPattern?: RegExp;
|
|
232
|
+
github?: {
|
|
233
|
+
repo: string;
|
|
234
|
+
includeCommits: boolean;
|
|
235
|
+
includeReleases: boolean;
|
|
236
|
+
includePRs: boolean;
|
|
237
|
+
};
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
interface Scraper {
|
|
241
|
+
type: SourceType;
|
|
242
|
+
scrape(config: SourceConfig): Promise<ScrapeResult>;
|
|
243
|
+
}
|
|
244
|
+
```
|
|
245
|
+
|
|
164
246
|
## License
|
|
165
247
|
|
|
166
248
|
MIT
|
package/dist/cli.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA;;GAEG"}
|
package/dist/cli.js
CHANGED
|
@@ -2426,6 +2426,7 @@ program.command("scrape").description("Scrape documentation from a URL").require
|
|
|
2426
2426
|
}
|
|
2427
2427
|
});
|
|
2428
2428
|
program.command("discover").description("Discover all available documentation pages").requiredOption("-u, --url <url>", "Base URL of the documentation site").option("--docs-path <path>", "Docs path (default: /docs/en)", "/docs/en").option("--llms-paths <paths>", "Comma-separated llms.txt paths to try", "/llms.txt,/docs/llms.txt").option("--no-subdomain", "Disable docs/doc subdomain fallback", false).action(async (options) => {
|
|
2429
|
+
console.log("DEBUG CLI: Options received:", options);
|
|
2429
2430
|
const scraper = new import__2.MarkdownDocsScraper({
|
|
2430
2431
|
baseUrl: options.url,
|
|
2431
2432
|
docsPath: options.docsPath,
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @ebowwa/markdown-docs-scraper
|
|
3
|
+
*
|
|
4
|
+
* Composable markdown documentation scraper.
|
|
5
|
+
* - Configurable llms.txt paths with fallbacks
|
|
6
|
+
* - Custom URL patterns for different doc sites
|
|
7
|
+
* - Works with any markdown documentation site
|
|
8
|
+
* - Uses full URLs from llms.txt directly
|
|
9
|
+
*/
|
|
10
|
+
export interface DocPage {
|
|
11
|
+
url: string;
|
|
12
|
+
title: string;
|
|
13
|
+
content: string;
|
|
14
|
+
category?: string;
|
|
15
|
+
pageName?: string;
|
|
16
|
+
}
|
|
17
|
+
export interface ScraperOptions {
|
|
18
|
+
baseUrl: string;
|
|
19
|
+
docsPath?: string;
|
|
20
|
+
categories?: Record<string, string[]>;
|
|
21
|
+
outputDir?: string;
|
|
22
|
+
concurrency?: number;
|
|
23
|
+
onProgress?: (current: number, total: number) => void;
|
|
24
|
+
/** Custom llms.txt paths to try (default: ["/llms.txt", "/docs/llms.txt"]) */
|
|
25
|
+
llmsPaths?: string[];
|
|
26
|
+
/** Also try docs subdomain variants (e.g., docs.example.com) */
|
|
27
|
+
tryDocsSubdomain?: boolean;
|
|
28
|
+
/** Custom regex pattern to extract pages from llms.txt (must have 3 capture groups: title, fullUrl, path) */
|
|
29
|
+
linkPattern?: RegExp;
|
|
30
|
+
/** Use full URLs from llms.txt directly (default: true for generic pattern) */
|
|
31
|
+
useDirectUrls?: boolean;
|
|
32
|
+
}
|
|
33
|
+
export interface ScraperResult {
|
|
34
|
+
downloaded: DocPage[];
|
|
35
|
+
failed: Array<{
|
|
36
|
+
url: string;
|
|
37
|
+
error: string;
|
|
38
|
+
}>;
|
|
39
|
+
duration: number;
|
|
40
|
+
}
|
|
41
|
+
/** Discovered page with full URL */
|
|
42
|
+
interface DiscoveredPage {
|
|
43
|
+
category: string;
|
|
44
|
+
page: string;
|
|
45
|
+
fullUrl: string;
|
|
46
|
+
}
|
|
47
|
+
/** Extract title from markdown content */
|
|
48
|
+
export declare function extractTitle(markdown: string): string;
|
|
49
|
+
/** Parse page path into category and page name */
|
|
50
|
+
export declare function parsePagePath(pagePath: string): {
|
|
51
|
+
category: string;
|
|
52
|
+
page: string;
|
|
53
|
+
};
|
|
54
|
+
/** Fetch markdown content from URL */
|
|
55
|
+
export declare function fetchMarkdown(url: string, userAgent?: string): Promise<string | null>;
|
|
56
|
+
export declare class MarkdownDocsScraper {
|
|
57
|
+
private options;
|
|
58
|
+
constructor(options: ScraperOptions);
|
|
59
|
+
/**
|
|
60
|
+
* Build URL for a documentation page (fallback when no direct URL)
|
|
61
|
+
*/
|
|
62
|
+
buildUrl(category: string, page: string): string;
|
|
63
|
+
/**
|
|
64
|
+
* Download a page using either direct URL or built URL
|
|
65
|
+
*/
|
|
66
|
+
downloadPage(pageInfo: DiscoveredPage): Promise<DocPage | null>;
|
|
67
|
+
/**
|
|
68
|
+
* Generate possible llms.txt URLs to try
|
|
69
|
+
*/
|
|
70
|
+
private getLlmsUrls;
|
|
71
|
+
/**
|
|
72
|
+
* Fetch llms.txt from multiple possible URLs with fallback
|
|
73
|
+
*/
|
|
74
|
+
private fetchLlmsTxt;
|
|
75
|
+
/**
|
|
76
|
+
* Discover pages from llms.txt index
|
|
77
|
+
*/
|
|
78
|
+
discoverPages(): Promise<DiscoveredPage[]>;
|
|
79
|
+
/**
|
|
80
|
+
* Scrape pages discovered from llms.txt
|
|
81
|
+
*/
|
|
82
|
+
scrapeFromLlms(): Promise<ScraperResult>;
|
|
83
|
+
/**
|
|
84
|
+
* Scrape all documentation pages (uses categories)
|
|
85
|
+
*/
|
|
86
|
+
scrape(): Promise<ScraperResult>;
|
|
87
|
+
/**
|
|
88
|
+
* Extract body content from a file (strips header comment)
|
|
89
|
+
*/
|
|
90
|
+
private extractBody;
|
|
91
|
+
/**
|
|
92
|
+
* Save scraped pages to disk (only writes if content changed)
|
|
93
|
+
*/
|
|
94
|
+
savePages(pages: DocPage[]): Promise<{
|
|
95
|
+
updated: number;
|
|
96
|
+
skipped: number;
|
|
97
|
+
}>;
|
|
98
|
+
/**
|
|
99
|
+
* Get list of pages to scrape based on categories
|
|
100
|
+
*/
|
|
101
|
+
private getPagesToScrape;
|
|
102
|
+
}
|
|
103
|
+
/**
|
|
104
|
+
* Scrape markdown documentation with a single function call
|
|
105
|
+
*/
|
|
106
|
+
export declare function scrapeMarkdownDocs(options: ScraperOptions & {
|
|
107
|
+
useLlms?: boolean;
|
|
108
|
+
}): Promise<ScraperResult & {
|
|
109
|
+
saveStats?: {
|
|
110
|
+
updated: number;
|
|
111
|
+
skipped: number;
|
|
112
|
+
};
|
|
113
|
+
}>;
|
|
114
|
+
/** Pattern for Claude Code docs: /docs/en/page.md */
|
|
115
|
+
export declare const CLAUDE_CODE_PATTERN: RegExp;
|
|
116
|
+
/** Pattern for generic docs: any domain/path.md */
|
|
117
|
+
export declare const GENERIC_PATTERN: RegExp;
|
|
118
|
+
/** Create scraper options for Claude Code docs */
|
|
119
|
+
export declare function claudeCodeOptions(outputDir: string): ScraperOptions;
|
|
120
|
+
/** Create scraper options for Polymarket docs */
|
|
121
|
+
export declare function polymarketOptions(outputDir: string): ScraperOptions;
|
|
122
|
+
/**
|
|
123
|
+
* Re-export scrapers module for composable scraper architecture.
|
|
124
|
+
* This provides a registry-based system for different scraper implementations.
|
|
125
|
+
*/
|
|
126
|
+
export { type SourceType, type SourceConfig, type Scraper, type ScrapeResult as ScraperModuleResult, type DownloadResult, llmsTxtScraper, githubRawScraper, CLAUDE_CODE_PATTERN as SCRAPER_CLAUDE_CODE_PATTERN, GENERIC_PATTERN as SCRAPER_GENERIC_PATTERN, registerScraper, getScraper, scrapeSource, } from "./scrapers/index";
|
|
127
|
+
export default MarkdownDocsScraper;
|
|
128
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAMH,MAAM,WAAW,OAAO;IACtB,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,cAAc;IAC7B,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;IACtC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;IACtD,8EAA8E;IAC9E,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;IACrB,gEAAgE;IAChE,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,6GAA6G;IAC7G,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,+EAA+E;IAC/E,aAAa,CAAC,EAAE,OAAO,CAAC;CACzB;AAED,MAAM,WAAW,aAAa;IAC5B,UAAU,EAAE,OAAO,EAAE,CAAC;IACtB,MAAM,EAAE,KAAK,CAAC;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC9C,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,oCAAoC;AACpC,UAAU,cAAc;IACtB,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC;CACjB;AAYD,0CAA0C;AAC1C,wBAAgB,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAGrD;AAED,kDAAkD;AAClD,wBAAgB,aAAa,CAAC,QAAQ,EAAE,MAAM,GAAG;IAAE,QAAQ,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAA;CAAE,CAkBlF;AAED,sCAAsC;AACtC,wBAAsB,aAAa,CAAC,GAAG,EAAE,MAAM,EAAE,SAAS,SAAkC,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAkBpH;AAMD,qBAAa,mBAAmB;IAC9B,OAAO,CAAC,OAAO,CAA2B;gBAE9B,OAAO,EAAE,cAAc;IAenC;;OAEG;IACH,QAAQ,CAAC,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,MAAM;IAUhD;;OAEG;IACG,YAAY,CAAC,QAAQ,EAAE,cAAc,GAAG,OAAO,CAAC,OAAO,GAAG,IAAI,CAAC;IAqBrE;;OAEG;IACH,OAAO,CAAC,WAAW;IA6BnB;;OAEG;YACW,YAAY;IA6B1B;;OAEG;IACG,aAAa,IAAI,OAAO,CAAC,cAAc,EAAE,CAAC;IAuChD;;OAEG;IACG,cAAc,IAAI,OAAO,CAAC,aAAa,CAAC;IA+C9C;;OAEG;IACG,MAAM,IAAI,OAAO,CAAC,aAAa,CAAC;IAwCtC;;OAEG;IACH,OAAO,CAAC,WAAW;IAMnB;;OAEG;IACG,SAAS,CAAC,KAAK,EAAE,OAAO,EAAE,GAAG,OAAO,CAAC;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,CAAC;IAyChF;;OAEG;IACH,OAAO,CAAC,gBAAgB;CAWzB;AAMD;;GAEG;AACH,wBAAsB,kBAAkB,CACtC,OAAO,EAAE,cAAc,GAAG;IAA
E,OAAO,CAAC,EAAE,OAAO,CAAA;CAAE,GAC9C,OAAO,CAAC,aAAa,GAAG;IAAE,SAAS,CAAC,EAAE;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,CAAA;CAAE,CAAC,CAe/E;AAMD,qDAAqD;AACrD,eAAO,MAAM,mBAAmB,QAAiE,CAAC;AAElG,mDAAmD;AACnD,eAAO,MAAM,eAAe,QAAuB,CAAC;AAEpD,kDAAkD;AAClD,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,GAAG,cAAc,CAWnE;AAED,iDAAiD;AACjD,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,GAAG,cAAc,CAWnE;AAMD;;;GAGG;AACH,OAAO,EAEL,KAAK,UAAU,EACf,KAAK,YAAY,EACjB,KAAK,OAAO,EACZ,KAAK,YAAY,IAAI,mBAAmB,EACxC,KAAK,cAAc,EAGnB,cAAc,EACd,gBAAgB,EAChB,mBAAmB,IAAI,2BAA2B,EAClD,eAAe,IAAI,uBAAuB,EAG1C,eAAe,EACf,UAAU,EACV,YAAY,GACb,MAAM,kBAAkB,CAAC;AAM1B,eAAe,mBAAmB,CAAC"}
|
package/dist/index.js
CHANGED
|
@@ -17,9 +17,164 @@ var __toESM = (mod, isNodeMode, target) => {
|
|
|
17
17
|
};
|
|
18
18
|
var __require = /* @__PURE__ */ createRequire(import.meta.url);
|
|
19
19
|
|
|
20
|
-
// src/
|
|
21
|
-
var
|
|
20
|
+
// src/scrapers/llms-txt.ts
|
|
21
|
+
var CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
|
|
22
|
+
var GENERIC_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;
|
|
23
|
+
/**
 * Scraper for documentation sites indexed by an llms.txt file.
 *
 * Delegates the actual work to scrapeMarkdownDocs (with options derived by
 * getScraperOptions) and then reshapes every downloaded page into a
 * `{ success, path, title }` record, where `path` is
 * `<category>/<pageName>.md`, or just `<pageName>.md` when the page has no
 * category. Pages without a pageName are recorded as "untitled.md".
 */
var llmsTxtScraper = {
  type: "llms-txt",
  async scrape(config) {
    const { downloaded: pages, failed, duration } = await scrapeMarkdownDocs(getScraperOptions(config));
    const downloaded = pages.map(({ category, pageName, title }) => {
      const filename = `${pageName || "untitled"}.md`;
      return {
        success: true,
        // A missing/empty category means the file sits at the output root.
        path: category ? `${category}/${filename}` : filename,
        title
      };
    });
    return { downloaded, failed, duration };
  }
};
|
|
45
|
+
/**
 * Translate a SourceConfig into the options object consumed by
 * scrapeMarkdownDocs.
 *
 * Every result carries the same base settings (baseUrl, docsPath and
 * outputDir copied from `config`; concurrency 10; llms.txt discovery on;
 * docs-subdomain fallback off). `llmsPaths` and `linkPattern` are resolved
 * in this order:
 *   1. an explicit `config.llmsTxtPath` / `config.linkPattern`, if given;
 *   2. the built-in preset for `config.name` ("Claude Code", "Polymarket",
 *      "Bun");
 *   3. the generic defaults.
 *
 * Previously the named presets returned early and silently ignored explicit
 * overrides on the config; they now only fill in values the caller left out.
 *
 * @param config Source configuration (name, baseUrl, docsPath, outputDir,
 *               optional llmsTxtPath and linkPattern).
 * @returns Options object including `llmsPaths` and `linkPattern`.
 */
function getScraperOptions(config) {
  const baseOptions = {
    baseUrl: config.baseUrl,
    docsPath: config.docsPath,
    outputDir: config.outputDir,
    concurrency: 10,
    useLlms: true,
    tryDocsSubdomain: false
  };

  // Preset llms.txt locations, keyed by source name.
  let presetPaths;
  switch (config.name) {
    case "Claude Code":
      presetPaths = ["/docs/llms.txt"];
      break;
    case "Polymarket":
      presetPaths = ["/llms.txt"];
      break;
    case "Bun":
      presetPaths = ["/docs/llms.txt", "/llms.txt"];
      break;
    default:
      presetPaths = ["/llms.txt", "/docs/llms.txt"];
  }

  return {
    ...baseOptions,
    llmsPaths: config.llmsTxtPath ? [config.llmsTxtPath] : presetPaths,
    // Only "Claude Code" has a dedicated link pattern; all others are generic.
    linkPattern: config.linkPattern || (config.name === "Claude Code" ? CLAUDE_CODE_PATTERN : GENERIC_PATTERN)
  };
}
|
|
81
|
+
// src/scrapers/github-raw.ts
|
|
82
|
+
/**
 * Scraper that copies markdown files out of a GitHub repository.
 *
 * Lists the `.md` files under `config.docsPath` (leading slash removed) via
 * fetchGitHubMarkdownFiles, downloads each with fetchGitHubRawContent and
 * writes it to `config.outputDir` with saveFile. Files are processed one at
 * a time.
 *
 * Fixes over the previous version:
 *  - a file is reported as downloaded only after it was written; a failed
 *    write is recorded in `failed` instead of aborting the whole run and
 *    losing the record of files already saved;
 *  - an empty but successfully fetched file is no longer reported as a
 *    fetch failure;
 *  - the fallback title strips only a trailing ".md", not the first ".md"
 *    found anywhere in the file name.
 *
 * @throws Error when `config.github.repo` is missing, or when listing the
 *         repository directory fails.
 * @returns `{ downloaded, failed, duration }`, with duration in milliseconds.
 */
var githubRawScraper = {
  type: "github-raw",
  async scrape(config) {
    const startTime = Date.now();
    const downloaded = [];
    const failed = [];
    const repo = config.github?.repo;
    if (!repo) {
      throw new Error(`GitHub source "${config.name}" missing github.repo config`);
    }
    const files = await fetchGitHubMarkdownFiles(repo, config.docsPath.replace(/^\//, ""));
    for (const file of files) {
      const rawUrl = `https://raw.githubusercontent.com/${repo}/main/${file.path}`;
      const content = await fetchGitHubRawContent(repo, file.path);
      // null/undefined signals a failed fetch; "" is a valid (empty) file.
      if (content == null) {
        failed.push({ url: rawUrl, error: "Failed to fetch content" });
        continue;
      }
      try {
        await saveFile(config.outputDir, file.name, content);
      } catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        failed.push({ url: rawUrl, error: `Failed to save ${file.name}: ${reason}` });
        continue;
      }
      downloaded.push({
        success: true,
        path: file.name,
        title: extractTitle(content) || file.name.replace(/\.md$/, "")
      });
    }
    return {
      downloaded,
      failed,
      duration: Date.now() - startTime
    };
  }
};
|
|
115
|
+
/**
 * List the markdown files directly under `path` in a GitHub repository,
 * using the repository contents endpoint of the GitHub REST API.
 *
 * The endpoint answers with an array for a directory but with a single
 * object when `path` points at a file; both shapes are accepted here so a
 * file path no longer crashes with "contents.filter is not a function".
 *
 * @param repo Repository in "owner/name" form.
 * @param path Path inside the repository (no leading slash).
 * @returns Entries whose `type` is "file" and whose `name` ends in ".md".
 * @throws Error when the API responds with a non-2xx status.
 */
async function fetchGitHubMarkdownFiles(repo, path) {
  const url = `https://api.github.com/repos/${repo}/contents/${path}`;
  const response = await fetch(url, {
    headers: {
      Accept: "application/vnd.github.v3+json",
      "User-Agent": "@ebowwa/markdown-docs-scraper"
    }
  });
  if (!response.ok) {
    throw new Error(`GitHub API error: ${response.status} ${response.statusText}`);
  }
  const contents = await response.json();
  // Normalise the single-file response to a one-element list.
  const entries = Array.isArray(contents) ? contents : [contents];
  return entries.filter(
    (item) => item != null && item.type === "file" && typeof item.name === "string" && item.name.endsWith(".md")
  );
}
|
|
129
|
+
/**
 * Download one file from raw.githubusercontent.com as text.
 *
 * @param repo   Repository in "owner/name" form.
 * @param path   File path inside the repository.
 * @param branch Branch (or other ref) to read from. Defaults to "main",
 *               which was previously hard-coded, so existing two-argument
 *               callers behave exactly as before.
 * @returns The file body, or null when the response is not OK or the
 *          request itself fails (the error is logged, not thrown).
 */
async function fetchGitHubRawContent(repo, path, branch = "main") {
  const url = `https://raw.githubusercontent.com/${repo}/${branch}/${path}`;
  try {
    const response = await fetch(url, {
      headers: {
        Accept: "text/plain",
        "User-Agent": "@ebowwa/markdown-docs-scraper"
      }
    });
    if (!response.ok) {
      return null;
    }
    return await response.text();
  } catch (error) {
    // Best-effort download: report and let the caller record the failure.
    console.error(`Error fetching ${url}:`, error);
    return null;
  }
}
|
|
22
147
|
/**
 * Return the text of the first level-1 markdown heading ("# ..." at the
 * start of a line), trimmed, or null when the document has none.
 */
function extractTitle(markdown) {
  const [, heading] = markdown.match(/^#\s+(.+)$/m) ?? [];
  return heading === undefined ? null : heading.trim();
}
|
|
151
|
+
/**
 * Write `content` as UTF-8 to `<outputDir>/<filename>`, creating any missing
 * parent directories first.
 */
async function saveFile(outputDir, filename, content) {
  const { mkdir, writeFile } = await import("fs/promises");
  const { join, dirname } = await import("path");
  const target = join(outputDir, filename);
  await mkdir(dirname(target), { recursive: true });
  await writeFile(target, content, "utf-8");
}
|
|
158
|
+
// src/scrapers/registry.ts
|
|
159
|
+
/** Registry of scraper implementations, keyed by their `type`. */
var scrapers = new Map();

/** Register `scraper` under `scraper.type`, replacing any earlier entry for that type. */
function registerScraper(scraper) {
  const { type } = scraper;
  scrapers.set(type, scraper);
}

/** Look up the scraper registered for `type`; undefined when there is none. */
function getScraper(type) {
  return scrapers.get(type);
}

/**
 * Run the scraper registered for `config.sourceType` against `config`.
 * The returned promise rejects with an Error when no scraper is registered
 * for that type.
 */
async function scrapeSource(config) {
  const scraper = getScraper(config.sourceType);
  if (scraper) {
    return scraper.scrape(config);
  }
  throw new Error(`No scraper registered for type: ${config.sourceType}`);
}
|
|
173
|
+
registerScraper(llmsTxtScraper);
|
|
174
|
+
registerScraper(githubRawScraper);
|
|
175
|
+
// src/index.ts
|
|
176
|
+
var GENERIC_LINK_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;
|
|
177
|
+
/**
 * Return the trimmed text of the first "# ..." heading line in `markdown`,
 * or the placeholder "Untitled" when no such line exists.
 */
function extractTitle2(markdown) {
  const found = markdown.match(/^#\s+(.+)$/m);
  if (!found) {
    return "Untitled";
  }
  return found[1].trim();
}
|
|
@@ -67,7 +222,8 @@ class MarkdownDocsScraper {
|
|
|
67
222
|
onProgress: options.onProgress || (() => {}),
|
|
68
223
|
llmsPaths: options.llmsPaths || ["/llms.txt", "/docs/llms.txt"],
|
|
69
224
|
tryDocsSubdomain: options.tryDocsSubdomain ?? true,
|
|
70
|
-
linkPattern: options.linkPattern || GENERIC_LINK_PATTERN
|
|
225
|
+
linkPattern: options.linkPattern || GENERIC_LINK_PATTERN,
|
|
226
|
+
useDirectUrls: options.useDirectUrls ?? true
|
|
71
227
|
};
|
|
72
228
|
}
|
|
73
229
|
buildUrl(category, page) {
|
|
@@ -79,18 +235,18 @@ class MarkdownDocsScraper {
|
|
|
79
235
|
return `${this.options.baseUrl}/${page}.md`;
|
|
80
236
|
}
|
|
81
237
|
}
|
|
82
|
-
async downloadPage(
|
|
83
|
-
const url = this.buildUrl(category, page);
|
|
238
|
+
async downloadPage(pageInfo) {
|
|
239
|
+
const url = this.options.useDirectUrls && pageInfo.fullUrl ? pageInfo.fullUrl : this.buildUrl(pageInfo.category, pageInfo.page);
|
|
84
240
|
const content = await fetchMarkdown(url);
|
|
85
241
|
if (!content) {
|
|
86
242
|
return null;
|
|
87
243
|
}
|
|
88
244
|
return {
|
|
89
245
|
url,
|
|
90
|
-
title:
|
|
246
|
+
title: extractTitle2(content),
|
|
91
247
|
content,
|
|
92
|
-
category,
|
|
93
|
-
pageName: page
|
|
248
|
+
category: pageInfo.category,
|
|
249
|
+
pageName: pageInfo.page
|
|
94
250
|
};
|
|
95
251
|
}
|
|
96
252
|
getLlmsUrls() {
|
|
@@ -114,20 +270,24 @@ class MarkdownDocsScraper {
|
|
|
114
270
|
}
|
|
115
271
|
async fetchLlmsTxt() {
|
|
116
272
|
const urls = this.getLlmsUrls();
|
|
273
|
+
console.log(`DEBUG: Trying URLs: ${urls.join(", ")}`);
|
|
117
274
|
for (const llmsUrl of urls) {
|
|
118
275
|
try {
|
|
276
|
+
console.log(`DEBUG: Fetching ${llmsUrl}...`);
|
|
119
277
|
const response = await fetch(llmsUrl, {
|
|
120
278
|
headers: {
|
|
121
279
|
Accept: "text/plain",
|
|
122
280
|
"User-Agent": "@ebowwa/markdown-docs-scraper"
|
|
123
281
|
}
|
|
124
282
|
});
|
|
283
|
+
console.log(`DEBUG: Response status: ${response.status}`);
|
|
125
284
|
if (response.ok) {
|
|
126
285
|
const content = await response.text();
|
|
127
286
|
console.log(`Found llms.txt at ${llmsUrl}`);
|
|
128
287
|
return { content, url: llmsUrl };
|
|
129
288
|
}
|
|
130
289
|
} catch (error) {
|
|
290
|
+
console.log(`DEBUG: Error: ${error}`);
|
|
131
291
|
continue;
|
|
132
292
|
}
|
|
133
293
|
}
|
|
@@ -143,13 +303,16 @@ class MarkdownDocsScraper {
|
|
|
143
303
|
return pages;
|
|
144
304
|
}
|
|
145
305
|
const { content } = llmsResult;
|
|
146
|
-
const
|
|
306
|
+
const pattern = this.options.linkPattern;
|
|
307
|
+
const regex = new RegExp(pattern.source, pattern.flags);
|
|
147
308
|
let match;
|
|
309
|
+
console.log(`DEBUG: Using pattern: ${pattern.source}`);
|
|
310
|
+
console.log(`DEBUG: Content length: ${content.length}`);
|
|
148
311
|
while ((match = regex.exec(content)) !== null) {
|
|
149
|
-
const
|
|
312
|
+
const fullUrl = match[2];
|
|
150
313
|
const pagePath = match[3];
|
|
151
314
|
const { category, page } = parsePagePath(pagePath);
|
|
152
|
-
pages.push({ category, page });
|
|
315
|
+
pages.push({ category, page, fullUrl });
|
|
153
316
|
}
|
|
154
317
|
console.log(`Discovered ${pages.length} pages from llms.txt`);
|
|
155
318
|
} catch (error) {
|
|
@@ -169,14 +332,15 @@ class MarkdownDocsScraper {
|
|
|
169
332
|
console.log(`Scraping ${pages.length} discovered pages...`);
|
|
170
333
|
for (let i = 0;i < pages.length; i += this.options.concurrency) {
|
|
171
334
|
const batch = pages.slice(i, i + this.options.concurrency);
|
|
172
|
-
const results = await Promise.allSettled(batch.map((page) => this.downloadPage(page
|
|
335
|
+
const results = await Promise.allSettled(batch.map((page) => this.downloadPage(page)));
|
|
173
336
|
results.forEach((result, index) => {
|
|
174
337
|
const page = batch[index];
|
|
175
338
|
if (result.status === "fulfilled" && result.value) {
|
|
176
339
|
downloaded.push(result.value);
|
|
177
340
|
} else {
|
|
341
|
+
const url = this.options.useDirectUrls && page.fullUrl ? page.fullUrl : this.buildUrl(page.category, page.page);
|
|
178
342
|
failed.push({
|
|
179
|
-
url
|
|
343
|
+
url,
|
|
180
344
|
error: result.status === "rejected" ? result.reason : "Not found"
|
|
181
345
|
});
|
|
182
346
|
}
|
|
@@ -198,7 +362,7 @@ class MarkdownDocsScraper {
|
|
|
198
362
|
console.log(`Scraping ${total} pages from ${this.options.baseUrl}...`);
|
|
199
363
|
for (let i = 0;i < pages.length; i += this.options.concurrency) {
|
|
200
364
|
const batch = pages.slice(i, i + this.options.concurrency);
|
|
201
|
-
const results = await Promise.allSettled(batch.map((page) => this.downloadPage(page
|
|
365
|
+
const results = await Promise.allSettled(batch.map((page) => this.downloadPage({ ...page, fullUrl: "" })));
|
|
202
366
|
results.forEach((result, index) => {
|
|
203
367
|
const page = batch[index];
|
|
204
368
|
if (result.status === "fulfilled" && result.value) {
|
|
@@ -218,14 +382,28 @@ class MarkdownDocsScraper {
|
|
|
218
382
|
console.log(`⏱️ Duration: ${(duration / 1000).toFixed(2)}s`);
|
|
219
383
|
return { downloaded, failed, duration };
|
|
220
384
|
}
|
|
385
|
+
extractBody(content) {
|
|
386
|
+
const headerRegex = /^<!--\nSource: [^\n]+\nDownloaded: [^\n]+\n-->\n\n/;
|
|
387
|
+
return content.replace(headerRegex, "");
|
|
388
|
+
}
|
|
221
389
|
async savePages(pages) {
|
|
222
390
|
const fs = await import("fs/promises");
|
|
223
391
|
const path = await import("path");
|
|
392
|
+
let updated = 0;
|
|
393
|
+
let skipped = 0;
|
|
224
394
|
for (const page of pages) {
|
|
225
395
|
const nameToUse = page.pageName || page.url.split("/").pop()?.replace(".md", "") || "untitled";
|
|
226
396
|
const dir = page.category ? path.join(this.options.outputDir, page.category) : this.options.outputDir;
|
|
227
397
|
await fs.mkdir(dir, { recursive: true });
|
|
228
398
|
const filepath = path.join(dir, `${nameToUse}.md`);
|
|
399
|
+
try {
|
|
400
|
+
const existingContent = await fs.readFile(filepath, "utf-8");
|
|
401
|
+
const existingBody = this.extractBody(existingContent);
|
|
402
|
+
if (existingBody === page.content) {
|
|
403
|
+
skipped++;
|
|
404
|
+
continue;
|
|
405
|
+
}
|
|
406
|
+
} catch {}
|
|
229
407
|
const header = `<!--
|
|
230
408
|
Source: ${page.url}
|
|
231
409
|
Downloaded: ${new Date().toISOString()}
|
|
@@ -233,13 +411,15 @@ Downloaded: ${new Date().toISOString()}
|
|
|
233
411
|
|
|
234
412
|
`;
|
|
235
413
|
await fs.writeFile(filepath, header + page.content, "utf-8");
|
|
414
|
+
updated++;
|
|
236
415
|
}
|
|
416
|
+
return { updated, skipped };
|
|
237
417
|
}
|
|
238
418
|
getPagesToScrape() {
|
|
239
419
|
const pages = [];
|
|
240
420
|
for (const [category, pageList] of Object.entries(this.options.categories)) {
|
|
241
421
|
for (const page of pageList) {
|
|
242
|
-
pages.push({ category, page });
|
|
422
|
+
pages.push({ category, page, fullUrl: "" });
|
|
243
423
|
}
|
|
244
424
|
}
|
|
245
425
|
return pages;
|
|
@@ -248,22 +428,27 @@ Downloaded: ${new Date().toISOString()}
|
|
|
248
428
|
/**
 * One-call entry point: build a MarkdownDocsScraper from `options`, scrape
 * (via llms.txt discovery when `options.useLlms` is truthy, otherwise via
 * the configured categories) and, when `options.outputDir` is set, save the
 * downloaded pages.
 *
 * @returns The scrape result plus `saveStats` (`{ updated, skipped }` from
 *          savePages, or undefined when nothing was saved to disk).
 */
async function scrapeMarkdownDocs(options) {
  const scraper = new MarkdownDocsScraper(options);
  const result = await (options.useLlms ? scraper.scrapeFromLlms() : scraper.scrape());
  if (!options.outputDir) {
    return { ...result, saveStats: undefined };
  }
  const saveStats = await scraper.savePages(result.downloaded);
  const touchedAnything = saveStats.updated > 0 || saveStats.skipped > 0;
  if (touchedAnything) {
    console.log(` Saved: ${saveStats.updated} updated, ${saveStats.skipped} unchanged`);
  }
  return { ...result, saveStats };
}
|
|
256
|
-
var
|
|
257
|
-
var
|
|
440
|
+
var CLAUDE_CODE_PATTERN2 = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
|
|
441
|
+
var GENERIC_PATTERN2 = GENERIC_LINK_PATTERN;
|
|
258
442
|
/**
 * Ready-made scraper options for the Claude Code documentation site.
 * Pages are read from /docs/llms.txt and matched with CLAUDE_CODE_PATTERN2;
 * page URLs are rebuilt from baseUrl + docsPath rather than taken directly
 * from the index (`useDirectUrls: false`).
 *
 * @param outputDir Directory the scraped pages should be written to.
 */
function claudeCodeOptions(outputDir) {
  const site = {
    baseUrl: "https://code.claude.com",
    docsPath: "/docs/en",
    llmsPaths: ["/docs/llms.txt"],
    linkPattern: CLAUDE_CODE_PATTERN2
  };
  const behaviour = {
    concurrency: 10,
    tryDocsSubdomain: false,
    useDirectUrls: false
  };
  // Spread order keeps the property order of the original literal.
  return { ...site, outputDir, ...behaviour };
}
|
|
269
454
|
function polymarketOptions(outputDir) {
|
|
@@ -271,22 +456,30 @@ function polymarketOptions(outputDir) {
|
|
|
271
456
|
baseUrl: "https://docs.polymarket.com",
|
|
272
457
|
docsPath: "",
|
|
273
458
|
llmsPaths: ["/llms.txt"],
|
|
274
|
-
linkPattern:
|
|
459
|
+
linkPattern: GENERIC_PATTERN2,
|
|
275
460
|
outputDir,
|
|
276
461
|
concurrency: 10,
|
|
277
|
-
tryDocsSubdomain: false
|
|
462
|
+
tryDocsSubdomain: false,
|
|
463
|
+
useDirectUrls: true
|
|
278
464
|
};
|
|
279
465
|
}
|
|
280
466
|
var src_default = MarkdownDocsScraper;
|
|
281
467
|
export {
|
|
468
|
+
scrapeSource,
|
|
282
469
|
scrapeMarkdownDocs,
|
|
470
|
+
registerScraper,
|
|
283
471
|
polymarketOptions,
|
|
284
472
|
parsePagePath,
|
|
473
|
+
llmsTxtScraper,
|
|
474
|
+
githubRawScraper,
|
|
475
|
+
getScraper,
|
|
285
476
|
fetchMarkdown,
|
|
286
|
-
extractTitle,
|
|
477
|
+
extractTitle2 as extractTitle,
|
|
287
478
|
src_default as default,
|
|
288
479
|
claudeCodeOptions,
|
|
480
|
+
GENERIC_PATTERN as SCRAPER_GENERIC_PATTERN,
|
|
481
|
+
CLAUDE_CODE_PATTERN as SCRAPER_CLAUDE_CODE_PATTERN,
|
|
289
482
|
MarkdownDocsScraper,
|
|
290
|
-
GENERIC_PATTERN,
|
|
291
|
-
CLAUDE_CODE_PATTERN
|
|
483
|
+
GENERIC_PATTERN2 as GENERIC_PATTERN,
|
|
484
|
+
CLAUDE_CODE_PATTERN2 as CLAUDE_CODE_PATTERN
|
|
292
485
|
};
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GitHub Raw Scraper
|
|
3
|
+
*
|
|
4
|
+
* Downloads markdown files directly from GitHub repositories via raw content URLs.
|
|
5
|
+
* Uses GitHub API to list files, then fetches each from raw.githubusercontent.com
|
|
6
|
+
*/
|
|
7
|
+
import type { Scraper } from "./types";
|
|
8
|
+
export declare const githubRawScraper: Scraper;
|
|
9
|
+
//# sourceMappingURL=github-raw.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"github-raw.d.ts","sourceRoot":"","sources":["../../src/scrapers/github-raw.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,OAAO,EAA8C,MAAM,SAAS,CAAC;AAiBnF,eAAO,MAAM,gBAAgB,EAAE,OA6C9B,CAAC"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Scrapers Module
|
|
3
|
+
*
|
|
4
|
+
* Composable scraper architecture for multiple documentation source types.
|
|
5
|
+
* This module provides a registry-based system for different scraper implementations.
|
|
6
|
+
*/
|
|
7
|
+
export type { SourceType, SourceConfig, Scraper, ScrapeResult, DownloadResult } from "./types";
|
|
8
|
+
export { llmsTxtScraper, CLAUDE_CODE_PATTERN, GENERIC_PATTERN } from "./llms-txt";
|
|
9
|
+
export { githubRawScraper } from "./github-raw";
|
|
10
|
+
export { registerScraper, getScraper, scrapeSource } from "./registry";
|
|
11
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/scrapers/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,YAAY,EAAE,UAAU,EAAE,YAAY,EAAE,OAAO,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,SAAS,CAAC;AAG/F,OAAO,EAAE,cAAc,EAAE,mBAAmB,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAClF,OAAO,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAGhD,OAAO,EAAE,eAAe,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC"}
|