@ebowwa/markdown-docs-scraper 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +104 -0
- package/dist/cli.d.ts +6 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +19 -13
- package/dist/index.d.ts +116 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +323 -105
- package/dist/scrapers/github-raw.d.ts +9 -0
- package/dist/scrapers/github-raw.d.ts.map +1 -0
- package/dist/scrapers/index.d.ts +11 -0
- package/dist/scrapers/index.d.ts.map +1 -0
- package/dist/scrapers/index.js +428 -0
- package/dist/scrapers/llms-txt.d.ts +13 -0
- package/dist/scrapers/llms-txt.d.ts.map +1 -0
- package/dist/scrapers/registry.d.ts +23 -0
- package/dist/scrapers/registry.d.ts.map +1 -0
- package/dist/scrapers/types.d.ts +57 -0
- package/dist/scrapers/types.d.ts.map +1 -0
- package/package.json +10 -2
- package/src/cli.js +160 -0
- package/src/cli.ts +12 -1
- package/src/index.js +487 -0
- package/src/index.ts +276 -158
- package/src/scrapers/github-raw.ts +154 -0
- package/src/scrapers/index.ts +16 -0
- package/src/scrapers/llms-txt.ts +101 -0
- package/src/scrapers/registry.ts +55 -0
- package/src/scrapers/types.ts +79 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Scrapers Module
|
|
3
|
+
*
|
|
4
|
+
* Composable scraper architecture for multiple documentation source types.
|
|
5
|
+
* This module provides a registry-based system for different scraper implementations.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// Types
|
|
9
|
+
export type { SourceType, SourceConfig, Scraper, ScrapeResult, DownloadResult } from "./types";
|
|
10
|
+
|
|
11
|
+
// Scrapers
|
|
12
|
+
export { llmsTxtScraper, CLAUDE_CODE_PATTERN, GENERIC_PATTERN } from "./llms-txt";
|
|
13
|
+
export { githubRawScraper } from "./github-raw";
|
|
14
|
+
|
|
15
|
+
// Registry
|
|
16
|
+
export { registerScraper, getScraper, scrapeSource } from "./registry";
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLMS-TXT Scraper
|
|
3
|
+
*
|
|
4
|
+
* Scrapes documentation sites that provide llms.txt index files.
|
|
5
|
+
* Uses the core MarkdownDocsScraper under the hood.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { scrapeMarkdownDocs, type DocPage } from "../index";
|
|
9
|
+
import type { Scraper, SourceConfig, ScrapeResult, DownloadResult } from "./types";
|
|
10
|
+
|
|
11
|
+
// ============================================================================
|
|
12
|
+
// URL PATTERNS
|
|
13
|
+
// ============================================================================
|
|
14
|
+
|
|
15
|
+
/**
 * Pattern for Claude Code docs: /docs/en/page.md
 *
 * Matches markdown links of the form `[text](url)` where the URL is http(s)
 * and contains `/docs/en/` followed by a path ending in `.md`.
 *
 * Capture groups:
 *   1 - link text
 *   2 - full URL
 *   3 - path after `/docs/en/` (including the `.md` suffix)
 *
 * NOTE(review): the pattern is global (`g`), so this shared instance carries
 * `lastIndex` state across `exec`/`test` calls - confirm consumers either run
 * it to exhaustion, use `matchAll`, or clone it before use.
 */
export const CLAUDE_CODE_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^\s)]+\/docs\/en\/([^)]+\.md))\)/g;
|
|
17
|
+
|
|
18
|
+
/**
 * Pattern for generic docs: any domain/path.md
 *
 * Matches markdown links of the form `[text](url)` where the URL is http(s)
 * and the path after the host ends in `.md`.
 *
 * Capture groups:
 *   1 - link text
 *   2 - full URL
 *   3 - path after the first `/` following the host (including `.md`)
 *
 * NOTE(review): the pattern is global (`g`), so this shared instance carries
 * `lastIndex` state across `exec`/`test` calls - confirm consumers either run
 * it to exhaustion, use `matchAll`, or clone it before use.
 */
export const GENERIC_PATTERN = /\[([^\]]+)\]\((https?:\/\/[^/]+\/([^\s)]+\.md))\)/g;
|
|
20
|
+
|
|
21
|
+
// ============================================================================
|
|
22
|
+
// LLMS-TXT SCRAPER
|
|
23
|
+
// ============================================================================
|
|
24
|
+
|
|
25
|
+
/**
 * Scraper implementation for "llms-txt" sources.
 *
 * Builds scraper options from the source config, runs `scrapeMarkdownDocs`,
 * and converts each returned page into a `DownloadResult`.
 */
export const llmsTxtScraper: Scraper = {
  type: "llms-txt",

  async scrape(config: SourceConfig): Promise<ScrapeResult> {
    const outcome = await scrapeMarkdownDocs(getScraperOptions(config));

    // Map a DocPage to a DownloadResult; the path is "<category>/<pageName>.md",
    // or just "<pageName>.md" when the page has no category.
    const toDownloadResult = (page: DocPage): DownloadResult => {
      const folder = page.category || "";
      const file = `${page.pageName || "untitled"}.md`;

      return {
        success: true,
        path: folder ? `${folder}/${file}` : file,
        title: page.title,
      };
    };

    const downloaded: DownloadResult[] = outcome.downloaded.map((page: DocPage) => toDownloadResult(page));

    return {
      downloaded,
      failed: outcome.failed,
      duration: outcome.duration,
    };
  },
};
|
|
52
|
+
|
|
53
|
+
// ============================================================================
|
|
54
|
+
// OPTIONS BUILDER
|
|
55
|
+
// ============================================================================
|
|
56
|
+
|
|
57
|
+
/**
 * Build the options object passed to `scrapeMarkdownDocs` for a source.
 *
 * Every result carries the same base settings (URLs/paths from the config,
 * concurrency 10, llms.txt enabled, no docs-subdomain probing). The llms.txt
 * paths and link pattern are chosen by exact match on `config.name`; sources
 * without a preset fall back to `config.llmsTxtPath` / `config.linkPattern`,
 * or to the common paths and the generic pattern when those are not set.
 */
function getScraperOptions(config: SourceConfig) {
  const shared = {
    baseUrl: config.baseUrl,
    docsPath: config.docsPath,
    outputDir: config.outputDir,
    concurrency: 10,
    useLlms: true,
    tryDocsSubdomain: false,
  };

  switch (config.name) {
    case "Claude Code":
      return {
        ...shared,
        llmsPaths: ["/docs/llms.txt"],
        linkPattern: CLAUDE_CODE_PATTERN,
      };

    case "Polymarket":
      return {
        ...shared,
        llmsPaths: ["/llms.txt"],
        linkPattern: GENERIC_PATTERN,
      };

    case "Bun":
      return {
        ...shared,
        llmsPaths: ["/docs/llms.txt", "/llms.txt"],
        linkPattern: GENERIC_PATTERN,
      };

    default: {
      // No preset: honour the caller's explicit path/pattern when provided.
      const llmsPaths = config.llmsTxtPath ? [config.llmsTxtPath] : ["/llms.txt", "/docs/llms.txt"];
      const linkPattern = config.linkPattern || GENERIC_PATTERN;

      return {
        ...shared,
        llmsPaths,
        linkPattern,
      };
    }
  }
}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Scraper Registry
|
|
3
|
+
*
|
|
4
|
+
* Maps source types to scraper implementations.
|
|
5
|
+
* Allows registering new scrapers and looking them up by type.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { Scraper, SourceType, SourceConfig, ScrapeResult } from "./types";
|
|
9
|
+
import { llmsTxtScraper } from "./llms-txt";
|
|
10
|
+
import { githubRawScraper } from "./github-raw";
|
|
11
|
+
|
|
12
|
+
// ============================================================================
|
|
13
|
+
// SCRAPER REGISTRY
|
|
14
|
+
// ============================================================================
|
|
15
|
+
|
|
16
|
+
/** Module-private lookup table: source type -> scraper implementation. */
const scrapers = new Map<SourceType, Scraper>();
|
|
18
|
+
|
|
19
|
+
/**
 * Add a scraper to the registry under its own `type`.
 *
 * A scraper already stored under the same type is replaced.
 *
 * @param scraper - Implementation to register.
 */
export function registerScraper(scraper: Scraper): void {
  const { type } = scraper;
  scrapers.set(type, scraper);
}
|
|
25
|
+
|
|
26
|
+
/**
 * Look up the scraper registered for a source type.
 *
 * @param type - Source type to look up.
 * @returns The registered scraper, or `undefined` when none is registered.
 */
export function getScraper(type: SourceType): Scraper | undefined {
  const registered = scrapers.get(type);
  return registered;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Run the scraper registered for `config.sourceType` against the given source.
 *
 * @param config - Source to scrape; its `sourceType` selects the scraper.
 * @returns The result produced by the selected scraper.
 * @throws Error when no scraper is registered for `config.sourceType`.
 */
export async function scrapeSource(config: SourceConfig): Promise<ScrapeResult> {
  const handler = scrapers.get(config.sourceType);

  if (handler) {
    return handler.scrape(config);
  }

  throw new Error(`No scraper registered for type: ${config.sourceType}`);
}
|
|
45
|
+
|
|
46
|
+
// ============================================================================
|
|
47
|
+
// DEFAULT REGISTRATIONS
|
|
48
|
+
// ============================================================================
|
|
49
|
+
|
|
50
|
+
// Built-in scrapers are registered at module load, in this order.
for (const builtIn of [llmsTxtScraper, githubRawScraper]) {
  registerScraper(builtIn);
}

// Re-exported so callers can use a scraper directly without the registry.
export { llmsTxtScraper, githubRawScraper };
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Scraper Types
|
|
3
|
+
*
|
|
4
|
+
* Core types for the composable scraper architecture.
|
|
5
|
+
* These types define the interface that all scrapers must implement.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// ============================================================================
|
|
9
|
+
// SOURCE TYPES
|
|
10
|
+
// ============================================================================
|
|
11
|
+
|
|
12
|
+
/**
 * Supported documentation source types.
 *
 * Used as the `type` of a `Scraper` and as `SourceConfig.sourceType`.
 */
export type SourceType = "llms-txt" | "github-raw";
|
|
14
|
+
|
|
15
|
+
// ============================================================================
|
|
16
|
+
// SCRAPER INTERFACE
|
|
17
|
+
// ============================================================================
|
|
18
|
+
|
|
19
|
+
/** Result from scraping a source */
export interface ScrapeResult {
  /** One entry per downloaded document. */
  downloaded: DownloadResult[];
  /** One entry per URL that failed, with its error message. */
  failed: Array<{ url: string; error: string }>;
  /** Optional scrape duration; units are not specified here - TODO confirm against the core scraper. */
  duration?: number;
}
|
|
25
|
+
|
|
26
|
+
/** Individual download result */
export interface DownloadResult {
  /** Success flag for this entry. */
  success: boolean;
  /** Path of the document; presumably relative to the source's output directory - TODO confirm. */
  path: string;
  /** Optional document title. */
  title?: string;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Scraper interface - all scrapers must implement this.
 *
 * A scraper is identified by its `type` and exposes a single asynchronous
 * `scrape` operation.
 */
export interface Scraper {
  /** Source type identifier */
  type: SourceType;

  /** Scrape documentation from a source */
  scrape(config: SourceConfig): Promise<ScrapeResult>;
}
|
|
41
|
+
|
|
42
|
+
// ============================================================================
|
|
43
|
+
// SOURCE CONFIG
|
|
44
|
+
// ============================================================================
|
|
45
|
+
|
|
46
|
+
/**
 * Source configuration.
 *
 * Describes one documentation source; `sourceType` decides which scraper
 * handles it.
 */
export interface SourceConfig {
  /**
   * Display name.
   *
   * NOTE: the llms-txt option builder also matches this value exactly against
   * "Claude Code", "Polymarket" and "Bun" to select preset llms.txt paths and
   * link patterns.
   */
  name: string;

  /** Source type - determines which scraper to use */
  sourceType: SourceType;

  /** Base URL for the documentation */
  baseUrl: string;

  /** Path to docs (e.g., /docs, /docs/en) */
  docsPath: string;

  /** Output directory for downloaded docs */
  outputDir: string;

  /** Output directory for daily reports */
  reportDir: string;

  /**
   * llms.txt path (for llms-txt sources).
   *
   * Only consulted by the llms-txt option builder when `name` has no preset;
   * when omitted, "/llms.txt" and "/docs/llms.txt" are tried.
   */
  llmsTxtPath?: string;

  /**
   * Custom link pattern for llms.txt parsing.
   *
   * Only consulted by the llms-txt option builder when `name` has no preset;
   * when omitted, the generic pattern is used.
   */
  linkPattern?: RegExp;

  /** GitHub config (for github-raw sources or GitHub API data) */
  github?: {
    // NOTE(review): presumably an "owner/repo" identifier - confirm against the github-raw scraper.
    repo: string;
    includeCommits: boolean;
    includeReleases: boolean;
    includePRs: boolean;
  };
}
|