@ebowwa/markdown-docs-scraper 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +148 -0
- package/dist/cli.js +2457 -0
- package/dist/index.js +247 -0
- package/package.json +51 -0
- package/src/cli.ts +99 -0
- package/src/index.ts +382 -0
package/README.md
ADDED
|
@@ -0,0 +1,148 @@
# @ebowwa/markdown-docs-scraper

> Scrape and mirror markdown-based documentation sites

## Features

- 📥 Download full markdown documentation
- 🔄 Organize into directory structure
- 📊 Track downloads and failures
- 🚀 Fast concurrent downloads
- 🎯 CLI and programmatic API

## Installation

```bash
bun add @ebowwa/markdown-docs-scraper
# or
npm install @ebowwa/markdown-docs-scraper
```

## CLI Usage

### Quick Start - Anthropic Docs

```bash
markdown-docs-scraper anthropic -o ./docs
```

### Scrape Any Site

```bash
markdown-docs-scraper scrape -u https://docs.example.com -o ./docs
```

### Discover Available Pages

```bash
markdown-docs-scraper discover -u https://code.claude.com
```

### Options

```
Commands:
  scrape      Scrape documentation from a URL
  discover    Discover all available documentation pages
  anthropic   Quick scrape of Anthropic Claude Code docs
  help        Display help for command

Options:
  -u, --url <url>          Base URL of the documentation site
  -o, --output <dir>       Output directory (default: "./docs")
  --docs-path <path>       Docs path (default: "/docs/en")
  -c, --concurrency <num>  Concurrency level (default: "5")
```

## Programmatic Usage

```typescript
import { MarkdownDocsScraper } from "@ebowwa/markdown-docs-scraper";

const scraper = new MarkdownDocsScraper({
  baseUrl: "https://code.claude.com",
  docsPath: "/docs/en",
  categories: {
    "getting-started": ["introduction", "installation", "quick-start"],
    features: ["inline-edits", "tool-use", "file-operations"],
  },
  outputDir: "./docs",
  concurrency: 5,
});

const result = await scraper.scrape();
console.log(`Downloaded: ${result.downloaded.length}`);
console.log(`Failed: ${result.failed.length}`);

// Save pages to disk
await scraper.savePages(result.downloaded);
```

### Convenience Function

```typescript
import { scrapeMarkdownDocs } from "@ebowwa/markdown-docs-scraper";

const result = await scrapeMarkdownDocs({
  baseUrl: "https://docs.example.com",
  outputDir: "./docs",
});
```

## API

### `MarkdownDocsScraper`

#### Constructor Options

```typescript
interface ScraperOptions {
  baseUrl: string; // Base URL of the documentation site
  docsPath?: string; // Docs path (default: "/docs/en")
  categories?: Record<string, string[]>; // Categories and pages
  outputDir?: string; // Output directory (default: "./docs")
  concurrency?: number; // Concurrent downloads (default: 5)
  onProgress?: (current: number, total: number) => void;
}
```

#### Methods

- `scrape()` - Scrape all configured pages
- `fetchMarkdown(url)` - Fetch markdown from a URL
- `downloadPage(category, page)` - Download a single page
- `savePages(pages)` - Save pages to disk
- `discoverPages()` - Discover available pages

#### Result

```typescript
interface ScraperResult {
  downloaded: DocPage[]; // Successfully downloaded pages
  failed: Array<{ url: string; error: string }>;
  duration: number; // Duration in milliseconds
}
```

## Output Format

Each downloaded file includes a header comment:

```markdown
<!--
Source: https://code.claude.com/docs/en/introduction.md
Downloaded: 2026-02-06T00:00:00.000Z
-->

# Introduction

Original markdown content...
```

## License

MIT

## Contributing

This package is part of the [codespaces](https://github.com/ebowwa/codespaces) monorepo.