webpeel 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +415 -0
- package/dist/cli.d.ts +16 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +140 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/fetcher.d.ts +32 -0
- package/dist/core/fetcher.d.ts.map +1 -0
- package/dist/core/fetcher.js +479 -0
- package/dist/core/fetcher.js.map +1 -0
- package/dist/core/markdown.d.ts +17 -0
- package/dist/core/markdown.d.ts.map +1 -0
- package/dist/core/markdown.js +143 -0
- package/dist/core/markdown.js.map +1 -0
- package/dist/core/metadata.d.ts +17 -0
- package/dist/core/metadata.d.ts.map +1 -0
- package/dist/core/metadata.js +159 -0
- package/dist/core/metadata.js.map +1 -0
- package/dist/core/strategies.d.ts +30 -0
- package/dist/core/strategies.d.ts.map +1 -0
- package/dist/core/strategies.js +67 -0
- package/dist/core/strategies.js.map +1 -0
- package/dist/index.d.ts +31 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +81 -0
- package/dist/index.js.map +1 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.d.ts.map +1 -0
- package/dist/mcp/server.js +248 -0
- package/dist/mcp/server.js.map +1 -0
- package/dist/server/app.d.ts +13 -0
- package/dist/server/app.d.ts.map +1 -0
- package/dist/server/app.js +89 -0
- package/dist/server/app.js.map +1 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.d.ts.map +1 -0
- package/dist/server/auth-store.js +87 -0
- package/dist/server/auth-store.js.map +1 -0
- package/dist/server/middleware/auth.d.ts +18 -0
- package/dist/server/middleware/auth.d.ts.map +1 -0
- package/dist/server/middleware/auth.js +55 -0
- package/dist/server/middleware/auth.js.map +1 -0
- package/dist/server/middleware/rate-limit.d.ts +23 -0
- package/dist/server/middleware/rate-limit.d.ts.map +1 -0
- package/dist/server/middleware/rate-limit.js +85 -0
- package/dist/server/middleware/rate-limit.js.map +1 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.d.ts.map +1 -0
- package/dist/server/routes/fetch.js +127 -0
- package/dist/server/routes/fetch.js.map +1 -0
- package/dist/server/routes/health.d.ts +6 -0
- package/dist/server/routes/health.d.ts.map +1 -0
- package/dist/server/routes/health.js +19 -0
- package/dist/server/routes/health.js.map +1 -0
- package/dist/server/routes/search.d.ts +7 -0
- package/dist/server/routes/search.d.ts.map +1 -0
- package/dist/server/routes/search.js +124 -0
- package/dist/server/routes/search.js.map +1 -0
- package/dist/types.d.ts +59 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +30 -0
- package/dist/types.js.map +1 -0
- package/llms.txt +60 -0
- package/package.json +80 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Jake Liu
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,415 @@
|
|
|
1
|
+
# WebPeel
|
|
2
|
+
|
|
3
|
+
[npm version](https://www.npmjs.com/package/webpeel)
|
|
4
|
+
[npm downloads](https://www.npmjs.com/package/webpeel)
|
|
5
|
+
[CI status](https://github.com/JakeLiuMe/webpeel/actions/workflows/ci.yml)
|
|
6
|
+
[TypeScript](https://www.typescriptlang.org/)
|
|
7
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
8
|
+
|
|
9
|
+
Turn any web page into clean markdown. Zero config. Free forever.
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
npx webpeel https://news.ycombinator.com
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
**Output:**
|
|
16
|
+
```markdown
|
|
17
|
+
# Hacker News
|
|
18
|
+
|
|
19
|
+
**New** | **Past** | **Comments** | **Ask** | **Show** | **Jobs** | **Submit**
|
|
20
|
+
|
|
21
|
+
## Top Stories
|
|
22
|
+
|
|
23
|
+
1. **Show HN: WebPeel – Turn any webpage into AI-ready markdown**
|
|
24
|
+
[https://github.com/JakeLiuMe/webpeel](https://github.com/JakeLiuMe/webpeel)
|
|
25
|
+
142 points by jakeliu 2 hours ago | 31 comments
|
|
26
|
+
|
|
27
|
+
2. **The End of the API Era**
|
|
28
|
+
...
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Why WebPeel?
|
|
34
|
+
|
|
35
|
+
| | **WebPeel** | Firecrawl | Jina Reader | MCP Fetch |
|
|
36
|
+
|---|:---:|:---:|:---:|:---:|
|
|
37
|
+
| **Local execution** | ✅ Free forever | ❌ Cloud only | ❌ Cloud only | ✅ Free |
|
|
38
|
+
| **JS rendering** | ✅ Auto-escalates | ✅ Always | ❌ No | ❌ No |
|
|
39
|
+
| **Anti-bot handling** | ✅ Stealth mode | ✅ Yes | ⚠️ Limited | ❌ No |
|
|
40
|
+
| **MCP Server** | ✅ Built-in | ✅ Separate repo | ❌ No | ✅ Yes |
|
|
41
|
+
| **Zero config** | ✅ `npx webpeel` | ❌ API key required | ❌ API key required | ✅ Yes |
|
|
42
|
+
| **Free tier** | ∞ Unlimited local | 500 pages/month | 1000 req/month | ∞ Local only |
|
|
43
|
+
| **Hosted API** | Coming soon | $16/mo (Starter) | $200/mo (Starter) | N/A |
|
|
44
|
+
| **Markdown output** | ✅ Optimized for AI | ✅ Yes | ✅ Yes | ⚠️ Basic |
|
|
45
|
+
|
|
46
|
+
**WebPeel gives you Firecrawl's power without the price tag.** Run locally for free, or use our hosted API when you need scale.
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## Quick Start
|
|
51
|
+
|
|
52
|
+
### CLI (Zero Install)
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
# Basic usage
|
|
56
|
+
npx webpeel https://example.com
|
|
57
|
+
|
|
58
|
+
# JSON output with metadata
|
|
59
|
+
npx webpeel https://example.com --json
|
|
60
|
+
|
|
61
|
+
# Force browser rendering (for JS-heavy sites)
|
|
62
|
+
npx webpeel https://x.com/elonmusk --render
|
|
63
|
+
|
|
64
|
+
# Wait for dynamic content
|
|
65
|
+
npx webpeel https://example.com --render --wait 3000
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Library (TypeScript)
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
npm install webpeel
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
```typescript
|
|
75
|
+
import { peel } from 'webpeel';
|
|
76
|
+
|
|
77
|
+
// Simple usage
|
|
78
|
+
const result = await peel('https://example.com');
|
|
79
|
+
console.log(result.content); // Clean markdown
|
|
80
|
+
console.log(result.metadata); // { title, description, author, ... }
|
|
81
|
+
console.log(result.tokens); // Estimated token count
|
|
82
|
+
|
|
83
|
+
// With options
|
|
84
|
+
const rendered = await peel('https://example.com', {
|
|
85
|
+
format: 'markdown', // 'markdown' | 'text' | 'html'
|
|
86
|
+
render: true, // Force browser mode
|
|
87
|
+
wait: 3000, // Wait 3s for dynamic content
|
|
88
|
+
timeout: 30000, // Request timeout (ms)
|
|
89
|
+
});
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### MCP Server (Claude Desktop, Cursor, VS Code)
|
|
93
|
+
|
|
94
|
+
WebPeel provides two MCP tools: `webpeel_fetch` (fetch a URL) and `webpeel_search` (DuckDuckGo search + fetch results).
|
|
95
|
+
|
|
96
|
+
#### Claude Desktop
|
|
97
|
+
|
|
98
|
+
Add to `~/Library/Application Support/Claude/claude_desktop_config.json` (macOS) or `%APPDATA%\Claude\claude_desktop_config.json` (Windows):
|
|
99
|
+
|
|
100
|
+
```json
|
|
101
|
+
{
|
|
102
|
+
"mcpServers": {
|
|
103
|
+
"webpeel": {
|
|
104
|
+
"command": "npx",
|
|
105
|
+
"args": ["-y", "webpeel", "mcp"]
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
#### Cursor
|
|
112
|
+
|
|
113
|
+
Add to Cursor Settings → MCP Servers:
|
|
114
|
+
|
|
115
|
+
```json
|
|
116
|
+
{
|
|
117
|
+
"mcpServers": {
|
|
118
|
+
"webpeel": {
|
|
119
|
+
"command": "npx",
|
|
120
|
+
"args": ["-y", "webpeel", "mcp"]
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
#### VS Code (with Cline or other MCP clients)
|
|
127
|
+
|
|
128
|
+
Create or edit `~/.vscode/mcp.json`:
|
|
129
|
+
|
|
130
|
+
```json
|
|
131
|
+
{
|
|
132
|
+
"mcpServers": {
|
|
133
|
+
"webpeel": {
|
|
134
|
+
"command": "npx",
|
|
135
|
+
"args": ["-y", "webpeel", "mcp"]
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
Or install with one click:
|
|
142
|
+
|
|
143
|
+
[Install in Claude Desktop](https://mcp.so/install/webpeel?for=claude)
|
|
144
|
+
[Install in VS Code](https://mcp.so/install/webpeel?for=vscode)
|
|
145
|
+
|
|
146
|
+
---
|
|
147
|
+
|
|
148
|
+
## How It Works: Smart Escalation
|
|
149
|
+
|
|
150
|
+
WebPeel tries the fastest method first, then escalates only when needed:
|
|
151
|
+
|
|
152
|
+
```
|
|
153
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
154
|
+
│ Smart Escalation │
|
|
155
|
+
└─────────────────────────────────────────────────────────────┘
|
|
156
|
+
|
|
157
|
+
Simple HTTP Fetch Browser Rendering Stealth Mode
|
|
158
|
+
~200ms ~2 seconds ~5 seconds
|
|
159
|
+
│ │ │
|
|
160
|
+
├─ User-Agent headers ├─ Full JS execution ├─ Anti-detect
|
|
161
|
+
├─ Cheerio parsing ├─ Wait for content ├─ Proxy rotation
|
|
162
|
+
├─ Fast & cheap ├─ Screenshots └─ Cloudflare bypass
|
|
163
|
+
│ │
|
|
164
|
+
▼ ▼
|
|
165
|
+
Works for 80% Works for 19% Works for 1%
|
|
166
|
+
of websites (JS-heavy sites) (heavily protected)
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
**Why this matters:**
|
|
170
|
+
- **Speed**: Don't waste 2 seconds rendering when 200ms will do
|
|
171
|
+
- **Cost**: Headless browsers burn CPU and memory
|
|
172
|
+
- **Reliability**: Auto-retry with browser if simple fetch fails
|
|
173
|
+
|
|
174
|
+
WebPeel automatically detects blocked requests (403, 503, Cloudflare challenges) and retries with browser mode. You get the best of both worlds.
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
## API Reference
|
|
179
|
+
|
|
180
|
+
### `peel(url, options?)`
|
|
181
|
+
|
|
182
|
+
Fetch and extract content from a URL.
|
|
183
|
+
|
|
184
|
+
```typescript
|
|
185
|
+
interface PeelOptions {
|
|
186
|
+
render?: boolean; // Force browser mode (default: false)
|
|
187
|
+
wait?: number; // Wait time after page load in ms (default: 0)
|
|
188
|
+
format?: 'markdown' | 'text' | 'html'; // Output format (default: 'markdown')
|
|
189
|
+
timeout?: number; // Request timeout in ms (default: 30000)
|
|
190
|
+
userAgent?: string; // Custom user agent
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
interface PeelResult {
|
|
194
|
+
url: string; // Final URL (after redirects)
|
|
195
|
+
title: string; // Page title
|
|
196
|
+
content: string; // Page content in requested format
|
|
197
|
+
metadata: { // Extracted metadata
|
|
198
|
+
description?: string;
|
|
199
|
+
author?: string;
|
|
200
|
+
published?: string; // ISO 8601 date
|
|
201
|
+
image?: string; // Open Graph image
|
|
202
|
+
canonical?: string;
|
|
203
|
+
};
|
|
204
|
+
links: string[]; // All links on page (absolute URLs)
|
|
205
|
+
tokens: number; // Estimated token count
|
|
206
|
+
method: 'simple' | 'browser'; // Method used
|
|
207
|
+
elapsed: number; // Time taken (ms)
|
|
208
|
+
}
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
### Error Types
|
|
212
|
+
|
|
213
|
+
```typescript
|
|
214
|
+
import { peel, TimeoutError, BlockedError, NetworkError } from 'webpeel';
|
|
215
|
+
|
|
216
|
+
try {
|
|
217
|
+
const result = await peel('https://example.com');
|
|
218
|
+
} catch (error) {
|
|
219
|
+
if (error instanceof TimeoutError) {
|
|
220
|
+
// Request timed out
|
|
221
|
+
} else if (error instanceof BlockedError) {
|
|
222
|
+
// Site blocked the request (403, Cloudflare, etc.)
|
|
223
|
+
} else if (error instanceof NetworkError) {
|
|
224
|
+
// Network/DNS error
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
### `cleanup()`
|
|
230
|
+
|
|
231
|
+
Clean up browser resources. Call this when you're done using WebPeel in your application:
|
|
232
|
+
|
|
233
|
+
```typescript
|
|
234
|
+
import { peel, cleanup } from 'webpeel';
|
|
235
|
+
|
|
236
|
+
// ... use peel() ...
|
|
237
|
+
|
|
238
|
+
await cleanup(); // Close browser instances
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
---
|
|
242
|
+
|
|
243
|
+
## Hosted API (Coming Soon)
|
|
244
|
+
|
|
245
|
+
Run WebPeel locally for free, or use our hosted API for scale:
|
|
246
|
+
|
|
247
|
+
| Plan | Price | Requests/Month | Features |
|
|
248
|
+
|------|------:|---------------:|----------|
|
|
249
|
+
| **Free** | $0 | Unlimited local | CLI, library, MCP server |
|
|
250
|
+
| **Hosted Free** | $0 | 1,000 | API access, no credit card |
|
|
251
|
+
| **Pro** | $9 | 50,000 | Priority queue, 99.9% SLA |
|
|
252
|
+
| **Scale** | $29 | 250,000 | Dedicated instances, webhook support |
|
|
253
|
+
|
|
254
|
+
**Compare:** Firecrawl Starter is $16/mo for 3,000 requests. Our Pro tier gives you 50,000 for $9/mo.
|
|
255
|
+
|
|
256
|
+
Join the waitlist at [webpeel.dev](https://webpeel.dev)
|
|
257
|
+
|
|
258
|
+
---
|
|
259
|
+
|
|
260
|
+
## Examples
|
|
261
|
+
|
|
262
|
+
### Extract blog post metadata
|
|
263
|
+
|
|
264
|
+
```typescript
|
|
265
|
+
const result = await peel('https://example.com/blog/post');
|
|
266
|
+
|
|
267
|
+
console.log(result.metadata);
|
|
268
|
+
// {
|
|
269
|
+
// title: "How We Built WebPeel",
|
|
270
|
+
// description: "A deep dive into smart escalation...",
|
|
271
|
+
// author: "Jake Liu",
|
|
272
|
+
// published: "2026-02-12T18:00:00Z",
|
|
273
|
+
// image: "https://example.com/og-image.png"
|
|
274
|
+
// }
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
### Get all links from a page
|
|
278
|
+
|
|
279
|
+
```typescript
|
|
280
|
+
const result = await peel('https://news.ycombinator.com');
|
|
281
|
+
|
|
282
|
+
console.log(result.links.slice(0, 5));
|
|
283
|
+
// [
|
|
284
|
+
// "https://news.ycombinator.com/newest",
|
|
285
|
+
// "https://news.ycombinator.com/submit",
|
|
286
|
+
// "https://github.com/example/repo",
|
|
287
|
+
// ...
|
|
288
|
+
// ]
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
### Force browser rendering for JavaScript-heavy sites
|
|
292
|
+
|
|
293
|
+
```typescript
|
|
294
|
+
// Twitter/X requires JavaScript
|
|
295
|
+
const result = await peel('https://x.com/elonmusk', {
|
|
296
|
+
render: true,
|
|
297
|
+
wait: 2000, // Wait for tweets to load
|
|
298
|
+
});
|
|
299
|
+
|
|
300
|
+
console.log(result.content); // Rendered tweet content
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
### Token counting for LLM usage
|
|
304
|
+
|
|
305
|
+
```typescript
|
|
306
|
+
const result = await peel('https://example.com/long-article');
|
|
307
|
+
|
|
308
|
+
console.log(`Content is ~${result.tokens} tokens`);
|
|
309
|
+
// Content is ~3247 tokens
|
|
310
|
+
|
|
311
|
+
if (result.tokens > 4000) {
|
|
312
|
+
console.log('Too long for GPT-3.5 context window');
|
|
313
|
+
}
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
---
|
|
317
|
+
|
|
318
|
+
## Use Cases
|
|
319
|
+
|
|
320
|
+
- **AI Agents**: Feed web content to Claude, GPT, or local LLMs
|
|
321
|
+
- **Research**: Bulk extract articles, docs, or social media
|
|
322
|
+
- **Monitoring**: Track content changes on websites
|
|
323
|
+
- **Archiving**: Save web pages as clean markdown
|
|
324
|
+
- **Data Pipelines**: Extract structured data from web sources
|
|
325
|
+
|
|
326
|
+
---
|
|
327
|
+
|
|
328
|
+
## Development
|
|
329
|
+
|
|
330
|
+
```bash
|
|
331
|
+
# Clone the repo
|
|
332
|
+
git clone https://github.com/JakeLiuMe/webpeel.git
|
|
333
|
+
cd webpeel
|
|
334
|
+
|
|
335
|
+
# Install dependencies
|
|
336
|
+
npm install
|
|
337
|
+
|
|
338
|
+
# Build
|
|
339
|
+
npm run build
|
|
340
|
+
|
|
341
|
+
# Run tests
|
|
342
|
+
npm test
|
|
343
|
+
|
|
344
|
+
# Watch mode (auto-rebuild)
|
|
345
|
+
npm run dev
|
|
346
|
+
|
|
347
|
+
# Test the CLI locally
|
|
348
|
+
node dist/cli.js https://example.com
|
|
349
|
+
|
|
350
|
+
# Test the MCP server
|
|
351
|
+
npm run mcp
|
|
352
|
+
```
|
|
353
|
+
|
|
354
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
|
355
|
+
|
|
356
|
+
---
|
|
357
|
+
|
|
358
|
+
## Roadmap
|
|
359
|
+
|
|
360
|
+
- [x] CLI with smart escalation
|
|
361
|
+
- [x] TypeScript library
|
|
362
|
+
- [x] MCP server for Claude/Cursor/VS Code
|
|
363
|
+
- [ ] Hosted API with authentication
|
|
364
|
+
- [ ] Rate limiting and caching
|
|
365
|
+
- [ ] Batch processing API
|
|
366
|
+
- [ ] Screenshot capture
|
|
367
|
+
- [ ] PDF extraction
|
|
368
|
+
- [ ] Webhook notifications for monitoring
|
|
369
|
+
|
|
370
|
+
Vote on features and roadmap at [GitHub Discussions](https://github.com/JakeLiuMe/webpeel/discussions).
|
|
371
|
+
|
|
372
|
+
---
|
|
373
|
+
|
|
374
|
+
## FAQ
|
|
375
|
+
|
|
376
|
+
**Q: How is this different from Firecrawl?**
|
|
377
|
+
A: WebPeel runs locally for free (Firecrawl is cloud-only). We also have smart escalation to avoid burning resources on simple pages.
|
|
378
|
+
|
|
379
|
+
**Q: Can I self-host the API server?**
|
|
380
|
+
A: Yes! Run `npm run serve` to start the API server. See [docs/self-hosting.md](docs/self-hosting.md) (coming soon).
|
|
381
|
+
|
|
382
|
+
**Q: Does this violate websites' Terms of Service?**
|
|
383
|
+
A: WebPeel respects `robots.txt` by default. Always check a site's ToS before scraping at scale.
|
|
384
|
+
|
|
385
|
+
**Q: What about CAPTCHA and Cloudflare?**
|
|
386
|
+
A: WebPeel handles most Cloudflare challenges automatically. For CAPTCHAs, you'll need a solving service (not included).
|
|
387
|
+
|
|
388
|
+
**Q: Can I use this in production?**
|
|
389
|
+
A: Yes, but be mindful of rate limits. The hosted API (coming soon) is better for high-volume production use.
|
|
390
|
+
|
|
391
|
+
---
|
|
392
|
+
|
|
393
|
+
## Credits
|
|
394
|
+
|
|
395
|
+
Built with:
|
|
396
|
+
- [Playwright](https://playwright.dev/) — Headless browser automation
|
|
397
|
+
- [Cheerio](https://cheerio.js.org/) — Fast HTML parsing
|
|
398
|
+
- [Turndown](https://github.com/mixmark-io/turndown) — HTML to Markdown conversion
|
|
399
|
+
- [Commander](https://github.com/tj/commander.js) — CLI framework
|
|
400
|
+
|
|
401
|
+
---
|
|
402
|
+
|
|
403
|
+
## Contributing
|
|
404
|
+
|
|
405
|
+
Contributions are welcome! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
|
406
|
+
|
|
407
|
+
---
|
|
408
|
+
|
|
409
|
+
## License
|
|
410
|
+
|
|
411
|
+
MIT © [Jake Liu](https://github.com/JakeLiuMe)
|
|
412
|
+
|
|
413
|
+
---
|
|
414
|
+
|
|
415
|
+
**Like WebPeel?** [⭐ Star us on GitHub](https://github.com/JakeLiuMe/webpeel) — it helps others discover the project!
|
package/dist/cli.d.ts
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* WebPeel CLI
|
|
4
|
+
*
|
|
5
|
+
* Usage:
|
|
6
|
+
* npx webpeel <url> - Fetch and convert to markdown
|
|
7
|
+
* npx webpeel <url> --json - Output as JSON
|
|
8
|
+
* npx webpeel <url> --html - Output raw HTML
|
|
9
|
+
* npx webpeel <url> --render - Force browser mode
|
|
10
|
+
* npx webpeel <url> --wait 5000 - Wait 5s for JS to load
|
|
11
|
+
* npx webpeel search "query" - DuckDuckGo search (not yet implemented)
|
|
12
|
+
* npx webpeel serve - Start API server
|
|
13
|
+
* npx webpeel mcp - Start MCP server
|
|
14
|
+
*/
|
|
15
|
+
export {};
|
|
16
|
+
//# sourceMappingURL=cli.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AAEA;;;;;;;;;;;;GAYG"}
|
package/dist/cli.js
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* WebPeel CLI
|
|
4
|
+
*
|
|
5
|
+
* Usage:
|
|
6
|
+
* npx webpeel <url> - Fetch and convert to markdown
|
|
7
|
+
* npx webpeel <url> --json - Output as JSON
|
|
8
|
+
* npx webpeel <url> --html - Output raw HTML
|
|
9
|
+
* npx webpeel <url> --render - Force browser mode
|
|
10
|
+
* npx webpeel <url> --wait 5000 - Wait 5s for JS to load
|
|
11
|
+
* npx webpeel search "query" - DuckDuckGo search (not yet implemented)
|
|
12
|
+
* npx webpeel serve - Start API server
|
|
13
|
+
* npx webpeel mcp - Start MCP server
|
|
14
|
+
*/
|
|
15
|
+
import { Command } from 'commander';
|
|
16
|
+
import ora from 'ora';
|
|
17
|
+
import { peel, cleanup } from './index.js';
|
|
18
|
+
/**
 * Root CLI command. The name, description and version configured here are
 * what Commander prints for `--help` and `--version`.
 */
const program = new Command()
    .name('webpeel')
    .description('Fast web fetcher for AI agents')
    .version('0.1.0');
|
|
23
|
+
/**
 * Parse a numeric CLI option as a base-10 integer.
 *
 * Commander invokes custom option parsers as `parser(value, previousValue)`.
 * Handing it the global `parseInt` therefore passes the previous/default
 * value as the radix (for `--timeout` that is `parseInt('5000', 30000)`),
 * which is out of the valid radix range and yields NaN. This wrapper ignores
 * the second argument and pins the radix to 10.
 *
 * @param {string} value - Raw option value from the command line.
 * @returns {number} The parsed integer, or NaN when `value` is not numeric.
 */
function parseIntegerOption(value) {
    return Number.parseInt(value, 10);
}

/**
 * Default command: validate the URL and numeric options, fetch the page with
 * `peel()`, print the result (content, or the whole result as JSON with
 * `--json`), release browser resources via `cleanup()` and exit.
 *
 * Exit status is 0 on success and 1 on any validation or fetch failure.
 */
program
    .argument('[url]', 'URL to fetch')
    .option('-r, --render', 'Use headless browser (for JS-heavy sites)')
    .option('-w, --wait <ms>', 'Wait time after page load (ms)', parseIntegerOption)
    .option('--html', 'Output raw HTML instead of markdown')
    .option('--text', 'Output plain text instead of markdown')
    .option('--json', 'Output as JSON')
    .option('-t, --timeout <ms>', 'Request timeout (ms)', parseIntegerOption, 30000)
    .option('--ua <agent>', 'Custom user agent')
    .option('-s, --silent', 'Silent mode (no spinner)')
    .action(async (url, options) => {
        if (!url) {
            console.error('Error: URL is required\n');
            // outputHelp() prints the help text without exiting. help() would
            // terminate the process with status 0 before exit(1) could run.
            program.outputHelp();
            process.exit(1);
        }
        // SECURITY: Enhanced URL validation
        if (url.length > 2048) {
            console.error('Error: URL too long (max 2048 characters)');
            process.exit(1);
        }
        // Check for control characters
        if (/[\x00-\x1F\x7F]/.test(url)) {
            console.error('Error: URL contains invalid control characters');
            process.exit(1);
        }
        // Validate URL format; only http(s) targets are accepted.
        try {
            const parsed = new URL(url);
            if (!['http:', 'https:'].includes(parsed.protocol)) {
                console.error('Error: Only HTTP and HTTPS protocols are allowed');
                process.exit(1);
            }
        }
        catch {
            console.error(`Error: Invalid URL format: ${url}`);
            process.exit(1);
        }
        // Validate numeric options before the spinner starts so error output
        // is not interleaved with spinner frames. The negated range checks
        // also reject NaN (non-numeric input), which a truthiness test skips.
        if (options.wait !== undefined && !(options.wait >= 0 && options.wait <= 60000)) {
            console.error('Error: Wait time must be between 0 and 60000ms');
            process.exit(1);
        }
        if (!(options.timeout > 0)) {
            console.error('Error: Timeout must be a positive number of milliseconds');
            process.exit(1);
        }
        const spinner = options.silent ? null : ora('Fetching...').start();
        try {
            // Build peel options; --html takes precedence over --text, and
            // markdown is the default format.
            const peelOptions = {
                render: options.render || false,
                wait: options.wait || 0,
                timeout: options.timeout,
                userAgent: options.ua,
                format: options.html ? 'html' : options.text ? 'text' : 'markdown',
            };
            // Fetch the page
            const result = await peel(url, peelOptions);
            if (spinner) {
                spinner.succeed(`Fetched in ${result.elapsed}ms using ${result.method} method`);
            }
            // Output results
            if (options.json) {
                console.log(JSON.stringify(result, null, 2));
            }
            else {
                console.log(result.content);
            }
            // Clean up and exit
            await cleanup();
            process.exit(0);
        }
        catch (error) {
            if (spinner) {
                spinner.fail('Failed to fetch');
            }
            if (error instanceof Error) {
                console.error(`\nError: ${error.message}`);
            }
            else {
                console.error('\nError: Unknown error occurred');
            }
            await cleanup();
            process.exit(1);
        }
    });
|
|
115
|
+
// Subcommands: `search` is still a placeholder; `serve` and `mcp` are implemented.
|
|
116
|
+
/**
 * `search` subcommand. Placeholder only: it prints a notice that the command
 * is not implemented yet and exits with status 1.
 */
const searchCommand = program.command('search');
searchCommand.argument('<query>', 'Search query');
searchCommand.description('Search using DuckDuckGo (future)');
searchCommand.action(function reportSearchUnavailable() {
    console.log('Search command not yet implemented');
    console.log('Coming soon: DuckDuckGo search integration');
    process.exit(1);
});
|
|
125
|
+
/**
 * `serve` subcommand: start the API server on the port given by
 * `-p, --port` (default '3000').
 *
 * Exits with status 1 when the port is not an integer in the range 0-65535.
 */
program
    .command('serve')
    .description('Start API server')
    .option('-p, --port <port>', 'Port number', '3000')
    .action(async (options) => {
        const port = parseInt(options.port, 10);
        // parseInt returns NaN for non-numeric input; reject that and any
        // out-of-range value here instead of handing it to the server.
        if (!Number.isInteger(port) || port < 0 || port > 65535) {
            console.error(`Error: Invalid port: ${options.port} (expected an integer between 0 and 65535)`);
            process.exit(1);
        }
        // Imported on demand, so the server module is only loaded when this
        // command actually runs.
        const { startServer } = await import('./server/app.js');
        startServer({ port });
    });
|
|
133
|
+
/**
 * `mcp` subcommand: loads the MCP server module on demand. The action does
 * nothing beyond importing `./mcp/server.js`.
 */
const mcpCommand = program.command('mcp');
mcpCommand.description('Start MCP server for Claude Desktop / Cursor');
mcpCommand.action(async function launchMcpServer() {
    await import('./mcp/server.js');
});
|
|
139
|
+
// The action handlers registered above are async, so parse with parseAsync():
// it returns a promise that settles after the selected handler finishes,
// which lets a rejection (for example a failed dynamic import in `serve` or
// `mcp`) be reported here instead of surfacing as an unhandled rejection.
program.parseAsync().catch((error) => {
    if (error instanceof Error) {
        console.error(`\nError: ${error.message}`);
    }
    else {
        console.error('\nError: Unknown error occurred');
    }
    process.exit(1);
});
|
|
140
|
+
//# sourceMappingURL=cli.js.map
|
package/dist/cli.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AAEA;;;;;;;;;;;;GAYG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,GAAG,MAAM,KAAK,CAAC;AACtB,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AAG3C,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,SAAS,CAAC;KACf,WAAW,CAAC,gCAAgC,CAAC;KAC7C,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,OAAO;KACJ,QAAQ,CAAC,OAAO,EAAE,cAAc,CAAC;KACjC,MAAM,CAAC,cAAc,EAAE,2CAA2C,CAAC;KACnE,MAAM,CAAC,iBAAiB,EAAE,gCAAgC,EAAE,QAAQ,CAAC;KACrE,MAAM,CAAC,QAAQ,EAAE,qCAAqC,CAAC;KACvD,MAAM,CAAC,QAAQ,EAAE,uCAAuC,CAAC;KACzD,MAAM,CAAC,QAAQ,EAAE,gBAAgB,CAAC;KAClC,MAAM,CAAC,oBAAoB,EAAE,sBAAsB,EAAE,QAAQ,EAAE,KAAK,CAAC;KACrE,MAAM,CAAC,cAAc,EAAE,mBAAmB,CAAC;KAC3C,MAAM,CAAC,cAAc,EAAE,0BAA0B,CAAC;KAClD,MAAM,CAAC,KAAK,EAAE,GAAuB,EAAE,OAAO,EAAE,EAAE;IACjD,IAAI,CAAC,GAAG,EAAE,CAAC;QACT,OAAO,CAAC,KAAK,CAAC,0BAA0B,CAAC,CAAC;QAC1C,OAAO,CAAC,IAAI,EAAE,CAAC;QACf,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,oCAAoC;IACpC,IAAI,GAAG,CAAC,MAAM,GAAG,IAAI,EAAE,CAAC;QACtB,OAAO,CAAC,KAAK,CAAC,2CAA2C,CAAC,CAAC;QAC3D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,+BAA+B;IAC/B,IAAI,iBAAiB,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;QAChC,OAAO,CAAC,KAAK,CAAC,gDAAgD,CAAC,CAAC;QAChE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,sBAAsB;IACtB,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,IAAI,CAAC,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,QAAQ,CAAC,EAAE,CAAC;YACnD,OAAO,CAAC,KAAK,CAAC,kDAAkD,CAAC,CAAC;YAClE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,CAAC,KAAK,CAAC,8BAA8B,GAAG,EAAE,CAAC,CAAC;QACnD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,MAAM,OAAO,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC,KAAK,EAAE,CAAC;IAEnE,IAAI,CAAC;QACH,mBAAmB;QACnB,IAAI,OAAO,CAAC,IAAI,IAAI,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC,IAAI,OAAO,CAAC,IAAI,GAAG,KAAK,CAAC,EAAE,CAAC;YAC/D,OAAO,CAAC,KAAK,CAAC,gDAAgD,CAAC,CAAC;YAChE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB
,CAAC;QAED,qBAAqB;QACrB,MAAM,WAAW,GAAgB;YAC/B,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,KAAK;YAC/B,IAAI,EAAE,OAAO,CAAC,IAAI,IAAI,CAAC;YACvB,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,SAAS,EAAE,OAAO,CAAC,EAAE;SACtB,CAAC;QAEF,mBAAmB;QACnB,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;YACjB,WAAW,CAAC,MAAM,GAAG,MAAM,CAAC;QAC9B,CAAC;aAAM,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;YACxB,WAAW,CAAC,MAAM,GAAG,MAAM,CAAC;QAC9B,CAAC;aAAM,CAAC;YACN,WAAW,CAAC,MAAM,GAAG,UAAU,CAAC;QAClC,CAAC;QAED,iBAAiB;QACjB,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,GAAG,EAAE,WAAW,CAAC,CAAC;QAE5C,IAAI,OAAO,EAAE,CAAC;YACZ,OAAO,CAAC,OAAO,CAAC,cAAc,MAAM,CAAC,OAAO,YAAY,MAAM,CAAC,MAAM,SAAS,CAAC,CAAC;QAClF,CAAC;QAED,iBAAiB;QACjB,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;YACjB,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;QAC/C,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC9B,CAAC;QAED,oBAAoB;QACpB,MAAM,OAAO,EAAE,CAAC;QAChB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,OAAO,EAAE,CAAC;YACZ,OAAO,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;QAClC,CAAC;QAED,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;YAC3B,OAAO,CAAC,KAAK,CAAC,YAAY,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;QAC7C,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAC;QACnD,CAAC;QAED,MAAM,OAAO,EAAE,CAAC;QAChB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,kBAAkB;AAClB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,QAAQ,CAAC,SAAS,EAAE,cAAc,CAAC;KACnC,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,GAAG,EAAE;IACX,OAAO,CAAC,GAAG,CAAC,oCAAoC,CAAC,CAAC;IAClD,OAAO,CAAC,GAAG,CAAC,4CAA4C,CAAC,CAAC;IAC1D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,OAAO,CAAC;KAChB,WAAW,CAAC,kBAAkB,CAAC;KAC/B,MAAM,CAAC,mBAAmB,EAAE,aAAa,EAAE,MAAM,CAAC;KAClD,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,iBAAiB,CAAC,CAAC;IACxD,WAAW,CAAC,EAAE,IAAI,EAAE,QAAQ,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC;AACpD,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,KAAK,CAAC;KACd,WAAW,CAAC,8CAA8C,CAAC;KAC3D,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,
MAAM,MAAM,CAAC,iBAAiB,CAAC,CAAC;AAClC,CAAC,CAAC,CAAC;AAEL,OAAO,CAAC,KAAK,EAAE,CAAC"}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core fetching logic: simple HTTP and browser-based fetching
|
|
3
|
+
*/
|
|
4
|
+
export interface FetchResult {
|
|
5
|
+
html: string;
|
|
6
|
+
url: string;
|
|
7
|
+
statusCode?: number;
|
|
8
|
+
}
|
|
9
|
+
/**
|
|
10
|
+
* Simple HTTP fetch using native fetch + Cheerio
|
|
11
|
+
* Fast and lightweight, but can be blocked by Cloudflare/bot detection
|
|
12
|
+
* SECURITY: Manual redirect handling with SSRF re-validation
|
|
13
|
+
*/
|
|
14
|
+
export declare function simpleFetch(url: string, userAgent?: string, timeoutMs?: number): Promise<FetchResult>;
|
|
15
|
+
/**
|
|
16
|
+
* Fetch using headless Chromium via Playwright
|
|
17
|
+
* Slower but can handle JavaScript-heavy sites and bypass some bot detection
|
|
18
|
+
*/
|
|
19
|
+
export declare function browserFetch(url: string, options?: {
|
|
20
|
+
userAgent?: string;
|
|
21
|
+
waitMs?: number;
|
|
22
|
+
timeoutMs?: number;
|
|
23
|
+
}): Promise<FetchResult>;
|
|
24
|
+
/**
|
|
25
|
+
* Retry a fetch operation with exponential backoff
|
|
26
|
+
*/
|
|
27
|
+
export declare function retryFetch<T>(fn: () => Promise<T>, maxAttempts?: number, baseDelayMs?: number): Promise<T>;
|
|
28
|
+
/**
|
|
29
|
+
* Clean up browser resources
|
|
30
|
+
*/
|
|
31
|
+
export declare function cleanup(): Promise<void>;
|
|
32
|
+
//# sourceMappingURL=fetcher.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fetcher.d.ts","sourceRoot":"","sources":["../../src/core/fetcher.ts"],"names":[],"mappings":"AAAA;;GAEG;AA2PH,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,MAAM,CAAC;IACb,GAAG,EAAE,MAAM,CAAC;IACZ,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;;;GAIG;AACH,wBAAsB,WAAW,CAC/B,GAAG,EAAE,MAAM,EACX,SAAS,CAAC,EAAE,MAAM,EAClB,SAAS,GAAE,MAAc,GACxB,OAAO,CAAC,WAAW,CAAC,CAyItB;AAuBD;;;GAGG;AACH,wBAAsB,YAAY,CAChC,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IACP,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,SAAS,CAAC,EAAE,MAAM,CAAC;CACf,GACL,OAAO,CAAC,WAAW,CAAC,CAoGtB;AAED;;GAEG;AACH,wBAAsB,UAAU,CAAC,CAAC,EAChC,EAAE,EAAE,MAAM,OAAO,CAAC,CAAC,CAAC,EACpB,WAAW,GAAE,MAAU,EACvB,WAAW,GAAE,MAAa,GACzB,OAAO,CAAC,CAAC,CAAC,CAsBZ;AAED;;GAEG;AACH,wBAAsB,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC,CAK7C"}
|