webpeel 0.21.86 → 0.21.88
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +89 -279
- package/dist/cli/commands/fetch.js +28 -1
- package/dist/cli/commands/monitor.d.ts +12 -0
- package/dist/cli/commands/monitor.js +197 -0
- package/dist/cli/commands/search.js +15 -2
- package/dist/cli/utils.js +10 -1
- package/dist/cli.js +2 -0
- package/dist/core/browser-fetch.d.ts +2 -0
- package/dist/core/browser-fetch.js +24 -7
- package/dist/core/content-pruner.js +3 -0
- package/dist/core/crawler.d.ts +2 -0
- package/dist/core/crawler.js +3 -1
- package/dist/core/http-fetch.js +19 -2
- package/dist/core/markdown.js +38 -5
- package/dist/core/metadata.d.ts +7 -0
- package/dist/core/metadata.js +27 -1
- package/dist/core/pipeline.js +54 -25
- package/dist/core/readability.js +2 -1
- package/dist/core/schema-templates.js +37 -24
- package/dist/core/search-provider.d.ts +2 -0
- package/dist/core/search-provider.js +9 -2
- package/dist/core/searxng-provider.d.ts +1 -0
- package/dist/core/searxng-provider.js +1 -0
- package/dist/core/strategies.d.ts +4 -1
- package/dist/core/strategies.js +17 -3
- package/dist/core/watch-manager.d.ts +5 -1
- package/dist/core/watch-manager.js +39 -12
- package/dist/core/watch.d.ts +2 -0
- package/dist/core/watch.js +31 -9
- package/dist/ee/domain-extractors.d.ts +4 -44
- package/dist/ee/domain-extractors.js +4 -6338
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +255 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +115 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +162 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +308 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +176 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/utils.d.ts +12 -0
- package/dist/ee/extractors/utils.js +84 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +235 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/mcp/handlers/definitions.js +37 -16
- package/dist/server/app.js +33 -0
- package/dist/server/bull-queues.d.ts +1 -0
- package/dist/server/email-service.d.ts +30 -0
- package/dist/server/email-service.js +86 -0
- package/dist/server/middleware/audit-log.d.ts +12 -0
- package/dist/server/middleware/audit-log.js +40 -0
- package/dist/server/pg-auth-store.d.ts +7 -0
- package/dist/server/pg-auth-store.js +39 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.js +1 -0
- package/dist/server/routes/fetch.js +153 -17
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/jobs.js +1 -0
- package/dist/server/routes/mcp.js +62 -6
- package/dist/server/routes/screenshot.js +3 -0
- package/dist/server/routes/search.js +5 -0
- package/dist/server/routes/smart-search.d.ts +5 -3
- package/dist/server/routes/smart-search.js +1848 -141
- package/dist/server/routes/users.js +120 -0
- package/dist/types.d.ts +6 -0
- package/package.json +29 -4
package/README.md
CHANGED
|
@@ -5,248 +5,61 @@
|
|
|
5
5
|
</p>
|
|
6
6
|
|
|
7
7
|
<p align="center">
|
|
8
|
-
<a href="https://github.com/webpeel/webpeel/actions/workflows/ci.yml"><img src="https://github.com/webpeel/webpeel/actions/workflows/ci.yml/badge.svg" alt="CI"></a>
|
|
9
8
|
<a href="https://www.npmjs.com/package/webpeel"><img src="https://img.shields.io/npm/v/webpeel.svg?style=flat-square" alt="npm version"></a>
|
|
10
|
-
<a href="https://
|
|
9
|
+
<a href="https://www.npmjs.com/package/webpeel"><img src="https://img.shields.io/npm/dm/webpeel.svg?style=flat-square" alt="npm downloads"></a>
|
|
10
|
+
<a href="https://github.com/webpeel/webpeel/stargazers"><img src="https://img.shields.io/github/stars/webpeel/webpeel?style=flat-square" alt="GitHub stars"></a>
|
|
11
11
|
<a href="LICENSE"><img src="https://img.shields.io/badge/license-WebPeel%20SDK-blue.svg?style=flat-square" alt="License"></a>
|
|
12
|
-
<a href="https://webpeel.
|
|
13
|
-
</p>
|
|
14
|
-
|
|
15
|
-
<p align="center">
|
|
16
|
-
<strong>The web data API for AI agents.</strong><br>
|
|
17
|
-
Fetch, search, extract, and understand any webpage — with one API call.
|
|
12
|
+
<a href="https://github.com/webpeel/webpeel/actions/workflows/ci.yml"><img src="https://github.com/webpeel/webpeel/actions/workflows/ci.yml/badge.svg" alt="CI"></a>
|
|
18
13
|
</p>
|
|
19
14
|
|
|
20
15
|
<p align="center">
|
|
21
|
-
<
|
|
22
|
-
<a href="https://app.webpeel.dev">Dashboard</a> ·
|
|
23
|
-
<a href="https://webpeel.dev/docs/api">API Reference</a> ·
|
|
24
|
-
<a href="https://discord.gg/webpeel">Discord</a> ·
|
|
25
|
-
<a href="https://webpeel.dev/status">Status</a>
|
|
16
|
+
<strong>The web data platform for AI agents — fetch, search, crawl, extract, monitor, screenshot, and research any URL.</strong>
|
|
26
17
|
</p>
|
|
27
18
|
|
|
28
19
|
---
|
|
29
20
|
|
|
30
|
-
##
|
|
31
|
-
|
|
32
|
-
### Install
|
|
33
|
-
|
|
34
|
-
```bash
|
|
35
|
-
# Node.js / TypeScript
|
|
36
|
-
npm install webpeel
|
|
37
|
-
|
|
38
|
-
# Python
|
|
39
|
-
pip install webpeel
|
|
40
|
-
|
|
41
|
-
# No install — use directly
|
|
42
|
-
npx webpeel "https://example.com"
|
|
43
|
-
```
|
|
44
|
-
|
|
45
|
-
### Usage
|
|
46
|
-
|
|
47
|
-
**TypeScript**
|
|
48
|
-
```typescript
|
|
49
|
-
import { WebPeel } from 'webpeel';
|
|
50
|
-
|
|
51
|
-
const wp = new WebPeel({ apiKey: process.env.WEBPEEL_API_KEY });
|
|
52
|
-
const result = await wp.fetch('https://news.ycombinator.com');
|
|
53
|
-
console.log(result.markdown); // Clean, structured content
|
|
54
|
-
```
|
|
55
|
-
|
|
56
|
-
**Python**
|
|
57
|
-
```python
|
|
58
|
-
from webpeel import WebPeel
|
|
21
|
+
## Quick Start
|
|
59
22
|
|
|
60
|
-
wp = WebPeel(api_key=os.environ["WEBPEEL_API_KEY"])
|
|
61
|
-
result = wp.fetch("https://news.ycombinator.com")
|
|
62
|
-
print(result.markdown) # Clean, structured content
|
|
63
|
-
```
|
|
64
|
-
|
|
65
|
-
**curl**
|
|
66
23
|
```bash
|
|
67
|
-
|
|
68
|
-
|
|
24
|
+
npx webpeel "https://example.com" # Clean markdown
|
|
25
|
+
npx webpeel search "AI trends 2025" # Web search
|
|
26
|
+
npx webpeel crawl docs.example.com # Crawl entire site
|
|
69
27
|
```
|
|
70
28
|
|
|
71
29
|
[Get your free API key →](https://app.webpeel.dev/signup) · No credit card required · 500 requests/week free
|
|
72
30
|
|
|
73
31
|
---
|
|
74
32
|
|
|
75
|
-
##
|
|
76
|
-
|
|
77
|
-
| | Capability | Result |
|
|
78
|
-
|---|---|---|
|
|
79
|
-
| 🌐 | **Fetch** | Any URL → clean markdown or JSON. Handles JavaScript, bot detection, and dynamic content automatically |
|
|
80
|
-
| 🔍 | **Search** | Web search with structured results — titles, URLs, snippets, and optional full-page content |
|
|
81
|
-
| 📊 | **Extract** | Pull structured data using JSON Schema. Products, pricing, contacts, tables — any pattern |
|
|
82
|
-
| 🕷️ | **Crawl** | Map and scrape entire websites with one API call. Follows links, respects robots.txt |
|
|
83
|
-
| 🤖 | **MCP** | 7 tools natively available in Claude, Cursor, VS Code, Windsurf, and any MCP-compatible agent |
|
|
84
|
-
| 📸 | **Screenshot** | Full-page or viewport screenshots in PNG/JPEG |
|
|
85
|
-
| 🎬 | **YouTube** | Video transcripts with timestamps — no YouTube API key required |
|
|
86
|
-
| 👁️ | **Monitor** | Watch pages for changes and receive webhook notifications |
|
|
87
|
-
|
|
88
|
-
---
|
|
89
|
-
|
|
90
|
-
## Anti-Bot Bypass Stack
|
|
91
|
-
|
|
92
|
-
WebPeel uses a 4-layer escalation chain to bypass bot protection — all built in-house, no paid proxy services required:
|
|
93
|
-
|
|
94
|
-
```
|
|
95
|
-
1. PeelTLS — Chrome TLS fingerprint spoofing (in-process Go binary) ~85% of sites
|
|
96
|
-
2. CF Worker — Cloudflare edge network proxy (different IP reputation) +5%
|
|
97
|
-
3. Google Cache — Cached page copy if available +2%
|
|
98
|
-
4. Search — Extract from search engine snippets (last resort) last resort
|
|
99
|
-
```
|
|
100
|
-
|
|
101
|
-
**For e-commerce sites**, WebPeel uses official APIs before attempting HTML scraping:
|
|
102
|
-
- **Best Buy** — Free Products API (50K queries/day). Set `BESTBUY_API_KEY` env var.
|
|
103
|
-
- **Walmart** — Frontend API (may be blocked; falls through gracefully)
|
|
104
|
-
- **Reddit, GitHub, HN, Wikipedia, YouTube, ArXiv** — Official APIs, always fast
|
|
105
|
-
|
|
106
|
-
**Self-hosted CF Worker** (100K requests/day free):
|
|
107
|
-
```bash
|
|
108
|
-
cd worker && npx wrangler deploy
|
|
109
|
-
# Then set WEBPEEL_CF_WORKER_URL and WEBPEEL_CF_WORKER_TOKEN env vars
|
|
110
|
-
```
|
|
111
|
-
|
|
112
|
-
---
|
|
113
|
-
|
|
114
|
-
## Benchmarks
|
|
115
|
-
|
|
116
|
-
Independent testing across 500 URLs including e-commerce, news, SaaS, and social platforms.
|
|
117
|
-
|
|
118
|
-
| Metric | **WebPeel** | Firecrawl | Crawl4AI | Jina Reader |
|
|
119
|
-
|--------|:-----------:|:---------:|:--------:|:-----------:|
|
|
120
|
-
| Success rate (protected sites) | **97.6%** | 71% | 58% | 49% |
|
|
121
|
-
| Median response time | **380ms** | 890ms | 1,240ms | 520ms |
|
|
122
|
-
| Content quality score¹ | **0.91** | 0.74 | 0.69 | 0.72 |
|
|
123
|
-
| Price per 1,000 requests | **$0.80** | $5.33 | self-host | $1.00 |
|
|
33
|
+
## Why WebPeel
|
|
124
34
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
35
|
+
- **65–98% token savings** — domain-specific extractors strip boilerplate, ads, and nav before content reaches your agent
|
|
36
|
+
- **29 domain extractors** — purpose-built parsers for Reddit, Wikipedia, GitHub, Hacker News, YouTube, ArXiv, Amazon, and 22 more
|
|
37
|
+
- **Zero-config Cloudflare bypass** — 4-layer escalation stack handles TLS fingerprinting, edge proxying, and cache fallback automatically
|
|
128
38
|
|
|
129
39
|
---
|
|
130
40
|
|
|
131
|
-
##
|
|
132
|
-
|
|
133
|
-
|
|
|
134
|
-
|
|
135
|
-
|
|
|
136
|
-
|
|
|
137
|
-
|
|
|
138
|
-
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
41
|
+
## Features
|
|
42
|
+
|
|
43
|
+
| Feature | Command / API |
|
|
44
|
+
|---------|---------------|
|
|
45
|
+
| Fetch any URL | `webpeel "url"` |
|
|
46
|
+
| Web search | `webpeel search "query"` |
|
|
47
|
+
| Crawl sites | `webpeel crawl "url" --max-pages 50` |
|
|
48
|
+
| Screenshots | `webpeel screenshot "url"` |
|
|
49
|
+
| Monitor changes | `webpeel monitor "url" --interval 300` |
|
|
50
|
+
| Browser actions | `--action 'click:.btn,wait:2000'` |
|
|
51
|
+
| YouTube transcripts | auto-detected |
|
|
52
|
+
| PDF extraction | auto-detected |
|
|
53
|
+
| MCP server | `webpeel mcp` |
|
|
54
|
+
| Schema extraction | `POST /v1/fetch` with `extract.schema` |
|
|
55
|
+
| Research agent | `POST /v1/agent` |
|
|
56
|
+
| Smart search | `POST /v1/search/smart` |
|
|
142
57
|
|
|
143
58
|
---
|
|
144
59
|
|
|
145
|
-
##
|
|
146
|
-
|
|
147
|
-
### TypeScript / Node.js
|
|
148
|
-
|
|
149
|
-
```typescript
|
|
150
|
-
import { WebPeel } from 'webpeel';
|
|
151
|
-
|
|
152
|
-
const wp = new WebPeel({ apiKey: process.env.WEBPEEL_API_KEY });
|
|
153
|
-
|
|
154
|
-
// Fetch a page
|
|
155
|
-
const page = await wp.fetch('https://stripe.com/pricing', {
|
|
156
|
-
format: 'markdown', // 'markdown' | 'html' | 'text' | 'json'
|
|
157
|
-
});
|
|
158
|
-
|
|
159
|
-
// Search the web
|
|
160
|
-
const results = await wp.search('best vector databases 2025', {
|
|
161
|
-
limit: 5,
|
|
162
|
-
fetchContent: true, // Optionally fetch full content for each result
|
|
163
|
-
});
|
|
164
|
-
|
|
165
|
-
// Extract structured data
|
|
166
|
-
const pricing = await wp.extract('https://stripe.com/pricing', {
|
|
167
|
-
schema: {
|
|
168
|
-
type: 'object',
|
|
169
|
-
properties: {
|
|
170
|
-
plans: {
|
|
171
|
-
type: 'array',
|
|
172
|
-
items: { type: 'object', properties: {
|
|
173
|
-
name: { type: 'string' },
|
|
174
|
-
price: { type: 'string' },
|
|
175
|
-
features: { type: 'array', items: { type: 'string' } }
|
|
176
|
-
}}
|
|
177
|
-
}
|
|
178
|
-
}
|
|
179
|
-
}
|
|
180
|
-
});
|
|
181
|
-
|
|
182
|
-
// Crawl a site
|
|
183
|
-
const crawl = await wp.crawl('https://docs.example.com', {
|
|
184
|
-
maxPages: 50,
|
|
185
|
-
maxDepth: 3,
|
|
186
|
-
outputFormat: 'markdown',
|
|
187
|
-
});
|
|
188
|
-
for await (const page of crawl) {
|
|
189
|
-
console.log(page.url, page.markdown);
|
|
190
|
-
}
|
|
60
|
+
## MCP Integration
|
|
191
61
|
|
|
192
|
-
|
|
193
|
-
const shot = await wp.screenshot('https://webpeel.dev', { fullPage: true });
|
|
194
|
-
fs.writeFileSync('screenshot.png', shot.image, 'base64');
|
|
195
|
-
```
|
|
196
|
-
|
|
197
|
-
[Full TypeScript reference →](https://webpeel.dev/docs/sdk/typescript)
|
|
198
|
-
|
|
199
|
-
### Python
|
|
200
|
-
|
|
201
|
-
```python
|
|
202
|
-
from webpeel import WebPeel
|
|
203
|
-
import os
|
|
204
|
-
|
|
205
|
-
wp = WebPeel(api_key=os.environ["WEBPEEL_API_KEY"])
|
|
206
|
-
|
|
207
|
-
# Fetch a page
|
|
208
|
-
page = wp.fetch("https://stripe.com/pricing", format="markdown")
|
|
209
|
-
print(page.markdown)
|
|
210
|
-
|
|
211
|
-
# Search
|
|
212
|
-
results = wp.search("best vector databases 2025", limit=5)
|
|
213
|
-
for r in results:
|
|
214
|
-
print(r.title, r.url)
|
|
215
|
-
|
|
216
|
-
# Extract structured data
|
|
217
|
-
pricing = wp.extract("https://stripe.com/pricing", schema={
|
|
218
|
-
"type": "object",
|
|
219
|
-
"properties": {
|
|
220
|
-
"plans": {
|
|
221
|
-
"type": "array",
|
|
222
|
-
"items": { "type": "object", "properties": {
|
|
223
|
-
"name": { "type": "string" },
|
|
224
|
-
"price": { "type": "string" }
|
|
225
|
-
}}
|
|
226
|
-
}
|
|
227
|
-
}
|
|
228
|
-
})
|
|
229
|
-
|
|
230
|
-
# Async client
|
|
231
|
-
from webpeel import AsyncWebPeel
|
|
232
|
-
import asyncio
|
|
233
|
-
|
|
234
|
-
async def main():
|
|
235
|
-
wp = AsyncWebPeel(api_key=os.environ["WEBPEEL_API_KEY"])
|
|
236
|
-
results = await asyncio.gather(
|
|
237
|
-
wp.fetch("https://site1.com"),
|
|
238
|
-
wp.fetch("https://site2.com"),
|
|
239
|
-
wp.fetch("https://site3.com"),
|
|
240
|
-
)
|
|
241
|
-
|
|
242
|
-
asyncio.run(main())
|
|
243
|
-
```
|
|
244
|
-
|
|
245
|
-
[Full Python reference →](https://webpeel.dev/docs/sdk/python)
|
|
246
|
-
|
|
247
|
-
### MCP — For AI Agents
|
|
248
|
-
|
|
249
|
-
Give Claude, Cursor, or any MCP-compatible agent the ability to browse the web.
|
|
62
|
+
Give Claude, Cursor, or any MCP-compatible agent the ability to browse the web in one config change.
|
|
250
63
|
|
|
251
64
|
**Claude Desktop** (`~/.claude/claude_desktop_config.json`):
|
|
252
65
|
```json
|
|
@@ -278,96 +91,93 @@ Give Claude, Cursor, or any MCP-compatible agent the ability to browse the web.
|
|
|
278
91
|
}
|
|
279
92
|
```
|
|
280
93
|
|
|
281
|
-
Available MCP tools:
|
|
282
|
-
- `webpeel` — general fetch and extract
|
|
283
|
-
- `webpeel_read` — fetch and read page content
|
|
284
|
-
- `webpeel_see` — screenshot and visual analysis
|
|
285
|
-
- `webpeel_find` — web search
|
|
286
|
-
- `webpeel_extract` — structured data extraction
|
|
287
|
-
- `webpeel_monitor` — watch URLs for changes
|
|
288
|
-
- `webpeel_act` — interact with dynamic pages
|
|
94
|
+
Available MCP tools: `webpeel_read`, `webpeel_find`, `webpeel_see`, `webpeel_extract`, `webpeel_monitor`, `webpeel_act`, `webpeel_crawl`
|
|
289
95
|
|
|
290
|
-
[
|
|
291
|
-
[](https://mcp.so/install/webpeel?for=vscode)
|
|
96
|
+
[Full MCP setup guide →](https://webpeel.dev/docs/mcp)
|
|
292
97
|
|
|
293
|
-
|
|
98
|
+
---
|
|
294
99
|
|
|
295
|
-
|
|
100
|
+
## API Example
|
|
296
101
|
|
|
297
102
|
```bash
|
|
298
|
-
#
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
webpeel "https://news.ycombinator.com"
|
|
103
|
+
# Fetch any page — returns clean markdown + metadata
|
|
104
|
+
curl "https://api.webpeel.dev/v1/fetch?url=https://stripe.com/pricing" \
|
|
105
|
+
-H "Authorization: Bearer $WEBPEEL_API_KEY"
|
|
106
|
+
```
|
|
303
107
|
|
|
304
|
-
|
|
305
|
-
|
|
108
|
+
```json
|
|
109
|
+
{
|
|
110
|
+
"url": "https://stripe.com/pricing",
|
|
111
|
+
"markdown": "# Stripe Pricing\n\n**Integrated per-transaction fees**...",
|
|
112
|
+
"metadata": {
|
|
113
|
+
"title": "Pricing & Fees | Stripe",
|
|
114
|
+
"tokens": 420,
|
|
115
|
+
"tokensOriginal": 8200,
|
|
116
|
+
"savingsPct": 94.9
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
```
|
|
306
120
|
|
|
307
|
-
|
|
308
|
-
webpeel "https://stripe.com/pricing" --extract-schema pricing-schema.json
|
|
121
|
+
[Full API reference →](https://webpeel.dev/docs/api)
|
|
309
122
|
|
|
310
|
-
|
|
311
|
-
webpeel crawl "https://docs.example.com" --max-pages 100
|
|
123
|
+
---
|
|
312
124
|
|
|
313
|
-
|
|
314
|
-
webpeel screenshot "https://webpeel.dev" --full-page --output screenshot.png
|
|
125
|
+
## Token Efficiency
|
|
315
126
|
|
|
316
|
-
|
|
317
|
-
webpeel "https://youtube.com/watch?v=dQw4w9WgXcQ" --json
|
|
127
|
+
WebPeel's 29 domain-specific extractors strip navigation, ads, sidebars, and boilerplate before sending content to your agent.
|
|
318
128
|
|
|
319
|
-
|
|
320
|
-
|
|
129
|
+
| Site type | Raw HTML tokens | WebPeel tokens | Savings |
|
|
130
|
+
|-----------|:--------------:|:--------------:|:-------:|
|
|
131
|
+
| News article | 18,000 | 640 | **96%** |
|
|
132
|
+
| Reddit thread | 24,000 | 890 | **96%** |
|
|
133
|
+
| Wikipedia page | 31,000 | 2,100 | **93%** |
|
|
134
|
+
| GitHub README | 5,200 | 1,800 | **65%** |
|
|
135
|
+
| E-commerce product | 14,000 | 310 | **98%** |
|
|
321
136
|
|
|
322
|
-
|
|
323
|
-
webpeel "https://example.com" --json
|
|
324
|
-
```
|
|
137
|
+
Less context used = lower costs + faster inference + longer agent chains.
|
|
325
138
|
|
|
326
139
|
---
|
|
327
140
|
|
|
328
|
-
##
|
|
141
|
+
## Security
|
|
329
142
|
|
|
330
|
-
|
|
143
|
+
WebPeel is built with security-first principles:
|
|
331
144
|
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
145
|
+
- **Helmet.js headers** — HSTS, X-Frame-Options, nosniff, XSS protection on all responses
|
|
146
|
+
- **Webhook signing** — HMAC-SHA256 signatures on all outbound webhooks
|
|
147
|
+
- **Audit logging** — every API call logged with IP, key, and action
|
|
148
|
+
- **GDPR compliant** — `DELETE /v1/account` for full data erasure
|
|
149
|
+
- **SSH hardened** — Fail2Ban, MaxAuthTries, key-only auth on all infrastructure
|
|
335
150
|
|
|
336
|
-
|
|
337
|
-
GET /search?q=<query>&limit=10
|
|
151
|
+
[Security policy →](https://webpeel.dev/security)
|
|
338
152
|
|
|
339
|
-
|
|
340
|
-
POST /extract
|
|
341
|
-
{ "url": "...", "schema": { ... } }
|
|
153
|
+
---
|
|
342
154
|
|
|
343
|
-
|
|
344
|
-
POST /crawl
|
|
345
|
-
{ "url": "...", "maxPages": 50, "maxDepth": 3 }
|
|
155
|
+
## Links
|
|
346
156
|
|
|
347
|
-
|
|
348
|
-
|
|
157
|
+
- 📖 [Documentation](https://webpeel.dev/docs) — Guides, references, and examples
|
|
158
|
+
- 💰 [Pricing](https://webpeel.dev/pricing) — Plans and limits
|
|
159
|
+
- 📝 [Blog](https://webpeel.dev/blog) — Tutorials, comparisons, and use cases
|
|
160
|
+
- 📊 [Status](https://webpeel.dev/status) — Uptime and incidents
|
|
161
|
+
- 🔒 [Security](https://webpeel.dev/security) — Security policy and disclosure
|
|
162
|
+
- 📋 [SLA](https://webpeel.dev/sla) — Uptime commitments
|
|
349
163
|
|
|
350
|
-
|
|
351
|
-
GET /youtube?url=<youtube_url>
|
|
352
|
-
```
|
|
164
|
+
---
|
|
353
165
|
|
|
354
|
-
|
|
166
|
+
## Contributing
|
|
355
167
|
|
|
356
|
-
|
|
168
|
+
Pull requests welcome! Please open an issue first to discuss major changes.
|
|
357
169
|
|
|
358
|
-
|
|
170
|
+
1. Fork the repo
|
|
171
|
+
2. Create your feature branch (`git checkout -b feat/my-feature`)
|
|
172
|
+
3. Commit your changes (`git commit -m 'feat: add my feature'`)
|
|
173
|
+
4. Push to the branch (`git push origin feat/my-feature`)
|
|
174
|
+
5. Open a Pull Request
|
|
359
175
|
|
|
360
|
-
|
|
176
|
+
---
|
|
361
177
|
|
|
362
|
-
|
|
363
|
-
- 🚀 [Dashboard](https://app.webpeel.dev) — Manage your API keys and usage
|
|
364
|
-
- 🔌 [API Reference](https://webpeel.dev/docs/api) — Full endpoint documentation
|
|
365
|
-
- 💬 [Discord](https://discord.gg/webpeel) — Community and support
|
|
366
|
-
- 📊 [Status](https://webpeel.dev/status) — Uptime and incidents
|
|
367
|
-
- 💰 [Pricing](https://webpeel.dev/pricing) — Plans and limits
|
|
368
|
-
- 📈 [Benchmarks](https://webpeel.dev/benchmarks) — How we compare
|
|
178
|
+
## License
|
|
369
179
|
|
|
370
|
-
|
|
180
|
+
[WebPeel SDK License](LICENSE) — free for personal and commercial use with attribution. See LICENSE for full terms.
|
|
371
181
|
|
|
372
182
|
<p align="center">
|
|
373
183
|
<a href="https://app.webpeel.dev/signup">Get started free →</a>
|
package/dist/cli/commands/fetch.js
CHANGED
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
import ora from 'ora';
|
|
5
5
|
import { writeFileSync, readFileSync, existsSync } from 'fs';
|
|
6
6
|
import { getProfilePath, loadStorageState, touchProfile } from '../../core/profiles.js';
|
|
7
|
+
import { shouldForceBrowser } from '../../core/strategies.js';
|
|
7
8
|
import { peel, cleanup } from '../../index.js';
|
|
8
9
|
import { checkUsage, showUsageFooter, loadConfig } from '../../cli-auth.js';
|
|
9
10
|
import { getCache, setCache, parseTTL } from '../../cache.js';
|
|
@@ -289,6 +290,7 @@ export async function runFetch(url, options) {
|
|
|
289
290
|
format: options.html ? 'html' : options.text ? 'text' : options.clean ? 'clean' : 'markdown',
|
|
290
291
|
budget: null, // Budget excluded from cache key — cache stores full content
|
|
291
292
|
readable: options.readable || false,
|
|
293
|
+
noDomainApi: options.skipDomainApi || false, // Different cache for domain-api bypass
|
|
292
294
|
};
|
|
293
295
|
const cachedResult = getCache(url, cacheOptions);
|
|
294
296
|
if (cachedResult) {
|
|
@@ -603,6 +605,7 @@ export async function runFetch(url, options) {
|
|
|
603
605
|
headers,
|
|
604
606
|
cookies: options.cookie,
|
|
605
607
|
raw: options.raw || false,
|
|
608
|
+
noDomainApi: options.skipDomainApi || false,
|
|
606
609
|
lite: options.lite || false,
|
|
607
610
|
actions,
|
|
608
611
|
maxTokens: options.maxTokens,
|
|
@@ -626,6 +629,7 @@ export async function runFetch(url, options) {
|
|
|
626
629
|
device: options.device,
|
|
627
630
|
viewportWidth: options.viewport ? options.viewport.width : undefined,
|
|
628
631
|
viewportHeight: options.viewport ? options.viewport.height : undefined,
|
|
632
|
+
deviceScaleFactor: options.scale,
|
|
629
633
|
waitUntil: options.waitUntil,
|
|
630
634
|
waitSelector: options.waitSelector,
|
|
631
635
|
blockResources: options.blockResources ? options.blockResources.split(',').map((s) => s.trim()) : undefined,
|
|
@@ -676,8 +680,19 @@ export async function runFetch(url, options) {
|
|
|
676
680
|
const fetchCfg = loadConfig();
|
|
677
681
|
const fetchApiKey = fetchCfg.apiKey || process.env.WEBPEEL_API_KEY;
|
|
678
682
|
const fetchApiUrl = process.env.WEBPEEL_API_URL || 'https://api.webpeel.dev';
|
|
683
|
+
// Features that require a local browser and cannot be delegated to the remote API.
|
|
684
|
+
// Also include domains (like amazon.com) that require stealth/browser rendering —
|
|
685
|
+
// the remote API won't render them correctly without special flags, so route locally.
|
|
686
|
+
const domainNeedsLocalBrowser = !!(shouldForceBrowser(url));
|
|
687
|
+
const needsLocalBrowser = !!(peelOptions.screenshot ||
|
|
688
|
+
peelOptions.actions?.length ||
|
|
689
|
+
peelOptions.profileDir ||
|
|
690
|
+
peelOptions.headed ||
|
|
691
|
+
peelOptions.storageState ||
|
|
692
|
+
peelOptions.cloaked ||
|
|
693
|
+
domainNeedsLocalBrowser);
|
|
679
694
|
let result;
|
|
680
|
-
if (fetchApiKey) {
|
|
695
|
+
if (fetchApiKey && !needsLocalBrowser) {
|
|
681
696
|
// Use the WebPeel API — no local Playwright needed
|
|
682
697
|
result = await fetchViaApi(url, peelOptions, fetchApiKey, fetchApiUrl);
|
|
683
698
|
}
|
|
@@ -724,6 +739,16 @@ export async function runFetch(url, options) {
|
|
|
724
739
|
? ` [${result.domainData.domain}:${result.domainData.type}]`
|
|
725
740
|
: '';
|
|
726
741
|
spinner.succeed(`Fetched in ${result.elapsed}ms using ${result.method} method${domainTag}`);
|
|
742
|
+
// Smart hints — suggest features the user might not know about
|
|
743
|
+
if (!options.silent && !options.json && !options.skipDomainApi) {
|
|
744
|
+
if (result.method === 'domain-api') {
|
|
745
|
+
const extractorName = result.domainData?.domain || new URL(url).hostname.replace('www.', '') || 'domain';
|
|
746
|
+
console.error(`\x1b[33m💡 Tip: Using our ${extractorName} extractor. Want the raw page instead? Add --skip-domain-api\x1b[0m`);
|
|
747
|
+
}
|
|
748
|
+
}
|
|
749
|
+
if (!options.silent && !options.json && result.tokens && result.tokens < 50 && !options.render) {
|
|
750
|
+
console.error(`\x1b[33m💡 Tip: Page returned very little content. Try --render for JavaScript-heavy sites or --stealth if blocked.\x1b[0m`);
|
|
751
|
+
}
|
|
727
752
|
}
|
|
728
753
|
// Show metadata header
|
|
729
754
|
const pageTitle = result.metadata?.title || result.title;
|
|
@@ -1176,6 +1201,7 @@ export function registerFetchCommands(program) {
|
|
|
1176
1201
|
.option('--images', 'Output image URLs from the page')
|
|
1177
1202
|
.option('--meta', 'Output only the page metadata (title, description, author, etc.)')
|
|
1178
1203
|
.option('--raw', 'Return full page without smart content extraction')
|
|
1204
|
+
.option('--skip-domain-api', 'Bypass domain-specific API extractors — force actual page scraping')
|
|
1179
1205
|
.option('--full', 'Alias for --raw — full page content, no budget')
|
|
1180
1206
|
.option('--lite', 'Lite mode — minimal processing, maximum speed (skip pruning, budget, metadata)')
|
|
1181
1207
|
.option('--action <actions...>', 'Page actions before scraping (e.g., "click:.btn" "wait:2000" "scroll:bottom")')
|
|
@@ -1208,6 +1234,7 @@ export function registerFetchCommands(program) {
|
|
|
1208
1234
|
const [w, h] = val.split('x').map(Number);
|
|
1209
1235
|
return { width: w, height: h };
|
|
1210
1236
|
})
|
|
1237
|
+
.option('--scale <factor>', 'Device scale factor (pixel density) for screenshots (default: auto from device profile)', parseFloat)
|
|
1211
1238
|
.option('--wait-until <event>', 'Page load event: domcontentloaded, networkidle, load, commit (auto-enables --render)')
|
|
1212
1239
|
.option('--wait-selector <css>', 'Wait for CSS selector before extracting (auto-enables --render)')
|
|
1213
1240
|
.option('--block-resources <types>', 'Block resource types, comma-separated: image,stylesheet,font,media,script (auto-enables --render)')
|
package/dist/cli/commands/monitor.d.ts
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Monitor command: content change detection for URLs
|
|
3
|
+
*
|
|
4
|
+
* Usage:
|
|
5
|
+
* webpeel monitor <url> - Fetch & snapshot (or diff if prev exists)
|
|
6
|
+
* webpeel monitor <url> --interval 300 - Watch mode: re-check every 5 minutes
|
|
7
|
+
* webpeel monitor <url> --json - JSON output for automation
|
|
8
|
+
* webpeel monitor <url> --render - Use browser rendering
|
|
9
|
+
* webpeel monitor <url> --selector <css> - Monitor specific section only
|
|
10
|
+
*/
|
|
11
|
+
import type { Command } from 'commander';
|
|
12
|
+
export declare function registerMonitorCommands(program: Command): void;
|