webpeel 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +11 -657
- package/README.md +246 -325
- package/dist/cli.js +330 -73
- package/dist/cli.js.map +1 -1
- package/dist/core/browser-fetch.d.ts +12 -0
- package/dist/core/browser-fetch.d.ts.map +1 -1
- package/dist/core/browser-fetch.js +70 -17
- package/dist/core/browser-fetch.js.map +1 -1
- package/dist/core/cf-worker-proxy.d.ts +33 -0
- package/dist/core/cf-worker-proxy.d.ts.map +1 -0
- package/dist/core/cf-worker-proxy.js +88 -0
- package/dist/core/cf-worker-proxy.js.map +1 -0
- package/dist/core/chunker.d.ts +47 -0
- package/dist/core/chunker.d.ts.map +1 -0
- package/dist/core/chunker.js +250 -0
- package/dist/core/chunker.js.map +1 -0
- package/dist/core/cloak-fetch.d.ts +43 -0
- package/dist/core/cloak-fetch.d.ts.map +1 -0
- package/dist/core/cloak-fetch.js +141 -0
- package/dist/core/cloak-fetch.js.map +1 -0
- package/dist/core/crawl-checkpoint.d.ts +55 -0
- package/dist/core/crawl-checkpoint.d.ts.map +1 -0
- package/dist/core/crawl-checkpoint.js +105 -0
- package/dist/core/crawl-checkpoint.js.map +1 -0
- package/dist/core/crawler.d.ts +5 -1
- package/dist/core/crawler.d.ts.map +1 -1
- package/dist/core/crawler.js +60 -5
- package/dist/core/crawler.js.map +1 -1
- package/dist/core/cycle-fetch.d.ts +27 -0
- package/dist/core/cycle-fetch.d.ts.map +1 -0
- package/dist/core/cycle-fetch.js +99 -0
- package/dist/core/cycle-fetch.js.map +1 -0
- package/dist/core/domain-extractors.d.ts.map +1 -1
- package/dist/core/domain-extractors.js +754 -14
- package/dist/core/domain-extractors.js.map +1 -1
- package/dist/core/google-cache.d.ts +30 -0
- package/dist/core/google-cache.d.ts.map +1 -0
- package/dist/core/google-cache.js +181 -0
- package/dist/core/google-cache.js.map +1 -0
- package/dist/core/markdown.d.ts +11 -0
- package/dist/core/markdown.d.ts.map +1 -1
- package/dist/core/markdown.js +43 -0
- package/dist/core/markdown.js.map +1 -1
- package/dist/core/peel-tls.d.ts +26 -0
- package/dist/core/peel-tls.d.ts.map +1 -0
- package/dist/core/peel-tls.js +221 -0
- package/dist/core/peel-tls.js.map +1 -0
- package/dist/core/pipeline.d.ts +5 -1
- package/dist/core/pipeline.d.ts.map +1 -1
- package/dist/core/pipeline.js +269 -21
- package/dist/core/pipeline.js.map +1 -1
- package/dist/core/schema-postprocess.d.ts +33 -0
- package/dist/core/schema-postprocess.d.ts.map +1 -0
- package/dist/core/schema-postprocess.js +470 -0
- package/dist/core/schema-postprocess.js.map +1 -0
- package/dist/core/schema-templates.d.ts +20 -0
- package/dist/core/schema-templates.d.ts.map +1 -0
- package/dist/core/schema-templates.js +131 -0
- package/dist/core/schema-templates.js.map +1 -0
- package/dist/core/search-fallback.d.ts +28 -0
- package/dist/core/search-fallback.d.ts.map +1 -0
- package/dist/core/search-fallback.js +185 -0
- package/dist/core/search-fallback.js.map +1 -0
- package/dist/core/search-provider.d.ts +47 -4
- package/dist/core/search-provider.d.ts.map +1 -1
- package/dist/core/search-provider.js +278 -7
- package/dist/core/search-provider.js.map +1 -1
- package/dist/core/stealth-patches.d.ts +58 -0
- package/dist/core/stealth-patches.d.ts.map +1 -0
- package/dist/core/stealth-patches.js +340 -0
- package/dist/core/stealth-patches.js.map +1 -0
- package/dist/core/strategies.d.ts +20 -0
- package/dist/core/strategies.d.ts.map +1 -1
- package/dist/core/strategies.js +284 -48
- package/dist/core/strategies.js.map +1 -1
- package/dist/core/strategy-hooks.d.ts +1 -1
- package/dist/core/strategy-hooks.d.ts.map +1 -1
- package/dist/index.d.ts +11 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +37 -15
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +109 -4
- package/dist/mcp/server.js.map +1 -1
- package/dist/server/app.d.ts.map +1 -1
- package/dist/server/app.js +29 -0
- package/dist/server/app.js.map +1 -1
- package/dist/server/middleware/rate-limit.d.ts +2 -1
- package/dist/server/middleware/rate-limit.d.ts.map +1 -1
- package/dist/server/middleware/rate-limit.js +24 -8
- package/dist/server/middleware/rate-limit.js.map +1 -1
- package/dist/server/routes/agent.d.ts +4 -0
- package/dist/server/routes/agent.d.ts.map +1 -1
- package/dist/server/routes/agent.js +196 -9
- package/dist/server/routes/agent.js.map +1 -1
- package/dist/server/routes/batch.js +5 -5
- package/dist/server/routes/batch.js.map +1 -1
- package/dist/server/routes/compat.d.ts.map +1 -1
- package/dist/server/routes/compat.js +1 -0
- package/dist/server/routes/compat.js.map +1 -1
- package/dist/server/routes/fetch.d.ts.map +1 -1
- package/dist/server/routes/fetch.js +60 -6
- package/dist/server/routes/fetch.js.map +1 -1
- package/dist/server/routes/mcp.d.ts.map +1 -1
- package/dist/server/routes/mcp.js +103 -2
- package/dist/server/routes/mcp.js.map +1 -1
- package/dist/server/routes/search.js +1 -1
- package/dist/server/routes/search.js.map +1 -1
- package/dist/types.d.ts +55 -4
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +4 -1
- package/dist/types.js.map +1 -1
- package/llms.txt +55 -125
- package/package.json +15 -1
package/llms.txt
CHANGED
|
@@ -1,90 +1,60 @@
|
|
|
1
|
-
# WebPeel
|
|
1
|
+
# WebPeel — Fast web data for AI agents
|
|
2
2
|
|
|
3
|
-
>
|
|
4
|
-
|
|
5
|
-
WebPeel is an open-source web fetcher that converts any URL to clean, AI-ready markdown. Smart escalation tries fast HTTP first (~150ms), auto-escalates to headless browser when needed, and uses stealth mode for heavily protected sites. v0.14.0 adds YouTube transcript extraction, domain-aware extractors (Twitter/X, Reddit, GitHub, HN), LLM-free BM25 Q&A, reader mode, auto-extract, deep fetch intelligence, and URL monitoring.
|
|
3
|
+
> Open source web fetcher, scraper, and data extractor. One call to fetch any URL as clean markdown, extract structured data, or search the web.
|
|
6
4
|
|
|
7
5
|
## Quick Start
|
|
8
6
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
-
|
|
44
|
-
-
|
|
45
|
-
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
Authentication: `Authorization: Bearer YOUR_API_KEY` or anonymous (500 free fetches/week, no signup required).
|
|
62
|
-
|
|
63
|
-
## MCP Tools
|
|
64
|
-
|
|
65
|
-
- `webpeel_fetch` — Fetch a URL, return clean markdown. Params: url (required), render (boolean), wait (ms), format (markdown|text|html), actions (JSON array), readable (boolean)
|
|
66
|
-
- `webpeel_search` — Search the web. Params: query (required), count (1-10), provider (duckduckgo|brave)
|
|
67
|
-
- `webpeel_crawl` — Crawl a website. Params: url (required), maxPages (number), maxDepth (number)
|
|
68
|
-
- `webpeel_map` — Discover URLs on a domain. Params: url (required), maxUrls (number)
|
|
69
|
-
- `webpeel_extract` — Extract structured data. Params: url (required), schema (JSON Schema), prompt (string)
|
|
70
|
-
- `webpeel_batch` — Fetch multiple URLs. Params: urls (array), concurrency (number)
|
|
71
|
-
- `webpeel_screenshot` — Capture full-page or viewport screenshot. Params: url (required), fullPage (boolean), format (png|jpeg)
|
|
72
|
-
- `webpeel_summarize` — Fetch and return condensed content summary. Params: url (required), focus (string)
|
|
73
|
-
- `webpeel_answer` — Answer a question about any URL without BYOK. Params: url (required), question (required)
|
|
74
|
-
- `webpeel_brand` — Extract brand info (name, logo, colors, social links). Params: url (required)
|
|
75
|
-
- `webpeel_change_track` — Detect content changes (hash-based monitoring). Params: url (required), previousHash (string)
|
|
76
|
-
- `webpeel_deep_fetch` — Search + batch fetch + BM25 merge. Params: query (required), count (1-20). Comparison mode auto-detected.
|
|
77
|
-
- `webpeel_research` — Research agent. Params: prompt (required), urls (array), maxPages (number)
|
|
78
|
-
- `webpeel_youtube` — Extract YouTube video transcripts. Params: url (required). Supports all YouTube URL formats (watch, youtu.be, embed, shorts). No API key needed.
|
|
79
|
-
- `webpeel_auto_extract` — Heuristic structured data extraction — auto-detects page type (pricing, product, contact, article, api-docs). Params: url (required)
|
|
80
|
-
- `webpeel_quick_answer` — BM25-powered Q&A, no LLM key needed. Params: url (required), question (required)
|
|
81
|
-
- `webpeel_watch` — Persistent URL change monitoring with webhook notifications. Params: url (required), webhookUrl (required), schedule (cron string)
|
|
82
|
-
|
|
83
|
-
## MCP Configuration
|
|
84
|
-
|
|
85
|
-
Works with: Claude Desktop, Cursor, VS Code (Cline), Windsurf, Continue.dev, OpenClaw, and any MCP client.
|
|
86
|
-
|
|
87
|
-
```json
|
|
7
|
+
npm install webpeel
|
|
8
|
+
npx webpeel "https://example.com"
|
|
9
|
+
|
|
10
|
+
## API
|
|
11
|
+
|
|
12
|
+
POST https://api.webpeel.dev/v1/fetch
|
|
13
|
+
Content-Type: application/json
|
|
14
|
+
Authorization: Bearer wp_live_YOUR_KEY
|
|
15
|
+
|
|
16
|
+
{
|
|
17
|
+
"url": "https://example.com",
|
|
18
|
+
"format": "markdown"
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
## CLI Commands
|
|
22
|
+
|
|
23
|
+
webpeel <url> Fetch URL as markdown
|
|
24
|
+
webpeel <url> --json Fetch as JSON with metadata
|
|
25
|
+
webpeel <url> --clean AI-optimized output (no URLs)
|
|
26
|
+
webpeel <url> -q "question?" Quick answer from page content
|
|
27
|
+
webpeel search "query" Web search
|
|
28
|
+
webpeel search "query" --site ebay Site-specific search
|
|
29
|
+
webpeel crawl <url> Crawl site pages
|
|
30
|
+
webpeel map <url> Discover site URLs
|
|
31
|
+
webpeel batch <file> Batch fetch URLs
|
|
32
|
+
webpeel pipe <url> Pipe-friendly JSON (no UI)
|
|
33
|
+
webpeel mcp Start MCP server
|
|
34
|
+
|
|
35
|
+
## Key Features
|
|
36
|
+
|
|
37
|
+
- Domain-first extraction: Reddit, GitHub, Wikipedia, Twitter/X, HN, YouTube, ArXiv, StackOverflow, NPM — instant via API, no browser
|
|
38
|
+
- Smart content extraction: BM25 scoring, budget distillation, JSON-LD parsing
|
|
39
|
+
- Quick answers: Ask questions about any page (no LLM needed)
|
|
40
|
+
- Anti-bot handling: Stealth mode, proxy rotation, graceful degradation
|
|
41
|
+
- Format options: markdown, text, html, clean (AI-optimized)
|
|
42
|
+
- MCP server: 18 tools for AI agent integration
|
|
43
|
+
- Site search: Search eBay, Amazon, GitHub, and 20+ sites with structured output
|
|
44
|
+
|
|
45
|
+
## Formats
|
|
46
|
+
|
|
47
|
+
| Format | Flag | Use Case |
|
|
48
|
+
|--------|------|----------|
|
|
49
|
+
| markdown | (default) | General use, documentation |
|
|
50
|
+
| text | --text | Plain text, no formatting |
|
|
51
|
+
| html | --html | Raw HTML preservation |
|
|
52
|
+
| clean | --clean | AI consumption (no URL noise) |
|
|
53
|
+
| json | --json | Programmatic access with metadata |
|
|
54
|
+
|
|
55
|
+
## MCP Integration
|
|
56
|
+
|
|
57
|
+
Add to your MCP config:
|
|
88
58
|
{
|
|
89
59
|
"mcpServers": {
|
|
90
60
|
"webpeel": {
|
|
@@ -93,50 +63,10 @@ Works with: Claude Desktop, Cursor, VS Code (Cline), Windsurf, Continue.dev, Ope
|
|
|
93
63
|
}
|
|
94
64
|
}
|
|
95
65
|
}
|
|
96
|
-
```
|
|
97
|
-
|
|
98
|
-
Hosted MCP (no local install): `https://api.webpeel.dev/mcp`
|
|
99
|
-
|
|
100
|
-
## Key Features
|
|
101
|
-
|
|
102
|
-
- **Smart escalation**: HTTP (~150ms) → Playwright browser (~2s) → Stealth mode (~5s) — only escalates when needed
|
|
103
|
-
- **Page actions**: Click, type, scroll, wait, press, select, hover before scraping
|
|
104
|
-
- **Screenshot API**: Full-page or viewport, PNG/JPEG, custom dimensions
|
|
105
|
-
- **PDF & DOCX parsing**: Feed a document URL, get clean markdown
|
|
106
|
-
- **Structured extraction**: Pass a JSON Schema + your LLM key (BYOK), get structured data
|
|
107
|
-
- **Branding extraction**: Extract colors, fonts, logos, and brand assets
|
|
108
|
-
- **Change tracking**: Monitor content changes over time with fingerprint diffing
|
|
109
|
-
- **Crawl & map**: Full site crawling with depth control, async jobs, webhooks
|
|
110
|
-
- **Web search**: DuckDuckGo (free, no key) or Brave Search (BYOK)
|
|
111
|
-
- **Answer endpoint**: Search + fetch + LLM-generated answer with citations
|
|
112
|
-
- **Research agent**: Autonomous multi-page research with streaming
|
|
113
|
-
- **Firecrawl-compatible**: Drop-in replacement — change one URL, your code works
|
|
114
|
-
- **Anti-bot bypass**: Cloudflare, DataDome, JavaScript walls, 403s
|
|
115
|
-
- **Token-optimized**: Strips navigation, ads, scripts, cookie banners
|
|
116
|
-
- **SSRF protection**: Blocks private IPs, IPv6 mapped addresses, redirect attacks
|
|
117
|
-
- **Open source**: AGPL-3.0 licensed, fully self-hostable
|
|
118
|
-
|
|
119
|
-
## Pricing
|
|
120
|
-
|
|
121
|
-
- **Free**: 500 fetches/week, 50/hr burst — no credit card required
|
|
122
|
-
- **Pro**: $9/mo — 1,250/week, 100/hr burst
|
|
123
|
-
- **Max**: $29/mo — 6,250/week, 500/hr burst
|
|
124
|
-
- All features on all plans (no feature-gating)
|
|
125
|
-
- Extra usage: Basic $0.002, Stealth $0.01, Search $0.001 per credit
|
|
126
|
-
|
|
127
|
-
## SDKs & Integrations
|
|
128
|
-
|
|
129
|
-
- **CLI**: `npm install -g webpeel`
|
|
130
|
-
- **Python SDK**: `pip install webpeel` (zero deps)
|
|
131
|
-
- **TypeScript/Node.js**: `npm install webpeel`
|
|
132
|
-
- **LangChain**: WebPeelLoader integration
|
|
133
|
-
- **LlamaIndex**: WebPeelReader integration
|
|
134
66
|
|
|
135
67
|
## Links
|
|
136
68
|
|
|
137
|
-
-
|
|
138
|
-
- API
|
|
69
|
+
- Docs: https://webpeel.dev/docs
|
|
70
|
+
- API: https://api.webpeel.dev
|
|
139
71
|
- GitHub: https://github.com/webpeel/webpeel
|
|
140
72
|
- npm: https://www.npmjs.com/package/webpeel
|
|
141
|
-
- Status: https://webpeel.dev/status
|
|
142
|
-
- Changelog: https://webpeel.dev/changelog
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.16.0",
|
|
3
|
+
"version": "0.17.0",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|
|
@@ -14,6 +14,18 @@
|
|
|
14
14
|
".": {
|
|
15
15
|
"import": "./dist/index.js",
|
|
16
16
|
"types": "./dist/index.d.ts"
|
|
17
|
+
},
|
|
18
|
+
"./integrations/langchain": {
|
|
19
|
+
"import": "./dist/integrations/langchain.js",
|
|
20
|
+
"types": "./dist/integrations/langchain.d.ts"
|
|
21
|
+
},
|
|
22
|
+
"./integrations/llamaindex": {
|
|
23
|
+
"import": "./dist/integrations/llamaindex.js",
|
|
24
|
+
"types": "./dist/integrations/llamaindex.d.ts"
|
|
25
|
+
},
|
|
26
|
+
"./integrations": {
|
|
27
|
+
"import": "./dist/integrations/index.js",
|
|
28
|
+
"types": "./dist/integrations/index.d.ts"
|
|
17
29
|
}
|
|
18
30
|
},
|
|
19
31
|
"files": [
|
|
@@ -92,7 +104,9 @@
|
|
|
92
104
|
"dependencies": {
|
|
93
105
|
"@modelcontextprotocol/sdk": "^1.0.4",
|
|
94
106
|
"cheerio": "^1.0.0",
|
|
107
|
+
"cloakbrowser": "^0.1.8",
|
|
95
108
|
"commander": "^12.0.0",
|
|
109
|
+
"cycletls": "^2.0.5",
|
|
96
110
|
"lru-cache": "^11.0.2",
|
|
97
111
|
"mammoth": "^1.11.0",
|
|
98
112
|
"ora": "^8.0.1",
|