webpeel 0.16.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. package/LICENSE +11 -657
  2. package/README.md +246 -325
  3. package/dist/cli.js +330 -73
  4. package/dist/cli.js.map +1 -1
  5. package/dist/core/browser-fetch.d.ts +12 -0
  6. package/dist/core/browser-fetch.d.ts.map +1 -1
  7. package/dist/core/browser-fetch.js +70 -17
  8. package/dist/core/browser-fetch.js.map +1 -1
  9. package/dist/core/cf-worker-proxy.d.ts +33 -0
  10. package/dist/core/cf-worker-proxy.d.ts.map +1 -0
  11. package/dist/core/cf-worker-proxy.js +88 -0
  12. package/dist/core/cf-worker-proxy.js.map +1 -0
  13. package/dist/core/chunker.d.ts +47 -0
  14. package/dist/core/chunker.d.ts.map +1 -0
  15. package/dist/core/chunker.js +250 -0
  16. package/dist/core/chunker.js.map +1 -0
  17. package/dist/core/cloak-fetch.d.ts +43 -0
  18. package/dist/core/cloak-fetch.d.ts.map +1 -0
  19. package/dist/core/cloak-fetch.js +141 -0
  20. package/dist/core/cloak-fetch.js.map +1 -0
  21. package/dist/core/crawl-checkpoint.d.ts +55 -0
  22. package/dist/core/crawl-checkpoint.d.ts.map +1 -0
  23. package/dist/core/crawl-checkpoint.js +105 -0
  24. package/dist/core/crawl-checkpoint.js.map +1 -0
  25. package/dist/core/crawler.d.ts +5 -1
  26. package/dist/core/crawler.d.ts.map +1 -1
  27. package/dist/core/crawler.js +60 -5
  28. package/dist/core/crawler.js.map +1 -1
  29. package/dist/core/cycle-fetch.d.ts +27 -0
  30. package/dist/core/cycle-fetch.d.ts.map +1 -0
  31. package/dist/core/cycle-fetch.js +99 -0
  32. package/dist/core/cycle-fetch.js.map +1 -0
  33. package/dist/core/domain-extractors.d.ts.map +1 -1
  34. package/dist/core/domain-extractors.js +754 -14
  35. package/dist/core/domain-extractors.js.map +1 -1
  36. package/dist/core/google-cache.d.ts +30 -0
  37. package/dist/core/google-cache.d.ts.map +1 -0
  38. package/dist/core/google-cache.js +181 -0
  39. package/dist/core/google-cache.js.map +1 -0
  40. package/dist/core/markdown.d.ts +11 -0
  41. package/dist/core/markdown.d.ts.map +1 -1
  42. package/dist/core/markdown.js +43 -0
  43. package/dist/core/markdown.js.map +1 -1
  44. package/dist/core/peel-tls.d.ts +26 -0
  45. package/dist/core/peel-tls.d.ts.map +1 -0
  46. package/dist/core/peel-tls.js +221 -0
  47. package/dist/core/peel-tls.js.map +1 -0
  48. package/dist/core/pipeline.d.ts +5 -1
  49. package/dist/core/pipeline.d.ts.map +1 -1
  50. package/dist/core/pipeline.js +269 -21
  51. package/dist/core/pipeline.js.map +1 -1
  52. package/dist/core/schema-postprocess.d.ts +33 -0
  53. package/dist/core/schema-postprocess.d.ts.map +1 -0
  54. package/dist/core/schema-postprocess.js +470 -0
  55. package/dist/core/schema-postprocess.js.map +1 -0
  56. package/dist/core/schema-templates.d.ts +20 -0
  57. package/dist/core/schema-templates.d.ts.map +1 -0
  58. package/dist/core/schema-templates.js +131 -0
  59. package/dist/core/schema-templates.js.map +1 -0
  60. package/dist/core/search-fallback.d.ts +28 -0
  61. package/dist/core/search-fallback.d.ts.map +1 -0
  62. package/dist/core/search-fallback.js +185 -0
  63. package/dist/core/search-fallback.js.map +1 -0
  64. package/dist/core/search-provider.d.ts +47 -4
  65. package/dist/core/search-provider.d.ts.map +1 -1
  66. package/dist/core/search-provider.js +278 -7
  67. package/dist/core/search-provider.js.map +1 -1
  68. package/dist/core/stealth-patches.d.ts +58 -0
  69. package/dist/core/stealth-patches.d.ts.map +1 -0
  70. package/dist/core/stealth-patches.js +340 -0
  71. package/dist/core/stealth-patches.js.map +1 -0
  72. package/dist/core/strategies.d.ts +20 -0
  73. package/dist/core/strategies.d.ts.map +1 -1
  74. package/dist/core/strategies.js +284 -48
  75. package/dist/core/strategies.js.map +1 -1
  76. package/dist/core/strategy-hooks.d.ts +1 -1
  77. package/dist/core/strategy-hooks.d.ts.map +1 -1
  78. package/dist/index.d.ts +11 -0
  79. package/dist/index.d.ts.map +1 -1
  80. package/dist/index.js +37 -15
  81. package/dist/index.js.map +1 -1
  82. package/dist/mcp/server.js +109 -4
  83. package/dist/mcp/server.js.map +1 -1
  84. package/dist/server/app.d.ts.map +1 -1
  85. package/dist/server/app.js +29 -0
  86. package/dist/server/app.js.map +1 -1
  87. package/dist/server/middleware/rate-limit.d.ts +2 -1
  88. package/dist/server/middleware/rate-limit.d.ts.map +1 -1
  89. package/dist/server/middleware/rate-limit.js +24 -8
  90. package/dist/server/middleware/rate-limit.js.map +1 -1
  91. package/dist/server/routes/agent.d.ts +4 -0
  92. package/dist/server/routes/agent.d.ts.map +1 -1
  93. package/dist/server/routes/agent.js +196 -9
  94. package/dist/server/routes/agent.js.map +1 -1
  95. package/dist/server/routes/batch.js +5 -5
  96. package/dist/server/routes/batch.js.map +1 -1
  97. package/dist/server/routes/compat.d.ts.map +1 -1
  98. package/dist/server/routes/compat.js +1 -0
  99. package/dist/server/routes/compat.js.map +1 -1
  100. package/dist/server/routes/fetch.d.ts.map +1 -1
  101. package/dist/server/routes/fetch.js +60 -6
  102. package/dist/server/routes/fetch.js.map +1 -1
  103. package/dist/server/routes/mcp.d.ts.map +1 -1
  104. package/dist/server/routes/mcp.js +103 -2
  105. package/dist/server/routes/mcp.js.map +1 -1
  106. package/dist/server/routes/search.js +1 -1
  107. package/dist/server/routes/search.js.map +1 -1
  108. package/dist/types.d.ts +55 -4
  109. package/dist/types.d.ts.map +1 -1
  110. package/dist/types.js +4 -1
  111. package/dist/types.js.map +1 -1
  112. package/llms.txt +55 -125
  113. package/package.json +15 -1
package/llms.txt CHANGED
@@ -1,90 +1,60 @@
1
- # WebPeel
1
+ # WebPeel — Fast web data for AI agents
2
2
 
3
- > Version: 0.14.0 | 18 MCP tools | 927 tests | The web data API for AI agents. Fetch, search, crawl, extract, and research — one tool, zero config.
4
-
5
- WebPeel is an open-source web fetcher that converts any URL to clean, AI-ready markdown. Smart escalation tries fast HTTP first (~150ms), auto-escalates to headless browser when needed, and uses stealth mode for heavily protected sites. v0.14.0 adds YouTube transcript extraction, domain-aware extractors (Twitter/X, Reddit, GitHub, HN), LLM-free BM25 Q&A, reader mode, auto-extract, deep fetch intelligence, and URL monitoring.
3
+ > Open source web fetcher, scraper, and data extractor. One call to fetch any URL as clean markdown, extract structured data, or search the web.
6
4
 
7
5
  ## Quick Start
8
6
 
9
- ```bash
10
- # CLI
11
- npx webpeel https://example.com
12
-
13
- # With browser rendering (JS-heavy sites)
14
- npx webpeel https://example.com --render
15
-
16
- # Search the web
17
- npx webpeel search "latest AI news"
18
-
19
- # Crawl a site
20
- npx webpeel crawl https://example.com --max-pages 20
21
-
22
- # Screenshot
23
- npx webpeel screenshot https://example.com --full-page
24
-
25
- # AI-powered answer with citations
26
- npx webpeel answer "What is WebPeel?" --llm openai
27
-
28
- # Research agent
29
- npx webpeel research "Compare React vs Vue in 2025" --llm-key $OPENAI_API_KEY
30
-
31
- # Library
32
- import { peel } from 'webpeel';
33
- const result = await peel('https://example.com');
34
-
35
- # MCP Server (Claude Desktop / Cursor / VS Code / Windsurf / Cline)
36
- npx webpeel mcp
37
- ```
38
-
39
- ## API Endpoints
40
-
41
- Base URL: `https://api.webpeel.dev`
42
-
43
- - `GET /v1/fetch?url=URL` — Fetch a URL as markdown/text/HTML
44
- - `POST /v1/fetch` — Fetch with actions, extraction, advanced options
45
- - `GET /v1/search?q=QUERY` — Web search (DuckDuckGo free, Brave BYOK)
46
- - `POST /v1/crawl` — Crawl a website (async job with webhook)
47
- - `POST /v1/map` — Discover all URLs on a domain
48
- - `POST /v1/screenshot` — Screenshot a URL (PNG/JPEG, full-page)
49
- - `POST /v1/answer` — Search + fetch + LLM answer with citations (BYOK)
50
- - `GET /v1/answer/quick?url=URL&question=Q` — BM25 Q&A, no LLM key needed
51
- - `POST /v1/deep-fetch` — Search + batch fetch + BM25 merge + dedup. Params: query, count. No LLM key needed.
52
- - `GET /v1/youtube?url=YT_URL` — Extract YouTube video transcript. All URL formats. No API key.
53
- - `GET /v1/extract/auto?url=URL` — Auto-detect page type and extract structured JSON
54
- - `GET /v1/watch` — List URL watchers
55
- - `POST /v1/watch` — Create URL watcher with webhook notification
56
- - `POST /v1/agent` — Autonomous research agent (BYOK)
57
- - `POST /v1/batch` — Fetch multiple URLs in parallel
58
- - `POST /v2/scrape` — Firecrawl-compatible endpoint (drop-in replacement)
59
- - `GET /health` — API health check
60
-
61
- Authentication: `Authorization: Bearer YOUR_API_KEY` or anonymous (500 free fetches/week, no signup required).
62
-
63
- ## MCP Tools
64
-
65
- - `webpeel_fetch` — Fetch a URL, return clean markdown. Params: url (required), render (boolean), wait (ms), format (markdown|text|html), actions (JSON array), readable (boolean)
66
- - `webpeel_search` — Search the web. Params: query (required), count (1-10), provider (duckduckgo|brave)
67
- - `webpeel_crawl` — Crawl a website. Params: url (required), maxPages (number), maxDepth (number)
68
- - `webpeel_map` — Discover URLs on a domain. Params: url (required), maxUrls (number)
69
- - `webpeel_extract` — Extract structured data. Params: url (required), schema (JSON Schema), prompt (string)
70
- - `webpeel_batch` — Fetch multiple URLs. Params: urls (array), concurrency (number)
71
- - `webpeel_screenshot` — Capture full-page or viewport screenshot. Params: url (required), fullPage (boolean), format (png|jpeg)
72
- - `webpeel_summarize` — Fetch and return condensed content summary. Params: url (required), focus (string)
73
- - `webpeel_answer` — Answer a question about any URL without BYOK. Params: url (required), question (required)
74
- - `webpeel_brand` — Extract brand info (name, logo, colors, social links). Params: url (required)
75
- - `webpeel_change_track` — Detect content changes (hash-based monitoring). Params: url (required), previousHash (string)
76
- - `webpeel_deep_fetch` — Search + batch fetch + BM25 merge. Params: query (required), count (1-20). Comparison mode auto-detected.
77
- - `webpeel_research` — Research agent. Params: prompt (required), urls (array), maxPages (number)
78
- - `webpeel_youtube` — Extract YouTube video transcripts. Params: url (required). Supports all YouTube URL formats (watch, youtu.be, embed, shorts). No API key needed.
79
- - `webpeel_auto_extract` — Heuristic structured data extraction — auto-detects page type (pricing, product, contact, article, api-docs). Params: url (required)
80
- - `webpeel_quick_answer` — BM25-powered Q&A, no LLM key needed. Params: url (required), question (required)
81
- - `webpeel_watch` — Persistent URL change monitoring with webhook notifications. Params: url (required), webhookUrl (required), schedule (cron string)
82
-
83
- ## MCP Configuration
84
-
85
- Works with: Claude Desktop, Cursor, VS Code (Cline), Windsurf, Continue.dev, OpenClaw, and any MCP client.
86
-
87
- ```json
7
+ npm install webpeel
8
+ npx webpeel "https://example.com"
9
+
10
+ ## API
11
+
12
+ POST https://api.webpeel.dev/v1/fetch
13
+ Content-Type: application/json
14
+ Authorization: Bearer wp_live_YOUR_KEY
15
+
16
+ {
17
+ "url": "https://example.com",
18
+ "format": "markdown"
19
+ }
20
+
21
+ ## CLI Commands
22
+
23
+ webpeel <url> Fetch URL as markdown
24
+ webpeel <url> --json Fetch as JSON with metadata
25
+ webpeel <url> --clean AI-optimized output (no URLs)
26
+ webpeel <url> -q "question?" Quick answer from page content
27
+ webpeel search "query" Web search
28
+ webpeel search "query" --site ebay Site-specific search
29
+ webpeel crawl <url> Crawl site pages
30
+ webpeel map <url> Discover site URLs
31
+ webpeel batch <file> Batch fetch URLs
32
+ webpeel pipe <url> Pipe-friendly JSON (no UI)
33
+ webpeel mcp Start MCP server
34
+
35
+ ## Key Features
36
+
37
+ - Domain-first extraction: Reddit, GitHub, Wikipedia, Twitter/X, HN, YouTube, ArXiv, StackOverflow, NPM — instant via API, no browser
38
+ - Smart content extraction: BM25 scoring, budget distillation, JSON-LD parsing
39
+ - Quick answers: Ask questions about any page (no LLM needed)
40
+ - Anti-bot handling: Stealth mode, proxy rotation, graceful degradation
41
+ - Format options: markdown, text, html, clean (AI-optimized)
42
+ - MCP server: 18 tools for AI agent integration
43
+ - Site search: Search eBay, Amazon, GitHub, and 20+ sites with structured output
44
+
45
+ ## Formats
46
+
47
+ | Format | Flag | Use Case |
48
+ |--------|------|----------|
49
+ | markdown | (default) | General use, documentation |
50
+ | text | --text | Plain text, no formatting |
51
+ | html | --html | Raw HTML preservation |
52
+ | clean | --clean | AI consumption (no URL noise) |
53
+ | json | --json | Programmatic access with metadata |
54
+
55
+ ## MCP Integration
56
+
57
+ Add to your MCP config:
88
58
  {
89
59
  "mcpServers": {
90
60
  "webpeel": {
@@ -93,50 +63,10 @@ Works with: Claude Desktop, Cursor, VS Code (Cline), Windsurf, Continue.dev, Ope
93
63
  }
94
64
  }
95
65
  }
96
- ```
97
-
98
- Hosted MCP (no local install): `https://api.webpeel.dev/mcp`
99
-
100
- ## Key Features
101
-
102
- - **Smart escalation**: HTTP (~150ms) → Playwright browser (~2s) → Stealth mode (~5s) — only escalates when needed
103
- - **Page actions**: Click, type, scroll, wait, press, select, hover before scraping
104
- - **Screenshot API**: Full-page or viewport, PNG/JPEG, custom dimensions
105
- - **PDF & DOCX parsing**: Feed a document URL, get clean markdown
106
- - **Structured extraction**: Pass a JSON Schema + your LLM key (BYOK), get structured data
107
- - **Branding extraction**: Extract colors, fonts, logos, and brand assets
108
- - **Change tracking**: Monitor content changes over time with fingerprint diffing
109
- - **Crawl & map**: Full site crawling with depth control, async jobs, webhooks
110
- - **Web search**: DuckDuckGo (free, no key) or Brave Search (BYOK)
111
- - **Answer endpoint**: Search + fetch + LLM-generated answer with citations
112
- - **Research agent**: Autonomous multi-page research with streaming
113
- - **Firecrawl-compatible**: Drop-in replacement — change one URL, your code works
114
- - **Anti-bot bypass**: Cloudflare, DataDome, JavaScript walls, 403s
115
- - **Token-optimized**: Strips navigation, ads, scripts, cookie banners
116
- - **SSRF protection**: Blocks private IPs, IPv6 mapped addresses, redirect attacks
117
- - **Open source**: AGPL-3.0 licensed, fully self-hostable
118
-
119
- ## Pricing
120
-
121
- - **Free**: 500 fetches/week, 50/hr burst — no credit card required
122
- - **Pro**: $9/mo — 1,250/week, 100/hr burst
123
- - **Max**: $29/mo — 6,250/week, 500/hr burst
124
- - All features on all plans (no feature-gating)
125
- - Extra usage: Basic $0.002, Stealth $0.01, Search $0.001 per credit
126
-
127
- ## SDKs & Integrations
128
-
129
- - **CLI**: `npm install -g webpeel`
130
- - **Python SDK**: `pip install webpeel` (zero deps)
131
- - **TypeScript/Node.js**: `npm install webpeel`
132
- - **LangChain**: WebPeelLoader integration
133
- - **LlamaIndex**: WebPeelReader integration
134
66
 
135
67
  ## Links
136
68
 
137
- - Website: https://webpeel.dev
138
- - API Docs: https://webpeel.dev/docs/api-reference
69
+ - Docs: https://webpeel.dev/docs
70
+ - API: https://api.webpeel.dev
139
71
  - GitHub: https://github.com/webpeel/webpeel
140
72
  - npm: https://www.npmjs.com/package/webpeel
141
- - Status: https://webpeel.dev/status
142
- - Changelog: https://webpeel.dev/changelog
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.16.0",
3
+ "version": "0.17.0",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",
@@ -14,6 +14,18 @@
14
14
  ".": {
15
15
  "import": "./dist/index.js",
16
16
  "types": "./dist/index.d.ts"
17
+ },
18
+ "./integrations/langchain": {
19
+ "import": "./dist/integrations/langchain.js",
20
+ "types": "./dist/integrations/langchain.d.ts"
21
+ },
22
+ "./integrations/llamaindex": {
23
+ "import": "./dist/integrations/llamaindex.js",
24
+ "types": "./dist/integrations/llamaindex.d.ts"
25
+ },
26
+ "./integrations": {
27
+ "import": "./dist/integrations/index.js",
28
+ "types": "./dist/integrations/index.d.ts"
17
29
  }
18
30
  },
19
31
  "files": [
@@ -92,7 +104,9 @@
92
104
  "dependencies": {
93
105
  "@modelcontextprotocol/sdk": "^1.0.4",
94
106
  "cheerio": "^1.0.0",
107
+ "cloakbrowser": "^0.1.8",
95
108
  "commander": "^12.0.0",
109
+ "cycletls": "^2.0.5",
96
110
  "lru-cache": "^11.0.2",
97
111
  "mammoth": "^1.11.0",
98
112
  "ora": "^8.0.1",