webpeel-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- webpeel-0.1.0/LICENSE +21 -0
- webpeel-0.1.0/PKG-INFO +257 -0
- webpeel-0.1.0/README.md +229 -0
- webpeel-0.1.0/pyproject.toml +51 -0
- webpeel-0.1.0/setup.cfg +4 -0
- webpeel-0.1.0/tests/test_client.py +96 -0
- webpeel-0.1.0/webpeel/__init__.py +35 -0
- webpeel-0.1.0/webpeel/_version.py +3 -0
- webpeel-0.1.0/webpeel/client.py +409 -0
- webpeel-0.1.0/webpeel/exceptions.py +21 -0
- webpeel-0.1.0/webpeel/types.py +131 -0
- webpeel-0.1.0/webpeel.egg-info/PKG-INFO +257 -0
- webpeel-0.1.0/webpeel.egg-info/SOURCES.txt +13 -0
- webpeel-0.1.0/webpeel.egg-info/dependency_links.txt +1 -0
- webpeel-0.1.0/webpeel.egg-info/top_level.txt +1 -0
webpeel-0.1.0/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Jake Liu

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

webpeel-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,257 @@
Metadata-Version: 2.4
Name: webpeel
Version: 0.1.0
Summary: Fast web fetcher for AI agents — smart extraction, stealth mode, structured data
Author-email: Jake Liu <jake@webpeel.dev>
License: MIT
Project-URL: Homepage, https://webpeel.dev
Project-URL: Documentation, https://github.com/JakeLiuMe/webpeel
Project-URL: Repository, https://github.com/JakeLiuMe/webpeel
Project-URL: Issues, https://github.com/JakeLiuMe/webpeel/issues
Keywords: web-scraping,ai,llm,mcp,web-fetcher,markdown,scraper,crawler,ai-agents
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Internet :: WWW/HTTP
Classifier: Topic :: Software Development :: Libraries
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Dynamic: license-file

# WebPeel Python SDK

**Fast web fetcher for AI agents** — smart extraction, stealth mode, structured data.

Zero dependencies. Pure Python 3.8+ stdlib.

## Installation

```bash
pip install webpeel
```

## Quick Start

### Basic Scraping

```python
from webpeel import WebPeel

client = WebPeel()

# Scrape a URL and get clean markdown
result = client.scrape("https://example.com")
print(result.title)
print(result.content)   # Clean markdown content
print(result.metadata)  # Structured metadata
```

### Search the Web

```python
# Search via DuckDuckGo
results = client.search("python web scraping")

for item in results.data.get("web", []):
    print(f"{item['title']}: {item['url']}")
```

### JavaScript-Heavy Sites

```python
# Use browser rendering for SPAs and JS-heavy sites
result = client.scrape(
    "https://twitter.com/elonmusk",
    render=True,  # Enable browser mode
    wait=2000,    # Wait 2s for JS to load
)
```

### Stealth Mode (Bypass Bot Detection)

```python
# Bypass Cloudflare, reCAPTCHA, and anti-bot systems
result = client.scrape(
    "https://protected-site.com",
    stealth=True,  # Enable stealth mode
)
```

### Structured Data Extraction

```python
# Extract specific data using CSS selectors
result = client.scrape(
    "https://amazon.com/product/...",
    extract={
        "selectors": {
            "title": "h1#title",
            "price": "span.price",
            "rating": ".review-rating",
        }
    }
)

print(result.extracted)
# {"title": "Product Name", "price": "$29.99", "rating": "4.5"}
```

### Crawl a Website

```python
# Start an async crawl job (requires API key)
client = WebPeel(api_key="your-api-key")

job = client.crawl(
    "https://docs.example.com",
    limit=100,
    max_depth=3,
)

print(job.id)  # Job ID for tracking

# Check status later
status = client.get_job(job.id)
print(status["status"])  # pending, running, completed, failed
```

### Map a Domain

```python
# Discover all URLs on a domain
result = client.map("https://example.com")

print(f"Found {result.total} URLs")
for url in result.urls[:10]:
    print(url)
```

### Batch Scraping

```python
# Scrape multiple URLs in batch (requires API key)
client = WebPeel(api_key="your-api-key")

urls = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3",
]

job = client.batch_scrape(urls, max_tokens=5000)
print(job.id)
```

## API Reference

### WebPeel Class

```python
WebPeel(
    api_key: Optional[str] = None,
    base_url: str = "https://api.webpeel.dev",
    timeout: int = 30,
)
```

- **`api_key`**: API key for authentication (optional for free tier)
- **`base_url`**: Base URL for the WebPeel API
- **`timeout`**: Request timeout in seconds

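All three parameters can be combined; a minimal sketch (the API key and timeout values here are illustrative):

```python
from webpeel import WebPeel

# Authenticated client with a longer timeout; base_url only needs to be
# overridden when pointing at a non-default deployment.
client = WebPeel(
    api_key="wp_...",
    base_url="https://api.webpeel.dev",
    timeout=60,
)
```
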
### Methods

#### `scrape(url, **options) -> ScrapeResult`

Scrape a URL and extract content.

**Options:**
- `formats`: Output formats (default: `["markdown"]`)
- `max_tokens`: Maximum token count for output
- `render`: Use headless browser (default: `False`)
- `stealth`: Bypass bot detection (default: `False`)
- `wait`: Wait time in ms after page load
- `extract`: Structured data extraction config
- `headers`: Custom HTTP headers

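A short sketch combining several of these options (the URL and values are illustrative; options are passed as keyword arguments, as in the Quick Start examples):

```python
result = client.scrape(
    "https://example.com/blog/post",
    formats=["markdown"],                  # output format(s) to return
    max_tokens=2000,                       # cap the size of the output
    headers={"Accept-Language": "en-US"},  # custom HTTP headers
)
print(result.content)
```
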
#### `search(query, limit=5) -> SearchResult`

Search the web via DuckDuckGo.

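For example (a minimal sketch; the query is illustrative and the result shape follows the Quick Start search example):

```python
results = client.search("html to markdown python", limit=3)
for item in results.data.get("web", []):
    print(f"{item['title']}: {item['url']}")
```
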
#### `crawl(url, limit=50, max_depth=3) -> CrawlResult`

Start an async crawl job (requires API key).

#### `map(url) -> MapResult`

Discover all URLs on a domain.

#### `batch_scrape(urls, **options) -> BatchResult`

Batch scrape multiple URLs (requires API key).

#### `get_job(job_id) -> Dict`

Check status of an async job.

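A minimal polling sketch, assuming the returned dict carries the `status` values shown in the crawl example (`pending`, `running`, `completed`, `failed`); the poll interval is arbitrary:

```python
import time

job = client.crawl("https://docs.example.com", limit=100)
while True:
    status = client.get_job(job.id)
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(5)  # wait a few seconds between status checks
print(status["status"])
```
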
## WebPeel vs Firecrawl

| Feature | WebPeel | Firecrawl |
|---------|---------|-----------|
| **Pricing** | $0 local / $9-$29 cloud | $16-$333/mo |
| **Free Tier** | 125 fetches/week | 500 credits one-time |
| **License** | MIT | AGPL-3.0 |
| **Python SDK Deps** | Zero (pure stdlib) | httpx, pydantic |
| **Smart Escalation** | ✅ Auto HTTP→Browser→Stealth | Manual mode selection |
| **Token Budget** | ✅ `--max-tokens` | ❌ |
| **Quality Scoring** | ✅ 0-1 per response | ❌ |
| **Local CLI** | ✅ Free, unlimited | Requires API key |
| **LangChain** | ✅ | ✅ |
| **LlamaIndex** | ✅ | ✅ |

**WebPeel is the free, fast, MIT-licensed alternative to Firecrawl.**

## Authentication

Free tier: No API key needed. Anonymous usage with rate limits.

Paid tier: Get an API key at [webpeel.dev](https://webpeel.dev).

```python
client = WebPeel(api_key="wp_...")
```

## Error Handling

```python
from webpeel import WebPeel, WebPeelError, RateLimitError, TimeoutError

client = WebPeel()

try:
    result = client.scrape("https://example.com")
except RateLimitError:
    print("Rate limit exceeded. Upgrade or wait.")
except TimeoutError:
    print("Request timeout. Try again.")
except WebPeelError as e:
    print(f"Error: {e}")
```

## License

MIT © Jake Liu

## Links

- [Homepage](https://webpeel.dev)
- [Documentation](https://github.com/JakeLiuMe/webpeel)
- [GitHub](https://github.com/JakeLiuMe/webpeel)
- [Issues](https://github.com/JakeLiuMe/webpeel/issues)

webpeel-0.1.0/README.md
ADDED
@@ -0,0 +1,229 @@
# WebPeel Python SDK

**Fast web fetcher for AI agents** — smart extraction, stealth mode, structured data.

Zero dependencies. Pure Python 3.8+ stdlib.

## Installation

```bash
pip install webpeel
```

## Quick Start

### Basic Scraping

```python
from webpeel import WebPeel

client = WebPeel()

# Scrape a URL and get clean markdown
result = client.scrape("https://example.com")
print(result.title)
print(result.content)   # Clean markdown content
print(result.metadata)  # Structured metadata
```

### Search the Web

```python
# Search via DuckDuckGo
results = client.search("python web scraping")

for item in results.data.get("web", []):
    print(f"{item['title']}: {item['url']}")
```

### JavaScript-Heavy Sites

```python
# Use browser rendering for SPAs and JS-heavy sites
result = client.scrape(
    "https://twitter.com/elonmusk",
    render=True,  # Enable browser mode
    wait=2000,    # Wait 2s for JS to load
)
```

### Stealth Mode (Bypass Bot Detection)

```python
# Bypass Cloudflare, reCAPTCHA, and anti-bot systems
result = client.scrape(
    "https://protected-site.com",
    stealth=True,  # Enable stealth mode
)
```

### Structured Data Extraction

```python
# Extract specific data using CSS selectors
result = client.scrape(
    "https://amazon.com/product/...",
    extract={
        "selectors": {
            "title": "h1#title",
            "price": "span.price",
            "rating": ".review-rating",
        }
    }
)

print(result.extracted)
# {"title": "Product Name", "price": "$29.99", "rating": "4.5"}
```

### Crawl a Website

```python
# Start an async crawl job (requires API key)
client = WebPeel(api_key="your-api-key")

job = client.crawl(
    "https://docs.example.com",
    limit=100,
    max_depth=3,
)

print(job.id)  # Job ID for tracking

# Check status later
status = client.get_job(job.id)
print(status["status"])  # pending, running, completed, failed
```

### Map a Domain

```python
# Discover all URLs on a domain
result = client.map("https://example.com")

print(f"Found {result.total} URLs")
for url in result.urls[:10]:
    print(url)
```

### Batch Scraping

```python
# Scrape multiple URLs in batch (requires API key)
client = WebPeel(api_key="your-api-key")

urls = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3",
]

job = client.batch_scrape(urls, max_tokens=5000)
print(job.id)
```

## API Reference

### WebPeel Class

```python
WebPeel(
    api_key: Optional[str] = None,
    base_url: str = "https://api.webpeel.dev",
    timeout: int = 30,
)
```

- **`api_key`**: API key for authentication (optional for free tier)
- **`base_url`**: Base URL for the WebPeel API
- **`timeout`**: Request timeout in seconds

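All three parameters can be combined; a minimal sketch (the API key and timeout values here are illustrative):

```python
from webpeel import WebPeel

# Authenticated client with a longer timeout; base_url only needs to be
# overridden when pointing at a non-default deployment.
client = WebPeel(
    api_key="wp_...",
    base_url="https://api.webpeel.dev",
    timeout=60,
)
```
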
### Methods

#### `scrape(url, **options) -> ScrapeResult`

Scrape a URL and extract content.

**Options:**
- `formats`: Output formats (default: `["markdown"]`)
- `max_tokens`: Maximum token count for output
- `render`: Use headless browser (default: `False`)
- `stealth`: Bypass bot detection (default: `False`)
- `wait`: Wait time in ms after page load
- `extract`: Structured data extraction config
- `headers`: Custom HTTP headers

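A short sketch combining several of these options (the URL and values are illustrative; options are passed as keyword arguments, as in the Quick Start examples):

```python
result = client.scrape(
    "https://example.com/blog/post",
    formats=["markdown"],                  # output format(s) to return
    max_tokens=2000,                       # cap the size of the output
    headers={"Accept-Language": "en-US"},  # custom HTTP headers
)
print(result.content)
```
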
#### `search(query, limit=5) -> SearchResult`

Search the web via DuckDuckGo.

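For example (a minimal sketch; the query is illustrative and the result shape follows the Quick Start search example):

```python
results = client.search("html to markdown python", limit=3)
for item in results.data.get("web", []):
    print(f"{item['title']}: {item['url']}")
```
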
#### `crawl(url, limit=50, max_depth=3) -> CrawlResult`

Start an async crawl job (requires API key).

#### `map(url) -> MapResult`

Discover all URLs on a domain.

#### `batch_scrape(urls, **options) -> BatchResult`

Batch scrape multiple URLs (requires API key).

#### `get_job(job_id) -> Dict`

Check status of an async job.

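A minimal polling sketch, assuming the returned dict carries the `status` values shown in the crawl example (`pending`, `running`, `completed`, `failed`); the poll interval is arbitrary:

```python
import time

job = client.crawl("https://docs.example.com", limit=100)
while True:
    status = client.get_job(job.id)
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(5)  # wait a few seconds between status checks
print(status["status"])
```
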
## WebPeel vs Firecrawl

| Feature | WebPeel | Firecrawl |
|---------|---------|-----------|
| **Pricing** | $0 local / $9-$29 cloud | $16-$333/mo |
| **Free Tier** | 125 fetches/week | 500 credits one-time |
| **License** | MIT | AGPL-3.0 |
| **Python SDK Deps** | Zero (pure stdlib) | httpx, pydantic |
| **Smart Escalation** | ✅ Auto HTTP→Browser→Stealth | Manual mode selection |
| **Token Budget** | ✅ `--max-tokens` | ❌ |
| **Quality Scoring** | ✅ 0-1 per response | ❌ |
| **Local CLI** | ✅ Free, unlimited | Requires API key |
| **LangChain** | ✅ | ✅ |
| **LlamaIndex** | ✅ | ✅ |

**WebPeel is the free, fast, MIT-licensed alternative to Firecrawl.**

## Authentication

Free tier: No API key needed. Anonymous usage with rate limits.

Paid tier: Get an API key at [webpeel.dev](https://webpeel.dev).

```python
client = WebPeel(api_key="wp_...")
```

## Error Handling

```python
from webpeel import WebPeel, WebPeelError, RateLimitError, TimeoutError

client = WebPeel()

try:
    result = client.scrape("https://example.com")
except RateLimitError:
    print("Rate limit exceeded. Upgrade or wait.")
except TimeoutError:
    print("Request timeout. Try again.")
except WebPeelError as e:
    print(f"Error: {e}")
```

## License

MIT © Jake Liu

## Links

- [Homepage](https://webpeel.dev)
- [Documentation](https://github.com/JakeLiuMe/webpeel)
- [GitHub](https://github.com/JakeLiuMe/webpeel)
- [Issues](https://github.com/JakeLiuMe/webpeel/issues)

webpeel-0.1.0/pyproject.toml
ADDED
@@ -0,0 +1,51 @@
[build-system]
requires = ["setuptools>=64", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "webpeel"
version = "0.1.0"
description = "Fast web fetcher for AI agents — smart extraction, stealth mode, structured data"
readme = "README.md"
license = {text = "MIT"}
requires-python = ">=3.8"
authors = [
    {name = "Jake Liu", email = "jake@webpeel.dev"}
]
keywords = [
    "web-scraping",
    "ai",
    "llm",
    "mcp",
    "web-fetcher",
    "markdown",
    "scraper",
    "crawler",
    "ai-agents",
]
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Internet :: WWW/HTTP",
    "Topic :: Software Development :: Libraries",
    "Topic :: Software Development :: Libraries :: Python Modules",
]
dependencies = []

[project.urls]
Homepage = "https://webpeel.dev"
Documentation = "https://github.com/JakeLiuMe/webpeel"
Repository = "https://github.com/JakeLiuMe/webpeel"
Issues = "https://github.com/JakeLiuMe/webpeel/issues"

[tool.setuptools.packages.find]
where = ["."]
include = ["webpeel*"]
exclude = ["tests*"]

webpeel-0.1.0/setup.cfg
ADDED

webpeel-0.1.0/tests/test_client.py
ADDED
@@ -0,0 +1,96 @@
"""Basic tests for WebPeel SDK."""

import unittest
from unittest.mock import patch, MagicMock
import json

from webpeel import WebPeel, ScrapeResult, WebPeelError


class TestWebPeel(unittest.TestCase):
    """Test WebPeel client."""

    def setUp(self):
        """Set up test client."""
        self.client = WebPeel(api_key="test-key")

    def test_init(self):
        """Test client initialization."""
        self.assertEqual(self.client.api_key, "test-key")
        self.assertEqual(self.client.base_url, "https://api.webpeel.dev")
        self.assertEqual(self.client.timeout, 30)

    def test_init_custom(self):
        """Test client with custom settings."""
        client = WebPeel(
            api_key="custom-key",
            base_url="https://custom.api",
            timeout=60,
        )
        self.assertEqual(client.api_key, "custom-key")
        self.assertEqual(client.base_url, "https://custom.api")
        self.assertEqual(client.timeout, 60)

    @patch('urllib.request.urlopen')
    def test_scrape_basic(self, mock_urlopen):
        """Test basic scrape."""
        # Mock response
        mock_response = MagicMock()
        mock_response.read.return_value = json.dumps({
            "url": "https://example.com",
            "title": "Example Domain",
            "content": "# Example Domain\n\nThis is an example.",
            "metadata": {"description": "Example site"},
            "links": ["https://example.com/about"],
            "tokens": 100,
            "method": "simple",
            "elapsed": 250,
        }).encode('utf-8')
        mock_response.__enter__ = MagicMock(return_value=mock_response)
        mock_response.__exit__ = MagicMock(return_value=False)

        mock_urlopen.return_value = mock_response

        # Call scrape
        result = self.client.scrape("https://example.com")

        # Verify result
        self.assertIsInstance(result, ScrapeResult)
        self.assertEqual(result.url, "https://example.com")
        self.assertEqual(result.title, "Example Domain")
        self.assertEqual(result.method, "simple")
        self.assertEqual(result.tokens, 100)
        self.assertIn("Example Domain", result.content)

    @patch('urllib.request.urlopen')
    def test_scrape_with_options(self, mock_urlopen):
        """Test scrape with options."""
        mock_response = MagicMock()
        mock_response.read.return_value = json.dumps({
            "url": "https://example.com",
            "title": "Example",
            "content": "Content",
            "metadata": {},
            "links": [],
            "tokens": 50,
            "method": "browser",
            "elapsed": 1500,
        }).encode('utf-8')
        mock_response.__enter__ = MagicMock(return_value=mock_response)
        mock_response.__exit__ = MagicMock(return_value=False)

        mock_urlopen.return_value = mock_response

        result = self.client.scrape(
            "https://example.com",
            render=True,
            wait=2000,
            max_tokens=5000,
        )

        self.assertEqual(result.method, "browser")
        self.assertEqual(result.url, "https://example.com")


if __name__ == '__main__':
    unittest.main()

webpeel-0.1.0/webpeel/__init__.py
ADDED
@@ -0,0 +1,35 @@
"""
WebPeel Python SDK

Fast web fetcher for AI agents — smart extraction, stealth mode, structured data.
"""

from .client import WebPeel
from .types import (
    ScrapeResult,
    SearchResult,
    CrawlResult,
    MapResult,
    BatchResult,
)
from .exceptions import (
    WebPeelError,
    AuthError,
    RateLimitError,
    TimeoutError,
)
from ._version import __version__

__all__ = [
    "WebPeel",
    "ScrapeResult",
    "SearchResult",
    "CrawlResult",
    "MapResult",
    "BatchResult",
    "WebPeelError",
    "AuthError",
    "RateLimitError",
    "TimeoutError",
    "__version__",
]