crawl4ai-cloud-sdk 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawl4ai_cloud_sdk-0.2.0/PKG-INFO +216 -0
- crawl4ai_cloud_sdk-0.2.0/README.md +187 -0
- crawl4ai_cloud_sdk-0.2.0/crawl4ai_cloud/__init__.py +100 -0
- crawl4ai_cloud_sdk-0.2.0/crawl4ai_cloud/_client.py +190 -0
- crawl4ai_cloud_sdk-0.2.0/crawl4ai_cloud/configs.py +523 -0
- crawl4ai_cloud_sdk-0.2.0/crawl4ai_cloud/crawler.py +779 -0
- crawl4ai_cloud_sdk-0.2.0/crawl4ai_cloud/errors.py +91 -0
- crawl4ai_cloud_sdk-0.2.0/crawl4ai_cloud/models.py +502 -0
- crawl4ai_cloud_sdk-0.2.0/crawl4ai_cloud_sdk.egg-info/PKG-INFO +216 -0
- crawl4ai_cloud_sdk-0.2.0/crawl4ai_cloud_sdk.egg-info/SOURCES.txt +15 -0
- crawl4ai_cloud_sdk-0.2.0/crawl4ai_cloud_sdk.egg-info/dependency_links.txt +1 -0
- crawl4ai_cloud_sdk-0.2.0/crawl4ai_cloud_sdk.egg-info/requires.txt +5 -0
- crawl4ai_cloud_sdk-0.2.0/crawl4ai_cloud_sdk.egg-info/top_level.txt +1 -0
- crawl4ai_cloud_sdk-0.2.0/pyproject.toml +55 -0
- crawl4ai_cloud_sdk-0.2.0/setup.cfg +4 -0
- crawl4ai_cloud_sdk-0.2.0/tests/test_crawl.py +211 -0
- crawl4ai_cloud_sdk-0.2.0/tests/test_e2e.py +1065 -0
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: crawl4ai-cloud-sdk
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Lightweight cloud SDK for Crawl4AI - mirrors the OSS API
|
|
5
|
+
Author-email: Unclecode <unclecode@kidocode.com>
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://api.crawl4ai.com
|
|
8
|
+
Project-URL: Documentation, https://api.crawl4ai.com/docs
|
|
9
|
+
Project-URL: Repository, https://github.com/unclecode/crawl4ai-cloud-sdk
|
|
10
|
+
Project-URL: Issues, https://github.com/unclecode/crawl4ai-cloud-sdk/issues
|
|
11
|
+
Project-URL: Discord, https://discord.gg/jP8KfhDhyN
|
|
12
|
+
Keywords: crawl4ai,web-scraping,crawler,cloud,api,web-crawler,scraping,markdown
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
22
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
23
|
+
Requires-Python: >=3.9
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
Requires-Dist: httpx>=0.27.0
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
|
|
29
|
+
|
|
30
|
+
# Crawl4AI Cloud SDK for Python
|
|
31
|
+
|
|
32
|
+
Lightweight Python SDK for [Crawl4AI Cloud](https://api.crawl4ai.com). Mirrors the OSS API exactly.
|
|
33
|
+
|
|
34
|
+
> **Note:** This SDK is for **Crawl4AI Cloud** (api.crawl4ai.com), the managed cloud service. For the self-hosted open-source version, see [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai).
|
|
35
|
+
|
|
36
|
+
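[![PyPI version](https://badge.fury.io/py/crawl4ai-cloud-sdk.svg)](https://badge.fury.io/py/crawl4ai-cloud-sdk)
|
|
37
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/crawl4ai-cloud-sdk.svg)](https://pypi.org/project/crawl4ai-cloud-sdk/)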
|
|
38
|
+
|
|
39
|
+
## Installation
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
# From PyPI
|
|
43
|
+
pip install crawl4ai-cloud-sdk
|
|
44
|
+
|
|
45
|
+
# From GitHub (available now)
|
|
46
|
+
pip install git+https://github.com/unclecode/crawl4ai-cloud-sdk.git#subdirectory=python
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Get Your API Key
|
|
50
|
+
|
|
51
|
+
1. Go to [api.crawl4ai.com](https://api.crawl4ai.com)
|
|
52
|
+
2. Sign up and get your API key
|
|
53
|
+
|
|
54
|
+
## Quick Start
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
import asyncio
|
|
58
|
+
from crawl4ai_cloud import AsyncWebCrawler
|
|
59
|
+
|
|
60
|
+
async def main():
|
|
61
|
+
async with AsyncWebCrawler(api_key="sk_live_...") as crawler:
|
|
62
|
+
result = await crawler.run("https://example.com")
|
|
63
|
+
print(result.markdown.raw_markdown)
|
|
64
|
+
|
|
65
|
+
asyncio.run(main())
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Features
|
|
69
|
+
|
|
70
|
+
### Single URL Crawl
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
result = await crawler.run("https://example.com")
|
|
74
|
+
print(result.success)
|
|
75
|
+
print(result.markdown.raw_markdown)
|
|
76
|
+
print(result.html)
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Batch Crawl
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
urls = ["https://example.com", "https://httpbin.org/html"]
|
|
83
|
+
|
|
84
|
+
# Wait for results
|
|
85
|
+
results = await crawler.run_many(urls, wait=True)
|
|
86
|
+
for r in results:
|
|
87
|
+
print(f"{r.url}: {r.success}")
|
|
88
|
+
|
|
89
|
+
# Fire and forget (returns job)
|
|
90
|
+
job = await crawler.run_many(urls, wait=False)
|
|
91
|
+
print(f"Job ID: {job.id}")
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Configuration
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
from crawl4ai_cloud import CrawlerRunConfig, BrowserConfig
|
|
98
|
+
|
|
99
|
+
config = CrawlerRunConfig(
|
|
100
|
+
word_count_threshold=10,
|
|
101
|
+
exclude_external_links=True,
|
|
102
|
+
screenshot=True,
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
browser_config = BrowserConfig(
|
|
106
|
+
viewport_width=1920,
|
|
107
|
+
viewport_height=1080,
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
result = await crawler.run(
|
|
111
|
+
"https://example.com",
|
|
112
|
+
config=config,
|
|
113
|
+
browser_config=browser_config,
|
|
114
|
+
)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### Proxy Support
|
|
118
|
+
|
|
119
|
+
```python
|
|
120
|
+
# Shorthand
|
|
121
|
+
result = await crawler.run(url, proxy="datacenter")
|
|
122
|
+
result = await crawler.run(url, proxy="residential")
|
|
123
|
+
|
|
124
|
+
# Full config
|
|
125
|
+
result = await crawler.run(url, proxy={
|
|
126
|
+
"mode": "residential",
|
|
127
|
+
"country": "US"
|
|
128
|
+
})
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Deep Crawl
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
result = await crawler.deep_crawl(
|
|
135
|
+
"https://docs.example.com",
|
|
136
|
+
strategy="bfs",
|
|
137
|
+
max_depth=2,
|
|
138
|
+
max_urls=50,
|
|
139
|
+
wait=True,
|
|
140
|
+
)
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### Job Management
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
# List jobs
|
|
147
|
+
jobs = await crawler.list_jobs(status="completed", limit=10)
|
|
148
|
+
|
|
149
|
+
# Get job status
|
|
150
|
+
job = await crawler.get_job(job_id)
|
|
151
|
+
|
|
152
|
+
# Wait for job
|
|
153
|
+
job = await crawler.wait_job(job_id, poll_interval=2.0)
|
|
154
|
+
|
|
155
|
+
# Cancel job
|
|
156
|
+
await crawler.cancel_job(job_id)
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## Migration from OSS
|
|
160
|
+
|
|
161
|
+
Zero learning curve — your existing code works:
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
# Before (OSS)
|
|
165
|
+
from crawl4ai import AsyncWebCrawler
|
|
166
|
+
async with AsyncWebCrawler() as crawler:
|
|
167
|
+
result = await crawler.arun(url)
|
|
168
|
+
|
|
169
|
+
# After (Cloud)
|
|
170
|
+
from crawl4ai_cloud import AsyncWebCrawler
|
|
171
|
+
async with AsyncWebCrawler(api_key="sk_live_...") as crawler:
|
|
172
|
+
result = await crawler.run(url) # arun() also works!
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
## Environment Variables
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
export CRAWL4AI_API_KEY=sk_live_...
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
```python
|
|
182
|
+
# API key auto-loaded from environment
|
|
183
|
+
crawler = AsyncWebCrawler()
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## Error Handling
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
from crawl4ai_cloud import (
|
|
190
|
+
CloudError,
|
|
191
|
+
AuthenticationError,
|
|
192
|
+
RateLimitError,
|
|
193
|
+
QuotaExceededError,
|
|
194
|
+
NotFoundError,
|
|
195
|
+
)
|
|
196
|
+
|
|
197
|
+
try:
|
|
198
|
+
result = await crawler.run(url)
|
|
199
|
+
except AuthenticationError:
|
|
200
|
+
print("Invalid API key")
|
|
201
|
+
except RateLimitError as e:
|
|
202
|
+
print(f"Rate limited. Retry after {e.retry_after}s")
|
|
203
|
+
except QuotaExceededError:
|
|
204
|
+
print("Quota exceeded")
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
## Links
|
|
208
|
+
|
|
209
|
+
- [Cloud Dashboard](https://api.crawl4ai.com) - Sign up & get your API key
|
|
210
|
+
- [Cloud API Docs](https://api.crawl4ai.com/docs) - Full API reference
|
|
211
|
+
- [OSS Repository](https://github.com/unclecode/crawl4ai) - Self-hosted option
|
|
212
|
+
- [Discord](https://discord.gg/jP8KfhDhyN) - Community & support
|
|
213
|
+
|
|
214
|
+
## License
|
|
215
|
+
|
|
216
|
+
Apache 2.0
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
# Crawl4AI Cloud SDK for Python
|
|
2
|
+
|
|
3
|
+
Lightweight Python SDK for [Crawl4AI Cloud](https://api.crawl4ai.com). Mirrors the OSS API exactly.
|
|
4
|
+
|
|
5
|
+
> **Note:** This SDK is for **Crawl4AI Cloud** (api.crawl4ai.com), the managed cloud service. For the self-hosted open-source version, see [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai).
|
|
6
|
+
|
|
7
|
+
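[![PyPI version](https://badge.fury.io/py/crawl4ai-cloud-sdk.svg)](https://badge.fury.io/py/crawl4ai-cloud-sdk)
|
|
8
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/crawl4ai-cloud-sdk.svg)](https://pypi.org/project/crawl4ai-cloud-sdk/)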
|
|
9
|
+
|
|
10
|
+
## Installation
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
# From PyPI
|
|
14
|
+
pip install crawl4ai-cloud-sdk
|
|
15
|
+
|
|
16
|
+
# From GitHub (available now)
|
|
17
|
+
pip install git+https://github.com/unclecode/crawl4ai-cloud-sdk.git#subdirectory=python
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Get Your API Key
|
|
21
|
+
|
|
22
|
+
1. Go to [api.crawl4ai.com](https://api.crawl4ai.com)
|
|
23
|
+
2. Sign up and get your API key
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
import asyncio
|
|
29
|
+
from crawl4ai_cloud import AsyncWebCrawler
|
|
30
|
+
|
|
31
|
+
async def main():
|
|
32
|
+
async with AsyncWebCrawler(api_key="sk_live_...") as crawler:
|
|
33
|
+
result = await crawler.run("https://example.com")
|
|
34
|
+
print(result.markdown.raw_markdown)
|
|
35
|
+
|
|
36
|
+
asyncio.run(main())
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Features
|
|
40
|
+
|
|
41
|
+
### Single URL Crawl
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
result = await crawler.run("https://example.com")
|
|
45
|
+
print(result.success)
|
|
46
|
+
print(result.markdown.raw_markdown)
|
|
47
|
+
print(result.html)
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### Batch Crawl
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
urls = ["https://example.com", "https://httpbin.org/html"]
|
|
54
|
+
|
|
55
|
+
# Wait for results
|
|
56
|
+
results = await crawler.run_many(urls, wait=True)
|
|
57
|
+
for r in results:
|
|
58
|
+
print(f"{r.url}: {r.success}")
|
|
59
|
+
|
|
60
|
+
# Fire and forget (returns job)
|
|
61
|
+
job = await crawler.run_many(urls, wait=False)
|
|
62
|
+
print(f"Job ID: {job.id}")
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Configuration
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
from crawl4ai_cloud import CrawlerRunConfig, BrowserConfig
|
|
69
|
+
|
|
70
|
+
config = CrawlerRunConfig(
|
|
71
|
+
word_count_threshold=10,
|
|
72
|
+
exclude_external_links=True,
|
|
73
|
+
screenshot=True,
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
browser_config = BrowserConfig(
|
|
77
|
+
viewport_width=1920,
|
|
78
|
+
viewport_height=1080,
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
result = await crawler.run(
|
|
82
|
+
"https://example.com",
|
|
83
|
+
config=config,
|
|
84
|
+
browser_config=browser_config,
|
|
85
|
+
)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Proxy Support
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
# Shorthand
|
|
92
|
+
result = await crawler.run(url, proxy="datacenter")
|
|
93
|
+
result = await crawler.run(url, proxy="residential")
|
|
94
|
+
|
|
95
|
+
# Full config
|
|
96
|
+
result = await crawler.run(url, proxy={
|
|
97
|
+
"mode": "residential",
|
|
98
|
+
"country": "US"
|
|
99
|
+
})
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Deep Crawl
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
result = await crawler.deep_crawl(
|
|
106
|
+
"https://docs.example.com",
|
|
107
|
+
strategy="bfs",
|
|
108
|
+
max_depth=2,
|
|
109
|
+
max_urls=50,
|
|
110
|
+
wait=True,
|
|
111
|
+
)
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Job Management
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
# List jobs
|
|
118
|
+
jobs = await crawler.list_jobs(status="completed", limit=10)
|
|
119
|
+
|
|
120
|
+
# Get job status
|
|
121
|
+
job = await crawler.get_job(job_id)
|
|
122
|
+
|
|
123
|
+
# Wait for job
|
|
124
|
+
job = await crawler.wait_job(job_id, poll_interval=2.0)
|
|
125
|
+
|
|
126
|
+
# Cancel job
|
|
127
|
+
await crawler.cancel_job(job_id)
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Migration from OSS
|
|
131
|
+
|
|
132
|
+
Zero learning curve — your existing code works:
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
# Before (OSS)
|
|
136
|
+
from crawl4ai import AsyncWebCrawler
|
|
137
|
+
async with AsyncWebCrawler() as crawler:
|
|
138
|
+
result = await crawler.arun(url)
|
|
139
|
+
|
|
140
|
+
# After (Cloud)
|
|
141
|
+
from crawl4ai_cloud import AsyncWebCrawler
|
|
142
|
+
async with AsyncWebCrawler(api_key="sk_live_...") as crawler:
|
|
143
|
+
result = await crawler.run(url) # arun() also works!
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## Environment Variables
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
export CRAWL4AI_API_KEY=sk_live_...
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
# API key auto-loaded from environment
|
|
154
|
+
crawler = AsyncWebCrawler()
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## Error Handling
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from crawl4ai_cloud import (
|
|
161
|
+
CloudError,
|
|
162
|
+
AuthenticationError,
|
|
163
|
+
RateLimitError,
|
|
164
|
+
QuotaExceededError,
|
|
165
|
+
NotFoundError,
|
|
166
|
+
)
|
|
167
|
+
|
|
168
|
+
try:
|
|
169
|
+
result = await crawler.run(url)
|
|
170
|
+
except AuthenticationError:
|
|
171
|
+
print("Invalid API key")
|
|
172
|
+
except RateLimitError as e:
|
|
173
|
+
print(f"Rate limited. Retry after {e.retry_after}s")
|
|
174
|
+
except QuotaExceededError:
|
|
175
|
+
print("Quota exceeded")
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
## Links
|
|
179
|
+
|
|
180
|
+
- [Cloud Dashboard](https://api.crawl4ai.com) - Sign up & get your API key
|
|
181
|
+
- [Cloud API Docs](https://api.crawl4ai.com/docs) - Full API reference
|
|
182
|
+
- [OSS Repository](https://github.com/unclecode/crawl4ai) - Self-hosted option
|
|
183
|
+
- [Discord](https://discord.gg/jP8KfhDhyN) - Community & support
|
|
184
|
+
|
|
185
|
+
## License
|
|
186
|
+
|
|
187
|
+
Apache 2.0
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Crawl4AI Cloud SDK - Lightweight cloud client for Crawl4AI API.
|
|
3
|
+
|
|
4
|
+
Example:
|
|
5
|
+
```python
|
|
6
|
+
from crawl4ai_cloud import AsyncWebCrawler, CrawlerRunConfig
|
|
7
|
+
|
|
8
|
+
async with AsyncWebCrawler(api_key="sk_live_xxx") as crawler:
|
|
9
|
+
result = await crawler.run("https://example.com")
|
|
10
|
+
print(result.markdown.raw_markdown)
|
|
11
|
+
```
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
__version__ = "0.2.0"
|
|
15
|
+
|
|
16
|
+
# Main crawler class
|
|
17
|
+
from .crawler import AsyncWebCrawler
|
|
18
|
+
|
|
19
|
+
# Configuration classes
|
|
20
|
+
from .configs import (
|
|
21
|
+
CrawlerRunConfig,
|
|
22
|
+
BrowserConfig,
|
|
23
|
+
build_crawl_request,
|
|
24
|
+
sanitize_crawler_config,
|
|
25
|
+
sanitize_browser_config,
|
|
26
|
+
normalize_proxy,
|
|
27
|
+
normalize_url,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
# Response models
|
|
31
|
+
from .models import (
|
|
32
|
+
CrawlResult,
|
|
33
|
+
CrawlJob,
|
|
34
|
+
JobProgress,
|
|
35
|
+
MarkdownResult,
|
|
36
|
+
DeepCrawlResult,
|
|
37
|
+
ScanUrlInfo,
|
|
38
|
+
ContextResult,
|
|
39
|
+
GeneratedSchema,
|
|
40
|
+
StorageUsage,
|
|
41
|
+
ProxyConfig,
|
|
42
|
+
LLMUsage,
|
|
43
|
+
# Usage metrics
|
|
44
|
+
Usage,
|
|
45
|
+
CrawlUsageMetrics,
|
|
46
|
+
LLMUsageMetrics,
|
|
47
|
+
StorageUsageMetrics,
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
# Errors
|
|
51
|
+
from .errors import (
|
|
52
|
+
CloudError,
|
|
53
|
+
AuthenticationError,
|
|
54
|
+
RateLimitError,
|
|
55
|
+
QuotaExceededError,
|
|
56
|
+
NotFoundError,
|
|
57
|
+
ValidationError,
|
|
58
|
+
TimeoutError,
|
|
59
|
+
ServerError,
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
__all__ = [
|
|
63
|
+
# Version
|
|
64
|
+
"__version__",
|
|
65
|
+
# Main class
|
|
66
|
+
"AsyncWebCrawler",
|
|
67
|
+
# Configs
|
|
68
|
+
"CrawlerRunConfig",
|
|
69
|
+
"BrowserConfig",
|
|
70
|
+
"build_crawl_request",
|
|
71
|
+
"sanitize_crawler_config",
|
|
72
|
+
"sanitize_browser_config",
|
|
73
|
+
"normalize_proxy",
|
|
74
|
+
"normalize_url",
|
|
75
|
+
# Models
|
|
76
|
+
"CrawlResult",
|
|
77
|
+
"CrawlJob",
|
|
78
|
+
"JobProgress",
|
|
79
|
+
"MarkdownResult",
|
|
80
|
+
"DeepCrawlResult",
|
|
81
|
+
"ScanUrlInfo",
|
|
82
|
+
"ContextResult",
|
|
83
|
+
"GeneratedSchema",
|
|
84
|
+
"StorageUsage",
|
|
85
|
+
"ProxyConfig",
|
|
86
|
+
"LLMUsage",
|
|
87
|
+
"Usage",
|
|
88
|
+
"CrawlUsageMetrics",
|
|
89
|
+
"LLMUsageMetrics",
|
|
90
|
+
"StorageUsageMetrics",
|
|
91
|
+
# Errors
|
|
92
|
+
"CloudError",
|
|
93
|
+
"AuthenticationError",
|
|
94
|
+
"RateLimitError",
|
|
95
|
+
"QuotaExceededError",
|
|
96
|
+
"NotFoundError",
|
|
97
|
+
"ValidationError",
|
|
98
|
+
"TimeoutError",
|
|
99
|
+
"ServerError",
|
|
100
|
+
]
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""Internal HTTP client for Crawl4AI Cloud SDK."""
|
|
2
|
+
import asyncio
|
|
3
|
+
import os
|
|
4
|
+
from typing import Optional, Dict, Any
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
|
|
8
|
+
from .errors import (
|
|
9
|
+
CloudError,
|
|
10
|
+
AuthenticationError,
|
|
11
|
+
RateLimitError,
|
|
12
|
+
QuotaExceededError,
|
|
13
|
+
NotFoundError,
|
|
14
|
+
ValidationError,
|
|
15
|
+
ServerError,
|
|
16
|
+
TimeoutError,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
# Reported in the User-Agent header; keep in sync with the package version
# declared in crawl4ai_cloud/__init__.py.
__version__ = "0.2.0"

DEFAULT_BASE_URL = "https://api.crawl4ai.com"
DEFAULT_TIMEOUT = 120.0
DEFAULT_MAX_RETRIES = 3


class HTTPClient:
    """Internal async HTTP client with retries and error mapping."""

    def __init__(
        self,
        api_key: Optional[str] = None,
        base_url: str = DEFAULT_BASE_URL,
        timeout: float = DEFAULT_TIMEOUT,
        max_retries: int = DEFAULT_MAX_RETRIES,
    ):
        """
        Initialize the HTTP client.

        Args:
            api_key: Your Crawl4AI API key (sk_live_* or sk_test_*).
                If not provided, reads from CRAWL4AI_API_KEY env var.
            base_url: API base URL (default: https://api.crawl4ai.com).
                Trailing slashes are stripped.
            timeout: Request timeout in seconds (default: 120)
            max_retries: Max attempts for transient errors (default: 3).
                Values below 1 still result in a single attempt.

        Raises:
            ValueError: If API key is missing or has invalid format
        """
        self._api_key = api_key or os.getenv("CRAWL4AI_API_KEY")

        if not self._api_key:
            raise ValueError(
                "API key is required. Provide it as an argument or set "
                "the CRAWL4AI_API_KEY environment variable."
            )

        if not self._api_key.startswith(("sk_live_", "sk_test_")):
            raise ValueError(
                "Invalid API key format. Expected sk_live_* or sk_test_*"
            )

        self._base_url = base_url.rstrip("/")
        self._timeout = timeout
        self._max_retries = max_retries
        # Created lazily by _get_client() on first use.
        self._client: Optional["httpx.AsyncClient"] = None

    async def _get_client(self) -> "httpx.AsyncClient":
        """Return the underlying httpx client, creating it if absent or closed."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                base_url=self._base_url,
                headers={
                    "X-API-Key": self._api_key,
                    "Content-Type": "application/json",
                    "User-Agent": f"crawl4ai-cloud/{__version__}",
                },
                timeout=httpx.Timeout(self._timeout),
            )
        return self._client

    async def request(
        self,
        method: str,
        path: str,
        params: Optional[Dict[str, Any]] = None,
        json: Optional[Dict[str, Any]] = None,
        timeout: Optional[float] = None,
    ) -> Dict[str, Any]:
        """
        Make HTTP request with error handling and retries.

        Responses with status >= 500 (except 504), client-side timeouts and
        transport errors are retried with exponential backoff (1s, 2s, ...).
        All other error statuses raise immediately.

        Args:
            method: HTTP method (GET, POST, DELETE, etc.)
            path: API endpoint path
            params: Query parameters
            json: JSON body
            timeout: Request timeout override; falls back to the client
                default when omitted

        Returns:
            Parsed JSON response, or an empty dict when the response has
            no body

        Raises:
            AuthenticationError: 401 - Invalid API key
            NotFoundError: 404 - Resource not found
            RateLimitError: 429 - Rate limit exceeded
            QuotaExceededError: 429 - Quota exceeded
            ValidationError: 400 - Invalid request
            TimeoutError: 504 or client timeout
            ServerError: 500/503 - Server error
            CloudError: Other errors
        """
        client = await self._get_client()

        # Always send the request at least once, even if the caller
        # configured max_retries <= 0; otherwise the loop body never runs
        # and every call fails without touching the network.
        attempts = max(1, self._max_retries)

        for attempt in range(attempts):
            is_last_attempt = attempt >= attempts - 1
            try:
                response = await client.request(
                    method,
                    path,
                    params=params,
                    json=json,
                    timeout=timeout or self._timeout,
                )

                # Success
                if response.status_code < 400:
                    if response.content:
                        return response.json()
                    return {}

                # Parse error response. Only a JSON object is treated as
                # structured error data; anything else falls back to the
                # raw response text.
                try:
                    payload = response.json()
                except ValueError:
                    payload = None

                if isinstance(payload, dict):
                    error_data = payload
                    detail = payload.get("detail", str(payload))
                else:
                    error_data = {}
                    detail = response.text or f"HTTP {response.status_code}"

                headers = {k.lower(): v for k, v in response.headers.items()}

                # Map status codes to exceptions
                if response.status_code == 401:
                    raise AuthenticationError(detail, 401, error_data, headers)
                elif response.status_code == 404:
                    raise NotFoundError(detail, 404, error_data, headers)
                elif response.status_code == 429:
                    # "detail" may be a dict or list rather than a string,
                    # so stringify before the substring check.
                    if "rate limit" in str(detail).lower():
                        raise RateLimitError(detail, 429, error_data, headers)
                    else:
                        raise QuotaExceededError(detail, 429, error_data, headers)
                elif response.status_code == 400:
                    raise ValidationError(detail, 400, error_data, headers)
                elif response.status_code == 504:
                    raise TimeoutError(detail, 504, error_data, headers)
                elif response.status_code >= 500:
                    if not is_last_attempt:
                        await asyncio.sleep(2 ** attempt)
                        continue
                    raise ServerError(
                        detail, response.status_code, error_data, headers
                    )
                else:
                    raise CloudError(
                        detail, response.status_code, error_data, headers
                    )

            except httpx.TimeoutException as e:
                if not is_last_attempt:
                    await asyncio.sleep(2 ** attempt)
                    continue
                raise TimeoutError(f"Request timed out: {e}") from e

            except httpx.RequestError as e:
                if not is_last_attempt:
                    await asyncio.sleep(2 ** attempt)
                    continue
                raise CloudError(f"Request failed: {e}") from e

        # Defensive: every loop iteration above returns, raises or continues.
        raise CloudError("Max retries exceeded")

    async def close(self):
        """Close the HTTP client."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()
            self._client = None

    async def __aenter__(self) -> "HTTPClient":
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()
|