crawl4ai-cloud-sdk 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawl4ai_cloud/__init__.py +100 -0
- crawl4ai_cloud/_client.py +190 -0
- crawl4ai_cloud/configs.py +523 -0
- crawl4ai_cloud/crawler.py +779 -0
- crawl4ai_cloud/errors.py +91 -0
- crawl4ai_cloud/models.py +502 -0
- crawl4ai_cloud_sdk-0.2.0.dist-info/METADATA +216 -0
- crawl4ai_cloud_sdk-0.2.0.dist-info/RECORD +10 -0
- crawl4ai_cloud_sdk-0.2.0.dist-info/WHEEL +5 -0
- crawl4ai_cloud_sdk-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Crawl4AI Cloud SDK - Lightweight cloud client for Crawl4AI API.
|
|
3
|
+
|
|
4
|
+
Example:
|
|
5
|
+
```python
|
|
6
|
+
from crawl4ai_cloud import AsyncWebCrawler, CrawlerRunConfig
|
|
7
|
+
|
|
8
|
+
async with AsyncWebCrawler(api_key="sk_live_xxx") as crawler:
|
|
9
|
+
result = await crawler.run("https://example.com")
|
|
10
|
+
print(result.markdown.raw_markdown)
|
|
11
|
+
```
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
# Package version, re-exported through ``__all__`` as ``crawl4ai_cloud.__version__``.
__version__ = "0.2.0"
|
|
15
|
+
|
|
16
|
+
# Main crawler class
|
|
17
|
+
from .crawler import AsyncWebCrawler
|
|
18
|
+
|
|
19
|
+
# Configuration classes
|
|
20
|
+
from .configs import (
|
|
21
|
+
CrawlerRunConfig,
|
|
22
|
+
BrowserConfig,
|
|
23
|
+
build_crawl_request,
|
|
24
|
+
sanitize_crawler_config,
|
|
25
|
+
sanitize_browser_config,
|
|
26
|
+
normalize_proxy,
|
|
27
|
+
normalize_url,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
# Response models
|
|
31
|
+
from .models import (
|
|
32
|
+
CrawlResult,
|
|
33
|
+
CrawlJob,
|
|
34
|
+
JobProgress,
|
|
35
|
+
MarkdownResult,
|
|
36
|
+
DeepCrawlResult,
|
|
37
|
+
ScanUrlInfo,
|
|
38
|
+
ContextResult,
|
|
39
|
+
GeneratedSchema,
|
|
40
|
+
StorageUsage,
|
|
41
|
+
ProxyConfig,
|
|
42
|
+
LLMUsage,
|
|
43
|
+
# Usage metrics
|
|
44
|
+
Usage,
|
|
45
|
+
CrawlUsageMetrics,
|
|
46
|
+
LLMUsageMetrics,
|
|
47
|
+
StorageUsageMetrics,
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
# Errors
|
|
51
|
+
from .errors import (
|
|
52
|
+
CloudError,
|
|
53
|
+
AuthenticationError,
|
|
54
|
+
RateLimitError,
|
|
55
|
+
QuotaExceededError,
|
|
56
|
+
NotFoundError,
|
|
57
|
+
ValidationError,
|
|
58
|
+
TimeoutError,
|
|
59
|
+
ServerError,
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
# Public API of the package: the names bound by ``from crawl4ai_cloud import *``.
# Every entry is either defined in this module or imported above from
# ``.crawler``, ``.configs``, ``.models`` or ``.errors``.
__all__ = [
    # Version
    "__version__",
    # Main class
    "AsyncWebCrawler",
    # Configs (config classes plus the request-building / sanitizing helpers)
    "CrawlerRunConfig",
    "BrowserConfig",
    "build_crawl_request",
    "sanitize_crawler_config",
    "sanitize_browser_config",
    "normalize_proxy",
    "normalize_url",
    # Models (response objects and usage metrics)
    "CrawlResult",
    "CrawlJob",
    "JobProgress",
    "MarkdownResult",
    "DeepCrawlResult",
    "ScanUrlInfo",
    "ContextResult",
    "GeneratedSchema",
    "StorageUsage",
    "ProxyConfig",
    "LLMUsage",
    "Usage",
    "CrawlUsageMetrics",
    "LLMUsageMetrics",
    "StorageUsageMetrics",
    # Errors
    # NOTE: "TimeoutError" has the same name as the Python builtin, so a star
    # import of this package rebinds ``TimeoutError`` in the importing module.
    "CloudError",
    "AuthenticationError",
    "RateLimitError",
    "QuotaExceededError",
    "NotFoundError",
    "ValidationError",
    "TimeoutError",
    "ServerError",
]
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""Internal HTTP client for Crawl4AI Cloud SDK."""
|
|
2
|
+
import asyncio
|
|
3
|
+
import os
|
|
4
|
+
from typing import Optional, Dict, Any
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
|
|
8
|
+
from .errors import (
|
|
9
|
+
CloudError,
|
|
10
|
+
AuthenticationError,
|
|
11
|
+
RateLimitError,
|
|
12
|
+
QuotaExceededError,
|
|
13
|
+
NotFoundError,
|
|
14
|
+
ValidationError,
|
|
15
|
+
ServerError,
|
|
16
|
+
TimeoutError,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
# SDK version reported to the API in the ``User-Agent`` header
# (``crawl4ai-cloud/<version>``). Must match the released package version;
# it was previously left at the stale value "0.1.0".
__version__ = "0.2.0"

# Defaults used by ``HTTPClient.__init__`` when the caller does not override them.
DEFAULT_BASE_URL = "https://api.crawl4ai.com"  # API root, no trailing slash
DEFAULT_TIMEOUT = 120.0  # per-request timeout, in seconds
DEFAULT_MAX_RETRIES = 3  # attempts made for transient failures
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class HTTPClient:
    """Internal async HTTP client with retries and error mapping.

    Owns a lazily created ``httpx.AsyncClient`` that sends the API key in the
    ``X-API-Key`` header. HTTP error statuses are translated into the SDK's
    exception classes; transient failures (5xx other than 504, client-side
    timeouts, and transport errors) are retried with exponential backoff.

    Can be used as an async context manager; leaving the context closes the
    underlying client.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        base_url: str = DEFAULT_BASE_URL,
        timeout: float = DEFAULT_TIMEOUT,
        max_retries: int = DEFAULT_MAX_RETRIES,
    ):
        """
        Initialize the HTTP client.

        Args:
            api_key: Your Crawl4AI API key (sk_live_* or sk_test_*).
                If not provided, reads from CRAWL4AI_API_KEY env var.
            base_url: API base URL (default: https://api.crawl4ai.com).
                A trailing slash is stripped.
            timeout: Request timeout in seconds (default: 120)
            max_retries: Number of attempts made for transient errors
                (default: 3). Values below 1 are treated as a single attempt.

        Raises:
            ValueError: If API key is missing or has invalid format
        """
        self._api_key = api_key or os.getenv("CRAWL4AI_API_KEY")

        if not self._api_key:
            raise ValueError(
                "API key is required. Provide it as an argument or set "
                "the CRAWL4AI_API_KEY environment variable."
            )

        if not self._api_key.startswith(("sk_live_", "sk_test_")):
            raise ValueError(
                "Invalid API key format. Expected sk_live_* or sk_test_*"
            )

        self._base_url = base_url.rstrip("/")
        self._timeout = timeout
        self._max_retries = max_retries
        # Created on first use by _get_client(); reset to None by close().
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the shared ``httpx.AsyncClient``, creating it if absent or closed."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                base_url=self._base_url,
                headers={
                    "X-API-Key": self._api_key,
                    "Content-Type": "application/json",
                    "User-Agent": f"crawl4ai-cloud/{__version__}",
                },
                timeout=httpx.Timeout(self._timeout),
            )
        return self._client

    async def request(
        self,
        method: str,
        path: str,
        params: Optional[Dict[str, Any]] = None,
        json: Optional[Dict[str, Any]] = None,
        timeout: Optional[float] = None,
    ) -> Dict[str, Any]:
        """
        Make HTTP request with error handling and retries.

        Responses with status >= 500 (except 504), client-side timeouts and
        transport errors are retried, sleeping ``2 ** attempt`` seconds
        between attempts. All other error statuses raise immediately.

        Args:
            method: HTTP method (GET, POST, DELETE, etc.)
            path: API endpoint path
            params: Query parameters
            json: JSON body
            timeout: Request timeout override; a falsy value falls back to
                the client's default timeout

        Returns:
            Parsed JSON response, or an empty dict when the response has no body

        Raises:
            AuthenticationError: 401 - Invalid API key
            NotFoundError: 404 - Resource not found
            RateLimitError: 429 - Rate limit exceeded
            QuotaExceededError: 429 - Quota exceeded
            ValidationError: 400 - Invalid request
            TimeoutError: 504 or client timeout
            ServerError: 500/503 - Server error
            CloudError: Other errors
        """
        client = await self._get_client()

        # Always send at least one request: with max_retries <= 0 an empty
        # range() would skip the call entirely and report a bogus failure.
        attempts = max(1, self._max_retries)

        for attempt in range(attempts):
            is_last_attempt = attempt >= attempts - 1

            # Only the network call can raise httpx errors, so keep the try
            # body minimal. TimeoutException must be handled before
            # RequestError because it is a subclass of it.
            try:
                response = await client.request(
                    method,
                    path,
                    params=params,
                    json=json,
                    timeout=timeout or self._timeout,
                )
            except httpx.TimeoutException as e:
                if not is_last_attempt:
                    await asyncio.sleep(2 ** attempt)
                    continue
                raise TimeoutError(f"Request timed out: {e}") from e
            except httpx.RequestError as e:
                if not is_last_attempt:
                    await asyncio.sleep(2 ** attempt)
                    continue
                raise CloudError(f"Request failed: {e}") from e

            status = response.status_code

            # Success
            if status < 400:
                if response.content:
                    return response.json()
                return {}

            # Parse error response. Anything that is not a JSON object
            # (invalid JSON, or a JSON list/scalar) falls back to the raw text.
            try:
                error_data = response.json()
            except ValueError:
                error_data = None

            if isinstance(error_data, dict):
                detail = error_data.get("detail", str(error_data))
            else:
                detail = response.text or f"HTTP {status}"
                error_data = {}

            headers = {k.lower(): v for k, v in response.headers.items()}

            # Map status codes to exceptions
            if status == 401:
                raise AuthenticationError(detail, 401, error_data, headers)
            if status == 404:
                raise NotFoundError(detail, 404, error_data, headers)
            if status == 429:
                # "detail" may be a dict or list rather than a string, so
                # stringify it before the substring check to avoid an
                # AttributeError masking the real 429.
                if "rate limit" in str(detail).lower():
                    raise RateLimitError(detail, 429, error_data, headers)
                raise QuotaExceededError(detail, 429, error_data, headers)
            if status == 400:
                raise ValidationError(detail, 400, error_data, headers)
            if status == 504:
                # Gateway timeouts are reported immediately, not retried.
                raise TimeoutError(detail, 504, error_data, headers)
            if status >= 500:
                if not is_last_attempt:
                    await asyncio.sleep(2 ** attempt)
                    continue
                raise ServerError(detail, status, error_data, headers)

            raise CloudError(detail, status, error_data, headers)

        # Defensive fallback: every path through the final attempt above
        # either returns or raises.
        raise CloudError("Max retries exceeded")

    async def close(self):
        """Close the underlying HTTP client, if open, and drop the reference."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()
            self._client = None

    async def __aenter__(self) -> "HTTPClient":
        """Enter the async context; the client itself is created lazily."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Leave the async context, closing the underlying HTTP client."""
        await self.close()
|