crawl4ai-cloud-sdk 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,216 @@
1
+ Metadata-Version: 2.4
2
+ Name: crawl4ai-cloud-sdk
3
+ Version: 0.2.0
4
+ Summary: Lightweight cloud SDK for Crawl4AI - mirrors the OSS API
5
+ Author-email: Unclecode <unclecode@kidocode.com>
6
+ License-Expression: Apache-2.0
7
+ Project-URL: Homepage, https://api.crawl4ai.com
8
+ Project-URL: Documentation, https://api.crawl4ai.com/docs
9
+ Project-URL: Repository, https://github.com/unclecode/crawl4ai-cloud-sdk
10
+ Project-URL: Issues, https://github.com/unclecode/crawl4ai-cloud-sdk/issues
11
+ Project-URL: Discord, https://discord.gg/jP8KfhDhyN
12
+ Keywords: crawl4ai,web-scraping,crawler,cloud,api,web-crawler,scraping,markdown
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Internet :: WWW/HTTP
22
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
23
+ Requires-Python: >=3.9
24
+ Description-Content-Type: text/markdown
25
+ Requires-Dist: httpx>=0.27.0
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
28
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
29
+
30
+ # Crawl4AI Cloud SDK for Python
31
+
32
+ Lightweight Python SDK for [Crawl4AI Cloud](https://api.crawl4ai.com). Mirrors the OSS API exactly.
33
+
34
+ > **Note:** This SDK is for **Crawl4AI Cloud** (api.crawl4ai.com), the managed cloud service. For the self-hosted open-source version, see [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai).
35
+
36
+ [![PyPI version](https://badge.fury.io/py/crawl4ai-cloud-sdk.svg)](https://badge.fury.io/py/crawl4ai-cloud-sdk)
37
+ [![Python Version](https://img.shields.io/pypi/pyversions/crawl4ai-cloud-sdk)](https://pypi.org/project/crawl4ai-cloud-sdk/)
38
+
39
+ ## Installation
40
+
41
+ ```bash
42
+ # From PyPI
42
+ pip install crawl4ai-cloud-sdk
44
+
45
+ # From GitHub (available now)
46
+ pip install git+https://github.com/unclecode/crawl4ai-cloud-sdk.git#subdirectory=python
47
+ ```
48
+
49
+ ## Get Your API Key
50
+
51
+ 1. Go to [api.crawl4ai.com](https://api.crawl4ai.com)
52
+ 2. Sign up and get your API key
53
+
54
+ ## Quick Start
55
+
56
+ ```python
57
+ import asyncio
58
+ from crawl4ai_cloud import AsyncWebCrawler
59
+
60
+ async def main():
61
+     async with AsyncWebCrawler(api_key="sk_live_...") as crawler:
62
+         result = await crawler.run("https://example.com")
63
+         print(result.markdown.raw_markdown)
64
+
65
+ asyncio.run(main())
66
+ ```
67
+
68
+ ## Features
69
+
70
+ ### Single URL Crawl
71
+
72
+ ```python
73
+ result = await crawler.run("https://example.com")
74
+ print(result.success)
75
+ print(result.markdown.raw_markdown)
76
+ print(result.html)
77
+ ```
78
+
79
+ ### Batch Crawl
80
+
81
+ ```python
82
+ urls = ["https://example.com", "https://httpbin.org/html"]
83
+
84
+ # Wait for results
85
+ results = await crawler.run_many(urls, wait=True)
86
+ for r in results:
87
+     print(f"{r.url}: {r.success}")
88
+
89
+ # Fire and forget (returns job)
90
+ job = await crawler.run_many(urls, wait=False)
91
+ print(f"Job ID: {job.id}")
92
+ ```
93
+
94
+ ### Configuration
95
+
96
+ ```python
97
+ from crawl4ai_cloud import CrawlerRunConfig, BrowserConfig
98
+
99
+ config = CrawlerRunConfig(
100
+ word_count_threshold=10,
101
+ exclude_external_links=True,
102
+ screenshot=True,
103
+ )
104
+
105
+ browser_config = BrowserConfig(
106
+ viewport_width=1920,
107
+ viewport_height=1080,
108
+ )
109
+
110
+ result = await crawler.run(
111
+ "https://example.com",
112
+ config=config,
113
+ browser_config=browser_config,
114
+ )
115
+ ```
116
+
117
+ ### Proxy Support
118
+
119
+ ```python
120
+ # Shorthand
121
+ result = await crawler.run(url, proxy="datacenter")
122
+ result = await crawler.run(url, proxy="residential")
123
+
124
+ # Full config
125
+ result = await crawler.run(url, proxy={
126
+ "mode": "residential",
127
+ "country": "US"
128
+ })
129
+ ```
130
+
131
+ ### Deep Crawl
132
+
133
+ ```python
134
+ result = await crawler.deep_crawl(
135
+ "https://docs.example.com",
136
+ strategy="bfs",
137
+ max_depth=2,
138
+ max_urls=50,
139
+ wait=True,
140
+ )
141
+ ```
142
+
143
+ ### Job Management
144
+
145
+ ```python
146
+ # List jobs
147
+ jobs = await crawler.list_jobs(status="completed", limit=10)
148
+
149
+ # Get job status
150
+ job = await crawler.get_job(job_id)
151
+
152
+ # Wait for job
153
+ job = await crawler.wait_job(job_id, poll_interval=2.0)
154
+
155
+ # Cancel job
156
+ await crawler.cancel_job(job_id)
157
+ ```
158
+
159
+ ## Migration from OSS
160
+
161
+ Zero learning curve — your existing code works:
162
+
163
+ ```python
164
+ # Before (OSS)
165
+ from crawl4ai import AsyncWebCrawler
166
+ async with AsyncWebCrawler() as crawler:
167
+     result = await crawler.arun(url)
168
+
169
+ # After (Cloud)
170
+ from crawl4ai_cloud import AsyncWebCrawler
171
+ async with AsyncWebCrawler(api_key="sk_live_...") as crawler:
172
+     result = await crawler.run(url)  # arun() also works!
173
+ ```
174
+
175
+ ## Environment Variables
176
+
177
+ ```bash
178
+ export CRAWL4AI_API_KEY=sk_live_...
179
+ ```
180
+
181
+ ```python
182
+ # API key auto-loaded from environment
183
+ crawler = AsyncWebCrawler()
184
+ ```
185
+
186
+ ## Error Handling
187
+
188
+ ```python
189
+ from crawl4ai_cloud import (
190
+ CloudError,
191
+ AuthenticationError,
192
+ RateLimitError,
193
+ QuotaExceededError,
194
+ NotFoundError,
195
+ )
196
+
197
+ try:
198
+     result = await crawler.run(url)
199
+ except AuthenticationError:
200
+     print("Invalid API key")
201
+ except RateLimitError as e:
202
+     print(f"Rate limited. Retry after {e.retry_after}s")
203
+ except QuotaExceededError:
204
+     print("Quota exceeded")
205
+ ```
206
+
207
+ ## Links
208
+
209
+ - [Cloud Dashboard](https://api.crawl4ai.com) - Sign up & get your API key
210
+ - [Cloud API Docs](https://api.crawl4ai.com/docs) - Full API reference
211
+ - [OSS Repository](https://github.com/unclecode/crawl4ai) - Self-hosted option
212
+ - [Discord](https://discord.gg/jP8KfhDhyN) - Community & support
213
+
214
+ ## License
215
+
216
+ Apache 2.0
@@ -0,0 +1,187 @@
1
+ # Crawl4AI Cloud SDK for Python
2
+
3
+ Lightweight Python SDK for [Crawl4AI Cloud](https://api.crawl4ai.com). Mirrors the OSS API exactly.
4
+
5
+ > **Note:** This SDK is for **Crawl4AI Cloud** (api.crawl4ai.com), the managed cloud service. For the self-hosted open-source version, see [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai).
6
+
7
+ [![PyPI version](https://badge.fury.io/py/crawl4ai-cloud-sdk.svg)](https://badge.fury.io/py/crawl4ai-cloud-sdk)
8
+ [![Python Version](https://img.shields.io/pypi/pyversions/crawl4ai-cloud-sdk)](https://pypi.org/project/crawl4ai-cloud-sdk/)
9
+
10
+ ## Installation
11
+
12
+ ```bash
13
+ # From PyPI
14
+ pip install crawl4ai-cloud-sdk
15
+
16
+ # From GitHub (available now)
17
+ pip install git+https://github.com/unclecode/crawl4ai-cloud-sdk.git#subdirectory=python
18
+ ```
19
+
20
+ ## Get Your API Key
21
+
22
+ 1. Go to [api.crawl4ai.com](https://api.crawl4ai.com)
23
+ 2. Sign up and get your API key
24
+
25
+ ## Quick Start
26
+
27
+ ```python
28
+ import asyncio
29
+ from crawl4ai_cloud import AsyncWebCrawler
30
+
31
+ async def main():
32
+     async with AsyncWebCrawler(api_key="sk_live_...") as crawler:
33
+         result = await crawler.run("https://example.com")
34
+         print(result.markdown.raw_markdown)
35
+
36
+ asyncio.run(main())
37
+ ```
38
+
39
+ ## Features
40
+
41
+ ### Single URL Crawl
42
+
43
+ ```python
44
+ result = await crawler.run("https://example.com")
45
+ print(result.success)
46
+ print(result.markdown.raw_markdown)
47
+ print(result.html)
48
+ ```
49
+
50
+ ### Batch Crawl
51
+
52
+ ```python
53
+ urls = ["https://example.com", "https://httpbin.org/html"]
54
+
55
+ # Wait for results
56
+ results = await crawler.run_many(urls, wait=True)
57
+ for r in results:
58
+     print(f"{r.url}: {r.success}")
59
+
60
+ # Fire and forget (returns job)
61
+ job = await crawler.run_many(urls, wait=False)
62
+ print(f"Job ID: {job.id}")
63
+ ```
64
+
65
+ ### Configuration
66
+
67
+ ```python
68
+ from crawl4ai_cloud import CrawlerRunConfig, BrowserConfig
69
+
70
+ config = CrawlerRunConfig(
71
+ word_count_threshold=10,
72
+ exclude_external_links=True,
73
+ screenshot=True,
74
+ )
75
+
76
+ browser_config = BrowserConfig(
77
+ viewport_width=1920,
78
+ viewport_height=1080,
79
+ )
80
+
81
+ result = await crawler.run(
82
+ "https://example.com",
83
+ config=config,
84
+ browser_config=browser_config,
85
+ )
86
+ ```
87
+
88
+ ### Proxy Support
89
+
90
+ ```python
91
+ # Shorthand
92
+ result = await crawler.run(url, proxy="datacenter")
93
+ result = await crawler.run(url, proxy="residential")
94
+
95
+ # Full config
96
+ result = await crawler.run(url, proxy={
97
+ "mode": "residential",
98
+ "country": "US"
99
+ })
100
+ ```
101
+
102
+ ### Deep Crawl
103
+
104
+ ```python
105
+ result = await crawler.deep_crawl(
106
+ "https://docs.example.com",
107
+ strategy="bfs",
108
+ max_depth=2,
109
+ max_urls=50,
110
+ wait=True,
111
+ )
112
+ ```
113
+
114
+ ### Job Management
115
+
116
+ ```python
117
+ # List jobs
118
+ jobs = await crawler.list_jobs(status="completed", limit=10)
119
+
120
+ # Get job status
121
+ job = await crawler.get_job(job_id)
122
+
123
+ # Wait for job
124
+ job = await crawler.wait_job(job_id, poll_interval=2.0)
125
+
126
+ # Cancel job
127
+ await crawler.cancel_job(job_id)
128
+ ```
129
+
130
+ ## Migration from OSS
131
+
132
+ Zero learning curve — your existing code works:
133
+
134
+ ```python
135
+ # Before (OSS)
136
+ from crawl4ai import AsyncWebCrawler
137
+ async with AsyncWebCrawler() as crawler:
138
+     result = await crawler.arun(url)
139
+
140
+ # After (Cloud)
141
+ from crawl4ai_cloud import AsyncWebCrawler
142
+ async with AsyncWebCrawler(api_key="sk_live_...") as crawler:
143
+     result = await crawler.run(url)  # arun() also works!
144
+ ```
145
+
146
+ ## Environment Variables
147
+
148
+ ```bash
149
+ export CRAWL4AI_API_KEY=sk_live_...
150
+ ```
151
+
152
+ ```python
153
+ # API key auto-loaded from environment
154
+ crawler = AsyncWebCrawler()
155
+ ```
156
+
157
+ ## Error Handling
158
+
159
+ ```python
160
+ from crawl4ai_cloud import (
161
+ CloudError,
162
+ AuthenticationError,
163
+ RateLimitError,
164
+ QuotaExceededError,
165
+ NotFoundError,
166
+ )
167
+
168
+ try:
169
+     result = await crawler.run(url)
170
+ except AuthenticationError:
171
+     print("Invalid API key")
172
+ except RateLimitError as e:
173
+     print(f"Rate limited. Retry after {e.retry_after}s")
174
+ except QuotaExceededError:
175
+     print("Quota exceeded")
176
+ ```
177
+
178
+ ## Links
179
+
180
+ - [Cloud Dashboard](https://api.crawl4ai.com) - Sign up & get your API key
181
+ - [Cloud API Docs](https://api.crawl4ai.com/docs) - Full API reference
182
+ - [OSS Repository](https://github.com/unclecode/crawl4ai) - Self-hosted option
183
+ - [Discord](https://discord.gg/jP8KfhDhyN) - Community & support
184
+
185
+ ## License
186
+
187
+ Apache 2.0
@@ -0,0 +1,100 @@
1
+ """
2
+ Crawl4AI Cloud SDK - Lightweight cloud client for Crawl4AI API.
3
+
4
+ Example:
5
+ ```python
6
+ from crawl4ai_cloud import AsyncWebCrawler, CrawlerRunConfig
7
+
8
+ async with AsyncWebCrawler(api_key="sk_live_xxx") as crawler:
9
+     result = await crawler.run("https://example.com")
10
+     print(result.markdown.raw_markdown)
11
+ ```
12
+ """
13
+
14
+ __version__ = "0.2.0"
15
+
16
+ # Main crawler class
17
+ from .crawler import AsyncWebCrawler
18
+
19
+ # Configuration classes
20
+ from .configs import (
21
+ CrawlerRunConfig,
22
+ BrowserConfig,
23
+ build_crawl_request,
24
+ sanitize_crawler_config,
25
+ sanitize_browser_config,
26
+ normalize_proxy,
27
+ normalize_url,
28
+ )
29
+
30
+ # Response models
31
+ from .models import (
32
+ CrawlResult,
33
+ CrawlJob,
34
+ JobProgress,
35
+ MarkdownResult,
36
+ DeepCrawlResult,
37
+ ScanUrlInfo,
38
+ ContextResult,
39
+ GeneratedSchema,
40
+ StorageUsage,
41
+ ProxyConfig,
42
+ LLMUsage,
43
+ # Usage metrics
44
+ Usage,
45
+ CrawlUsageMetrics,
46
+ LLMUsageMetrics,
47
+ StorageUsageMetrics,
48
+ )
49
+
50
+ # Errors
51
+ from .errors import (
52
+ CloudError,
53
+ AuthenticationError,
54
+ RateLimitError,
55
+ QuotaExceededError,
56
+ NotFoundError,
57
+ ValidationError,
58
+ TimeoutError,
59
+ ServerError,
60
+ )
61
+
62
+ __all__ = [
63
+ # Version
64
+ "__version__",
65
+ # Main class
66
+ "AsyncWebCrawler",
67
+ # Configs
68
+ "CrawlerRunConfig",
69
+ "BrowserConfig",
70
+ "build_crawl_request",
71
+ "sanitize_crawler_config",
72
+ "sanitize_browser_config",
73
+ "normalize_proxy",
74
+ "normalize_url",
75
+ # Models
76
+ "CrawlResult",
77
+ "CrawlJob",
78
+ "JobProgress",
79
+ "MarkdownResult",
80
+ "DeepCrawlResult",
81
+ "ScanUrlInfo",
82
+ "ContextResult",
83
+ "GeneratedSchema",
84
+ "StorageUsage",
85
+ "ProxyConfig",
86
+ "LLMUsage",
87
+ "Usage",
88
+ "CrawlUsageMetrics",
89
+ "LLMUsageMetrics",
90
+ "StorageUsageMetrics",
91
+ # Errors
92
+ "CloudError",
93
+ "AuthenticationError",
94
+ "RateLimitError",
95
+ "QuotaExceededError",
96
+ "NotFoundError",
97
+ "ValidationError",
98
+ "TimeoutError",
99
+ "ServerError",
100
+ ]
@@ -0,0 +1,190 @@
1
+ """Internal HTTP client for Crawl4AI Cloud SDK."""
2
+ import asyncio
3
+ import os
4
+ from typing import Optional, Dict, Any
5
+
6
+ import httpx
7
+
8
+ from .errors import (
9
+ CloudError,
10
+ AuthenticationError,
11
+ RateLimitError,
12
+ QuotaExceededError,
13
+ NotFoundError,
14
+ ValidationError,
15
+ ServerError,
16
+ TimeoutError,
17
+ )
18
+
19
# Version reported in the User-Agent header. Kept in step with the package
# version (0.2.0 in the distribution metadata and the package __init__); it is
# duplicated here rather than imported because the package __init__ imports
# the crawler module, which presumably depends on this one (TODO confirm).
__version__ = "0.2.0"

# Defaults used by HTTPClient when the caller does not override them.
DEFAULT_BASE_URL = "https://api.crawl4ai.com"
DEFAULT_TIMEOUT = 120.0  # seconds
DEFAULT_MAX_RETRIES = 3  # total attempts for transient failures
24
+
25
+
26
class HTTPClient:
    """Internal async HTTP client with retries and error mapping.

    Wraps a lazily created ``httpx.AsyncClient`` that sends the API key in
    the ``X-API-Key`` header. Responses with a status code >= 400 are
    translated into the SDK's exception classes. 5xx responses (except 504),
    client-side timeouts and other ``httpx.RequestError`` failures are
    retried with exponential backoff (1s, 2s, 4s, ...).
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        base_url: str = DEFAULT_BASE_URL,
        timeout: float = DEFAULT_TIMEOUT,
        max_retries: int = DEFAULT_MAX_RETRIES,
    ):
        """
        Initialize the HTTP client.

        Args:
            api_key: Your Crawl4AI API key (sk_live_* or sk_test_*).
                     If not provided, reads from CRAWL4AI_API_KEY env var.
            base_url: API base URL (default: https://api.crawl4ai.com)
            timeout: Request timeout in seconds (default: 120)
            max_retries: Max attempts for transient errors (default: 3).
                         At least one attempt is always made.

        Raises:
            ValueError: If API key is missing or has invalid format
        """
        self._api_key = api_key or os.getenv("CRAWL4AI_API_KEY")

        if not self._api_key:
            raise ValueError(
                "API key is required. Provide it as an argument or set "
                "the CRAWL4AI_API_KEY environment variable."
            )

        if not self._api_key.startswith(("sk_live_", "sk_test_")):
            raise ValueError(
                "Invalid API key format. Expected sk_live_* or sk_test_*"
            )

        self._base_url = base_url.rstrip("/")
        self._timeout = timeout
        self._max_retries = max_retries
        # Created on first use by _get_client(); reset to None by close().
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the underlying client, creating it if missing or closed."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                base_url=self._base_url,
                headers={
                    "X-API-Key": self._api_key,
                    "Content-Type": "application/json",
                    "User-Agent": f"crawl4ai-cloud/{__version__}",
                },
                timeout=httpx.Timeout(self._timeout),
            )
        return self._client

    @staticmethod
    def _parse_error(response: httpx.Response) -> tuple:
        """Extract ``(detail, error_data)`` from an error response.

        ``error_data`` is the decoded JSON body when it is a JSON object,
        otherwise ``{}``. ``detail`` is the body's ``"detail"`` field (or the
        stringified body if that field is absent); for non-JSON or non-object
        bodies it falls back to the raw text, then to ``"HTTP <status>"``.
        """
        try:
            error_data = response.json()
        except ValueError:  # body is not valid JSON (or not decodable text)
            error_data = None

        if isinstance(error_data, dict):
            return error_data.get("detail", str(error_data)), error_data
        return response.text or f"HTTP {response.status_code}", {}

    async def request(
        self,
        method: str,
        path: str,
        params: Optional[Dict[str, Any]] = None,
        json: Optional[Dict[str, Any]] = None,
        timeout: Optional[float] = None,
    ) -> Dict[str, Any]:
        """
        Make HTTP request with error handling and retries.

        Args:
            method: HTTP method (GET, POST, DELETE, etc.)
            path: API endpoint path
            params: Query parameters
            json: JSON body
            timeout: Request timeout override; a falsy value (None or 0)
                     falls back to the client-level timeout

        Returns:
            Parsed JSON response, or ``{}`` when the response has no body

        Raises:
            AuthenticationError: 401 - Invalid API key
            NotFoundError: 404 - Resource not found
            RateLimitError: 429 - Rate limit exceeded
            QuotaExceededError: 429 - Quota exceeded
            ValidationError: 400 - Invalid request
            TimeoutError: 504 or client timeout (the SDK's TimeoutError
                          imported from .errors, not the builtin)
            ServerError: other 5xx - Server error, after retries
            CloudError: Other errors, including transport failures
        """
        client = await self._get_client()

        # Always make at least one attempt: with max_retries <= 0 the loop
        # would otherwise never run and no request would be sent at all.
        attempts = max(1, self._max_retries)

        for attempt in range(attempts):
            can_retry = attempt < attempts - 1

            # Only the network call lives inside the try, so the handlers
            # below cannot mask errors raised while processing the response.
            try:
                response = await client.request(
                    method,
                    path,
                    params=params,
                    json=json,
                    timeout=timeout or self._timeout,
                )
            except httpx.TimeoutException as e:
                # Must precede RequestError: TimeoutException is a subclass.
                if can_retry:
                    await asyncio.sleep(2 ** attempt)
                    continue
                raise TimeoutError(f"Request timed out: {e}") from e
            except httpx.RequestError as e:
                if can_retry:
                    await asyncio.sleep(2 ** attempt)
                    continue
                raise CloudError(f"Request failed: {e}") from e

            status = response.status_code

            # Success
            if status < 400:
                return response.json() if response.content else {}

            # Transient server errors are retried; 504 is reported
            # immediately as a timeout instead.
            if status >= 500 and status != 504 and can_retry:
                await asyncio.sleep(2 ** attempt)
                continue

            detail, error_data = self._parse_error(response)
            headers = {k.lower(): v for k, v in response.headers.items()}

            # Map status codes to exceptions
            if status == 401:
                raise AuthenticationError(detail, 401, error_data, headers)
            if status == 404:
                raise NotFoundError(detail, 404, error_data, headers)
            if status == 429:
                # `detail` comes from the JSON body and may not be a string;
                # coerce before the substring check to avoid AttributeError.
                if "rate limit" in str(detail).lower():
                    raise RateLimitError(detail, 429, error_data, headers)
                raise QuotaExceededError(detail, 429, error_data, headers)
            if status == 400:
                raise ValidationError(detail, 400, error_data, headers)
            if status == 504:
                raise TimeoutError(detail, 504, error_data, headers)
            if status >= 500:
                raise ServerError(detail, status, error_data, headers)
            raise CloudError(detail, status, error_data, headers)

        # Not reachable (the last attempt always returns or raises); kept as
        # a defensive fallback so the function never implicitly returns None.
        raise CloudError("Max retries exceeded")

    async def close(self):
        """Close the HTTP client."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()
            self._client = None

    async def __aenter__(self) -> "HTTPClient":
        """Enter the async context; the client itself is created lazily."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the underlying client when leaving the async context."""
        await self.close()