firecrawl 2.16.5__py3-none-any.whl → 3.0.3__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of firecrawl may be problematic.
- firecrawl/__init__.py +27 -19
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +183 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
- firecrawl/__tests__/e2e/v2/test_map.py +60 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
- firecrawl/__tests__/e2e/v2/test_search.py +265 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +206 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +241 -0
- firecrawl/{firecrawl.py → firecrawl.backup.py} +17 -15
- firecrawl/types.py +157 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/v1/client.py +4653 -0
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +802 -0
- firecrawl/v2/client_async.py +250 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/batch.py +85 -0
- firecrawl/v2/methods/aio/crawl.py +174 -0
- firecrawl/v2/methods/aio/extract.py +126 -0
- firecrawl/v2/methods/aio/map.py +59 -0
- firecrawl/v2/methods/aio/scrape.py +36 -0
- firecrawl/v2/methods/aio/search.py +58 -0
- firecrawl/v2/methods/aio/usage.py +42 -0
- firecrawl/v2/methods/batch.py +420 -0
- firecrawl/v2/methods/crawl.py +468 -0
- firecrawl/v2/methods/extract.py +131 -0
- firecrawl/v2/methods/map.py +77 -0
- firecrawl/v2/methods/scrape.py +68 -0
- firecrawl/v2/methods/search.py +173 -0
- firecrawl/v2/methods/usage.py +41 -0
- firecrawl/v2/types.py +546 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +153 -0
- firecrawl/v2/utils/http_client_async.py +64 -0
- firecrawl/v2/utils/validation.py +324 -0
- firecrawl/v2/watcher.py +312 -0
- firecrawl/v2/watcher_async.py +245 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/LICENSE +0 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/METADATA +49 -32
- firecrawl-3.0.3.dist-info/RECORD +78 -0
- tests/test_timeout_conversion.py +117 -0
- firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -465
- firecrawl-2.16.5.dist-info/RECORD +0 -12
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/WHEEL +0 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/top_level.txt +0 -0
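
The headline change in 3.x is the new v2 surface: a synchronous client (firecrawl/v2/client.py), a true-async client (firecrawl/v2/client_async.py) with per-method aio modules, watcher helpers, and the previous client preserved under firecrawl/v1/ and firecrawl/firecrawl.backup.py. A minimal usage sketch of the async client, assuming the wheel is installed and FIRECRAWL_API_KEY is set; the URL is a placeholder, and the class is imported from the module path shown in the diff rather than any top-level re-export:

import asyncio

from firecrawl.v2.client_async import AsyncFirecrawlClient


async def main() -> None:
    # api_key is read from FIRECRAWL_API_KEY when not passed explicitly
    client = AsyncFirecrawlClient()
    doc = await client.scrape("https://example.com")
    print(doc)


asyncio.run(main())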
firecrawl/v2/client_async.py
@@ -0,0 +1,250 @@
+"""
+Async v2 client mirroring the regular client surface using true async HTTP transport.
+"""
+
+import os
+import asyncio
+from typing import Optional, List, Dict, Any, Union, Callable, Literal
+from .types import (
+    ScrapeOptions,
+    CrawlRequest,
+    WebhookConfig,
+    SearchRequest,
+    SearchData,
+    SourceOption,
+    CrawlResponse,
+    CrawlJob,
+    CrawlParamsRequest,
+    CrawlParamsData,
+    CrawlErrorsResponse,
+    ActiveCrawlsResponse,
+    MapOptions,
+    MapData,
+    FormatOption,
+    WaitAction,
+    ScreenshotAction,
+    ClickAction,
+    WriteAction,
+    PressAction,
+    ScrollAction,
+    ScrapeAction,
+    ExecuteJavascriptAction,
+    PDFAction,
+    Location,
+)
+from .utils.http_client import HttpClient
+from .utils.http_client_async import AsyncHttpClient
+
+from .methods.aio import scrape as async_scrape  # type: ignore[attr-defined]
+from .methods.aio import batch as async_batch  # type: ignore[attr-defined]
+from .methods.aio import crawl as async_crawl  # type: ignore[attr-defined]
+from .methods.aio import search as async_search  # type: ignore[attr-defined]
+from .methods.aio import map as async_map  # type: ignore[attr-defined]
+from .methods.aio import usage as async_usage  # type: ignore[attr-defined]
+from .methods.aio import extract as async_extract  # type: ignore[attr-defined]
+
+from .watcher_async import AsyncWatcher
+
+class AsyncFirecrawlClient:
+    def __init__(self, api_key: Optional[str] = None, api_url: str = "https://api.firecrawl.dev"):
+        if api_key is None:
+            api_key = os.getenv("FIRECRAWL_API_KEY")
+        if not api_key:
+            raise ValueError("API key is required. Set FIRECRAWL_API_KEY or pass api_key.")
+        self.http_client = HttpClient(api_key, api_url)
+        self.async_http_client = AsyncHttpClient(api_key, api_url)
+
+    # Scrape
+    async def scrape(
+        self,
+        url: str,
+        **kwargs,
+    ):
+        options = ScrapeOptions(**{k: v for k, v in kwargs.items() if v is not None}) if kwargs else None
+        return await async_scrape.scrape(self.async_http_client, url, options)
+
+    # Search
+    async def search(
+        self,
+        query: str,
+        **kwargs,
+    ) -> SearchData:
+        request = SearchRequest(query=query, **{k: v for k, v in kwargs.items() if v is not None})
+        return await async_search.search(self.async_http_client, request)
+
+    async def start_crawl(self, url: str, **kwargs) -> CrawlResponse:
+        request = CrawlRequest(url=url, **kwargs)
+        return await async_crawl.start_crawl(self.async_http_client, request)
+
+    async def wait_crawl(self, job_id: str, poll_interval: int = 2, timeout: Optional[int] = None) -> CrawlJob:
+        # simple polling loop using blocking get (ok for test-level async)
+        start = asyncio.get_event_loop().time()
+        while True:
+            status = await async_crawl.get_crawl_status(self.async_http_client, job_id)
+            if status.status in ["completed", "failed"]:
+                return status
+            if timeout and (asyncio.get_event_loop().time() - start) > timeout:
+                raise TimeoutError("Crawl wait timed out")
+            await asyncio.sleep(poll_interval)
+
+    async def crawl(self, **kwargs) -> CrawlJob:
+        # wrapper combining start and wait
+        resp = await self.start_crawl(**{k: v for k, v in kwargs.items() if k not in ("poll_interval", "timeout")})
+        poll_interval = kwargs.get("poll_interval", 2)
+        timeout = kwargs.get("timeout")
+        return await self.wait_crawl(resp.id, poll_interval=poll_interval, timeout=timeout)
+
+    async def get_crawl_status(self, job_id: str) -> CrawlJob:
+        return await async_crawl.get_crawl_status(self.async_http_client, job_id)
+
+    async def cancel_crawl(self, job_id: str) -> bool:
+        return await async_crawl.cancel_crawl(self.async_http_client, job_id)
+
+    async def crawl_params_preview(self, url: str, prompt: str) -> CrawlParamsData:
+        req = CrawlParamsRequest(url=url, prompt=prompt)
+        return await async_crawl.crawl_params_preview(self.async_http_client, req)
+
+    async def get_crawl_errors(self, crawl_id: str) -> CrawlErrorsResponse:
+        return await async_crawl.get_crawl_errors(self.async_http_client, crawl_id)
+
+    async def get_active_crawls(self) -> ActiveCrawlsResponse:
+        return await async_crawl.get_active_crawls(self.async_http_client)
+
+    async def active_crawls(self) -> ActiveCrawlsResponse:
+        return await self.get_active_crawls()
+
+    # Map
+    async def map(
+        self,
+        url: str,
+        *,
+        search: Optional[str] = None,
+        include_subdomains: Optional[bool] = None,
+        limit: Optional[int] = None,
+        sitemap: Optional[Literal["only", "include", "skip"]] = None,
+        timeout: Optional[int] = None,
+    ) -> MapData:
+        options = MapOptions(
+            search=search,
+            include_subdomains=include_subdomains,
+            limit=limit,
+            sitemap=sitemap if sitemap is not None else "include",
+            timeout=timeout,
+        ) if any(v is not None for v in [search, include_subdomains, limit, sitemap, timeout]) else None
+        return await async_map.map(self.async_http_client, url, options)
+
+    async def start_batch_scrape(self, urls: List[str], **kwargs) -> Any:
+        return await async_batch.start_batch_scrape(self.async_http_client, urls, **kwargs)
+
+    async def wait_batch_scrape(self, job_id: str, poll_interval: int = 2, timeout: Optional[int] = None) -> Any:
+        start = asyncio.get_event_loop().time()
+        while True:
+            status = await async_batch.get_batch_scrape_status(self.async_http_client, job_id)
+            if status.status in ["completed", "failed", "cancelled"]:
+                return status
+            if timeout and (asyncio.get_event_loop().time() - start) > timeout:
+                raise TimeoutError("Batch wait timed out")
+            await asyncio.sleep(poll_interval)
+
+    async def batch_scrape(self, urls: List[str], **kwargs) -> Any:
+        # waiter wrapper
+        start = await self.start_batch_scrape(urls, **{k: v for k, v in kwargs.items() if k not in ("poll_interval", "timeout")})
+        job_id = start.id
+        poll_interval = kwargs.get("poll_interval", 2)
+        timeout = kwargs.get("timeout")
+        return await self.wait_batch_scrape(job_id, poll_interval=poll_interval, timeout=timeout)
+
+    async def get_batch_scrape_status(self, job_id: str):
+        return await async_batch.get_batch_scrape_status(self.async_http_client, job_id)
+
+    async def cancel_batch_scrape(self, job_id: str) -> bool:
+        return await async_batch.cancel_batch_scrape(self.async_http_client, job_id)
+
+    async def get_batch_scrape_errors(self, job_id: str) -> CrawlErrorsResponse:
+        # Returns v2 errors structure; typed as CrawlErrorsResponse for parity
+        return await async_batch.get_batch_scrape_errors(self.async_http_client, job_id)  # type: ignore[return-value]
+
+    # Extract (proxy to v1 async)
+    async def extract(
+        self,
+        urls: Optional[List[str]] = None,
+        *,
+        prompt: Optional[str] = None,
+        schema: Optional[Dict[str, Any]] = None,
+        system_prompt: Optional[str] = None,
+        allow_external_links: Optional[bool] = None,
+        enable_web_search: Optional[bool] = None,
+        show_sources: Optional[bool] = None,
+        scrape_options: Optional['ScrapeOptions'] = None,
+        ignore_invalid_urls: Optional[bool] = None,
+        poll_interval: int = 2,
+        timeout: Optional[int] = None,
+    ):
+        return await async_extract.extract(
+            self.async_http_client,
+            urls,
+            prompt=prompt,
+            schema=schema,
+            system_prompt=system_prompt,
+            allow_external_links=allow_external_links,
+            enable_web_search=enable_web_search,
+            show_sources=show_sources,
+            scrape_options=scrape_options,
+            ignore_invalid_urls=ignore_invalid_urls,
+            poll_interval=poll_interval,
+            timeout=timeout,
+        )
+
+    async def get_extract_status(self, job_id: str):
+        return await async_extract.get_extract_status(self.async_http_client, job_id)
+
+    async def start_extract(
+        self,
+        urls: Optional[List[str]] = None,
+        *,
+        prompt: Optional[str] = None,
+        schema: Optional[Dict[str, Any]] = None,
+        system_prompt: Optional[str] = None,
+        allow_external_links: Optional[bool] = None,
+        enable_web_search: Optional[bool] = None,
+        show_sources: Optional[bool] = None,
+        scrape_options: Optional['ScrapeOptions'] = None,
+        ignore_invalid_urls: Optional[bool] = None,
+    ):
+        return await async_extract.start_extract(
+            self.async_http_client,
+            urls,
+            prompt=prompt,
+            schema=schema,
+            system_prompt=system_prompt,
+            allow_external_links=allow_external_links,
+            enable_web_search=enable_web_search,
+            show_sources=show_sources,
+            scrape_options=scrape_options,
+            ignore_invalid_urls=ignore_invalid_urls,
+        )
+
+    # Usage endpoints
+    async def get_concurrency(self):
+        from .methods.aio import usage as async_usage  # type: ignore[attr-defined]
+        return await async_usage.get_concurrency(self.async_http_client)
+
+    async def get_credit_usage(self):
+        from .methods.aio import usage as async_usage  # type: ignore[attr-defined]
+        return await async_usage.get_credit_usage(self.async_http_client)
+
+    async def get_token_usage(self):
+        from .methods.aio import usage as async_usage  # type: ignore[attr-defined]
+        return await async_usage.get_token_usage(self.async_http_client)
+
+    # Watcher (sync object usable from async contexts)
+    def watcher(
+        self,
+        job_id: str,
+        *,
+        kind: Literal["crawl", "batch"] = "crawl",
+        poll_interval: int = 2,
+        timeout: Optional[int] = None,
+    ) -> AsyncWatcher:
+        return AsyncWatcher(self, job_id, kind=kind, poll_interval=poll_interval, timeout=timeout)
+
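
A sketch of driving a crawl with the client above, using only methods shown in this file (start_crawl, get_crawl_status, crawl). The URL, include_paths value, and timeouts are placeholders; include_paths is assumed to be a valid CrawlRequest field, inferred from the field mapping in firecrawl/v2/methods/aio/crawl.py further down:

import asyncio

from firecrawl.v2.client_async import AsyncFirecrawlClient


async def main() -> None:
    client = AsyncFirecrawlClient(api_key="fc-...")  # placeholder key

    # Start a job and poll it manually
    started = await client.start_crawl("https://example.com", include_paths=["/blog/*"])
    status = await client.get_crawl_status(started.id)
    print(status.status, status.completed, "/", status.total)

    # Or use the start-and-wait wrapper; extra kwargs are forwarded to start_crawl
    job = await client.crawl(url="https://example.com", poll_interval=2, timeout=120)
    print(len(job.data), "documents")


asyncio.run(main())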
firecrawl/v2/methods/aio/__init__.py
@@ -0,0 +1 @@
+# Async (aio) method modules for v2
firecrawl/v2/methods/aio/batch.py
@@ -0,0 +1,85 @@
+from typing import Optional, List, Dict, Any
+from ...types import ScrapeOptions, WebhookConfig, Document, BatchScrapeResponse, BatchScrapeJob
+from ...utils.http_client_async import AsyncHttpClient
+from ...utils.validation import prepare_scrape_options
+from ...utils.error_handler import handle_response_error
+
+
+def _prepare(urls: List[str], *, options: Optional[ScrapeOptions] = None, **kwargs) -> Dict[str, Any]:
+    if not urls:
+        raise ValueError("URLs list cannot be empty")
+    payload: Dict[str, Any] = {"urls": [u.strip() for u in urls]}
+    if options:
+        opts = prepare_scrape_options(options)
+        if opts:
+            payload.update(opts)
+    if (w := kwargs.get("webhook")) is not None:
+        payload["webhook"] = w if isinstance(w, str) else w.model_dump(exclude_none=True)
+    if (v := kwargs.get("append_to_id")) is not None:
+        payload["appendToId"] = v
+    if (v := kwargs.get("ignore_invalid_urls")) is not None:
+        payload["ignoreInvalidURLs"] = v
+    if (v := kwargs.get("max_concurrency")) is not None:
+        payload["maxConcurrency"] = v
+    if (v := kwargs.get("zero_data_retention")) is not None:
+        payload["zeroDataRetention"] = v
+    if (v := kwargs.get("integration")) is not None:
+        payload["integration"] = v
+    return payload
+
+
+async def start_batch_scrape(client: AsyncHttpClient, urls: List[str], **kwargs) -> BatchScrapeResponse:
+    payload = _prepare(urls, **kwargs)
+    response = await client.post("/v2/batch/scrape", payload)
+    if response.status_code >= 400:
+        handle_response_error(response, "start batch scrape")
+    body = response.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error occurred"))
+    return BatchScrapeResponse(id=body.get("id"), url=body.get("url"), invalid_urls=body.get("invalidURLs"))
+
+
+async def get_batch_scrape_status(client: AsyncHttpClient, job_id: str) -> BatchScrapeJob:
+    response = await client.get(f"/v2/batch/scrape/{job_id}")
+    if response.status_code >= 400:
+        handle_response_error(response, "get batch scrape status")
+    body = response.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error occurred"))
+    docs: List[Document] = []
+    for doc in body.get("data", []) or []:
+        if isinstance(doc, dict):
+            normalized = dict(doc)
+            if 'rawHtml' in normalized and 'raw_html' not in normalized:
+                normalized['raw_html'] = normalized.pop('rawHtml')
+            if 'changeTracking' in normalized and 'change_tracking' not in normalized:
+                normalized['change_tracking'] = normalized.pop('changeTracking')
+            docs.append(Document(**normalized))
+    return BatchScrapeJob(
+        status=body.get("status"),
+        completed=body.get("completed", 0),
+        total=body.get("total", 0),
+        credits_used=body.get("creditsUsed"),
+        expires_at=body.get("expiresAt"),
+        next=body.get("next"),
+        data=docs,
+    )
+
+
+async def cancel_batch_scrape(client: AsyncHttpClient, job_id: str) -> bool:
+    response = await client.delete(f"/v2/batch/scrape/{job_id}")
+    if response.status_code >= 400:
+        handle_response_error(response, "cancel batch scrape")
+    body = response.json()
+    return body.get("status") == "cancelled"
+
+
+async def get_batch_scrape_errors(client: AsyncHttpClient, job_id: str) -> Dict[str, Any]:
+    response = await client.get(f"/v2/batch/scrape/{job_id}/errors")
+    if response.status_code >= 400:
+        handle_response_error(response, "get batch scrape errors")
+    body = response.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error occurred"))
+    return body
+
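
The _prepare helper above maps snake_case keyword arguments to the camelCase keys the /v2/batch/scrape endpoint expects (ignore_invalid_urls → ignoreInvalidURLs, max_concurrency → maxConcurrency, and so on). A sketch of the same flow through the client wrapper from firecrawl/v2/client_async.py; the URLs and limits are placeholders:

import asyncio

from firecrawl.v2.client_async import AsyncFirecrawlClient


async def main() -> None:
    client = AsyncFirecrawlClient()
    # batch_scrape() starts the job, then polls get_batch_scrape_status until it
    # reports completed, failed, or cancelled; a TimeoutError is raised if the
    # timeout elapses first
    job = await client.batch_scrape(
        ["https://example.com", "https://example.org"],
        ignore_invalid_urls=True,
        max_concurrency=2,
        poll_interval=2,
        timeout=300,
    )
    print(job.status, job.completed, "/", job.total)


asyncio.run(main())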
firecrawl/v2/methods/aio/crawl.py
@@ -0,0 +1,174 @@
+from typing import Optional, Dict, Any
+from ...types import (
+    CrawlRequest,
+    CrawlJob,
+    CrawlResponse,
+    Document,
+    CrawlParamsRequest,
+    CrawlParamsData,
+    WebhookConfig,
+    CrawlErrorsResponse,
+    ActiveCrawlsResponse,
+    ActiveCrawl,
+)
+from ...utils.error_handler import handle_response_error
+from ...utils.validation import prepare_scrape_options
+from ...utils.http_client_async import AsyncHttpClient
+
+
+def _prepare_crawl_request(request: CrawlRequest) -> dict:
+    if not request.url or not request.url.strip():
+        raise ValueError("URL cannot be empty")
+    data = {"url": request.url}
+    if request.prompt:
+        data["prompt"] = request.prompt
+    if request.scrape_options is not None:
+        opts = prepare_scrape_options(request.scrape_options)
+        if opts:
+            data["scrapeOptions"] = opts
+    # Webhook conversion
+    if request.webhook is not None:
+        if isinstance(request.webhook, str):
+            data["webhook"] = request.webhook
+        else:
+            data["webhook"] = request.webhook.model_dump(exclude_none=True)
+    request_data = request.model_dump(exclude_none=True, exclude_unset=True)
+    request_data.pop("url", None)
+    request_data.pop("prompt", None)
+    request_data.pop("scrape_options", None)
+    field_mappings = {
+        "include_paths": "includePaths",
+        "exclude_paths": "excludePaths",
+        "max_discovery_depth": "maxDiscoveryDepth",
+        "ignore_sitemap": "ignoreSitemap",
+        "ignore_query_parameters": "ignoreQueryParameters",
+        "crawl_entire_domain": "crawlEntireDomain",
+        "allow_external_links": "allowExternalLinks",
+        "allow_subdomains": "allowSubdomains",
+        "delay": "delay",
+        "max_concurrency": "maxConcurrency",
+        "zero_data_retention": "zeroDataRetention",
+    }
+    for snake, camel in field_mappings.items():
+        if snake in request_data:
+            data[camel] = request_data.pop(snake)
+    data.update(request_data)
+    return data
+
+
+async def start_crawl(client: AsyncHttpClient, request: CrawlRequest) -> CrawlResponse:
+    payload = _prepare_crawl_request(request)
+    response = await client.post("/v2/crawl", payload)
+    if response.status_code >= 400:
+        handle_response_error(response, "start crawl")
+    body = response.json()
+    if body.get("success"):
+        return CrawlResponse(id=body.get("id"), url=body.get("url"))
+    raise Exception(body.get("error", "Unknown error occurred"))
+
+
+async def get_crawl_status(client: AsyncHttpClient, job_id: str) -> CrawlJob:
+    response = await client.get(f"/v2/crawl/{job_id}")
+    if response.status_code >= 400:
+        handle_response_error(response, "get crawl status")
+    body = response.json()
+    if body.get("success"):
+        documents = []
+        for doc_data in body.get("data", []):
+            if isinstance(doc_data, dict):
+                normalized = dict(doc_data)
+                if 'rawHtml' in normalized and 'raw_html' not in normalized:
+                    normalized['raw_html'] = normalized.pop('rawHtml')
+                if 'changeTracking' in normalized and 'change_tracking' not in normalized:
+                    normalized['change_tracking'] = normalized.pop('changeTracking')
+                documents.append(Document(**normalized))
+        return CrawlJob(
+            status=body.get("status"),
+            completed=body.get("completed", 0),
+            total=body.get("total", 0),
+            credits_used=body.get("creditsUsed", 0),
+            expires_at=body.get("expiresAt"),
+            next=body.get("next"),
+            data=documents,
+        )
+    raise Exception(body.get("error", "Unknown error occurred"))
+
+
+async def cancel_crawl(client: AsyncHttpClient, job_id: str) -> bool:
+    response = await client.delete(f"/v2/crawl/{job_id}")
+    if response.status_code >= 400:
+        handle_response_error(response, "cancel crawl")
+    body = response.json()
+    return body.get("status") == "cancelled"
+
+
+async def crawl_params_preview(client: AsyncHttpClient, request: CrawlParamsRequest) -> CrawlParamsData:
+    if not request.url or not request.url.strip():
+        raise ValueError("URL cannot be empty")
+    if not request.prompt or not request.prompt.strip():
+        raise ValueError("Prompt cannot be empty")
+    payload = {"url": request.url, "prompt": request.prompt}
+    response = await client.post("/v2/crawl/params-preview", payload)
+    if response.status_code >= 400:
+        handle_response_error(response, "crawl params preview")
+    body = response.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error occurred"))
+    params_data = body.get("data", {})
+    converted: Dict[str, Any] = {}
+    mapping = {
+        "includePaths": "include_paths",
+        "excludePaths": "exclude_paths",
+        "maxDiscoveryDepth": "max_discovery_depth",
+        "ignoreSitemap": "ignore_sitemap",
+        "ignoreQueryParameters": "ignore_query_parameters",
+        "crawlEntireDomain": "crawl_entire_domain",
+        "allowExternalLinks": "allow_external_links",
+        "allowSubdomains": "allow_subdomains",
+        "maxConcurrency": "max_concurrency",
+        "scrapeOptions": "scrape_options",
+        "zeroDataRetention": "zero_data_retention",
+    }
+    for camel, snake in mapping.items():
+        if camel in params_data:
+            converted[snake] = params_data[camel]
+    if "webhook" in params_data:
+        wk = params_data["webhook"]
+        converted["webhook"] = wk
+    if "warning" in body:
+        converted["warning"] = body["warning"]
+    return CrawlParamsData(**converted)
+
+
+async def get_crawl_errors(client: AsyncHttpClient, crawl_id: str) -> CrawlErrorsResponse:
+    response = await client.get(f"/v2/crawl/{crawl_id}/errors")
+    if response.status_code >= 400:
+        handle_response_error(response, "check crawl errors")
+    body = response.json()
+    payload = body.get("data", body)
+    normalized = {
+        "errors": payload.get("errors", []),
+        "robots_blocked": payload.get("robotsBlocked", payload.get("robots_blocked", [])),
+    }
+    return CrawlErrorsResponse(**normalized)
+
+
+async def get_active_crawls(client: AsyncHttpClient) -> ActiveCrawlsResponse:
+    response = await client.get("/v2/crawl/active")
+    if response.status_code >= 400:
+        handle_response_error(response, "get active crawls")
+    body = response.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error occurred"))
+    crawls_in = body.get("crawls", [])
+    normalized = []
+    for c in crawls_in:
+        if isinstance(c, dict):
+            normalized.append({
+                "id": c.get("id"),
+                "team_id": c.get("teamId", c.get("team_id")),
+                "url": c.get("url"),
+                "options": c.get("options"),
+            })
+    return ActiveCrawlsResponse(success=True, crawls=[ActiveCrawl(**nc) for nc in normalized])
+
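
_prepare_crawl_request and crawl_params_preview are two directions of the same mapping: outbound snake_case CrawlRequest fields become camelCase /v2/crawl keys, and the params-preview response is converted back to snake_case. A rough illustration of the outbound direction, assuming CrawlRequest accepts the snake_case fields listed in field_mappings (the actual model lives in firecrawl/v2/types.py) and calling the module-internal helper directly:

from firecrawl.v2.methods.aio.crawl import _prepare_crawl_request
from firecrawl.v2.types import CrawlRequest

req = CrawlRequest(
    url="https://example.com",
    include_paths=["/docs/*"],
    max_discovery_depth=2,
    crawl_entire_domain=False,
)

# Expected shape (illustrative): {"url": "...", "includePaths": ["/docs/*"],
#   "maxDiscoveryDepth": 2, "crawlEntireDomain": False}
print(_prepare_crawl_request(req))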
firecrawl/v2/methods/aio/extract.py
@@ -0,0 +1,126 @@
+from typing import Any, Dict, List, Optional
+import asyncio
+
+from ...types import ExtractResponse, ScrapeOptions
+from ...utils.http_client_async import AsyncHttpClient
+from ...utils.validation import prepare_scrape_options
+
+
+def _prepare_extract_request(
+    urls: Optional[List[str]],
+    *,
+    prompt: Optional[str] = None,
+    schema: Optional[Dict[str, Any]] = None,
+    system_prompt: Optional[str] = None,
+    allow_external_links: Optional[bool] = None,
+    enable_web_search: Optional[bool] = None,
+    show_sources: Optional[bool] = None,
+    scrape_options: Optional[ScrapeOptions] = None,
+    ignore_invalid_urls: Optional[bool] = None,
+) -> Dict[str, Any]:
+    body: Dict[str, Any] = {}
+    if urls is not None:
+        body["urls"] = urls
+    if prompt is not None:
+        body["prompt"] = prompt
+    if schema is not None:
+        body["schema"] = schema
+    if system_prompt is not None:
+        body["systemPrompt"] = system_prompt
+    if allow_external_links is not None:
+        body["allowExternalLinks"] = allow_external_links
+    if enable_web_search is not None:
+        body["enableWebSearch"] = enable_web_search
+    if show_sources is not None:
+        body["showSources"] = show_sources
+    if ignore_invalid_urls is not None:
+        body["ignoreInvalidURLs"] = ignore_invalid_urls
+    if scrape_options is not None:
+        prepared = prepare_scrape_options(scrape_options)
+        if prepared:
+            body["scrapeOptions"] = prepared
+    return body
+
+
+async def start_extract(
+    client: AsyncHttpClient,
+    urls: Optional[List[str]],
+    *,
+    prompt: Optional[str] = None,
+    schema: Optional[Dict[str, Any]] = None,
+    system_prompt: Optional[str] = None,
+    allow_external_links: Optional[bool] = None,
+    enable_web_search: Optional[bool] = None,
+    show_sources: Optional[bool] = None,
+    scrape_options: Optional[ScrapeOptions] = None,
+    ignore_invalid_urls: Optional[bool] = None,
+) -> ExtractResponse:
+    body = _prepare_extract_request(
+        urls,
+        prompt=prompt,
+        schema=schema,
+        system_prompt=system_prompt,
+        allow_external_links=allow_external_links,
+        enable_web_search=enable_web_search,
+        show_sources=show_sources,
+        scrape_options=scrape_options,
+        ignore_invalid_urls=ignore_invalid_urls,
+    )
+    resp = await client.post("/v2/extract", body)
+    return ExtractResponse(**resp.json())
+
+
+async def get_extract_status(client: AsyncHttpClient, job_id: str) -> ExtractResponse:
+    resp = await client.get(f"/v2/extract/{job_id}")
+    return ExtractResponse(**resp.json())
+
+
+async def wait_extract(
+    client: AsyncHttpClient,
+    job_id: str,
+    *,
+    poll_interval: int = 2,
+    timeout: Optional[int] = None,
+) -> ExtractResponse:
+    start_ts = asyncio.get_event_loop().time()
+    while True:
+        status = await get_extract_status(client, job_id)
+        if status.status in ("completed", "failed", "cancelled"):
+            return status
+        if timeout is not None and (asyncio.get_event_loop().time() - start_ts) > timeout:
+            return status
+        await asyncio.sleep(max(1, poll_interval))
+
+
+async def extract(
+    client: AsyncHttpClient,
+    urls: Optional[List[str]],
+    *,
+    prompt: Optional[str] = None,
+    schema: Optional[Dict[str, Any]] = None,
+    system_prompt: Optional[str] = None,
+    allow_external_links: Optional[bool] = None,
+    enable_web_search: Optional[bool] = None,
+    show_sources: Optional[bool] = None,
+    scrape_options: Optional[ScrapeOptions] = None,
+    ignore_invalid_urls: Optional[bool] = None,
+    poll_interval: int = 2,
+    timeout: Optional[int] = None,
+) -> ExtractResponse:
+    started = await start_extract(
+        client,
+        urls,
+        prompt=prompt,
+        schema=schema,
+        system_prompt=system_prompt,
+        allow_external_links=allow_external_links,
+        enable_web_search=enable_web_search,
+        show_sources=show_sources,
+        scrape_options=scrape_options,
+        ignore_invalid_urls=ignore_invalid_urls,
+    )
+    job_id = getattr(started, "id", None)
+    if not job_id:
+        return started
+    return await wait_extract(client, job_id, poll_interval=poll_interval, timeout=timeout)
+
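
Finally, the extract flow above is start_extract (POST /v2/extract), get_extract_status, and a wait_extract poller that stops on completed, failed, or cancelled, or returns the last status once the timeout elapses. A sketch through the client wrapper; the URL, prompt, and schema are placeholders:

import asyncio

from firecrawl.v2.client_async import AsyncFirecrawlClient


async def main() -> None:
    client = AsyncFirecrawlClient()
    result = await client.extract(
        urls=["https://example.com"],
        prompt="Extract the page title and a one-sentence summary.",
        schema={
            "type": "object",
            "properties": {"title": {"type": "string"}, "summary": {"type": "string"}},
        },
        poll_interval=2,
        timeout=120,
    )
    # ExtractResponse carries a status field checked by the poller above
    print(result.status)
    print(result)


asyncio.run(main())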