firecrawl-2.16.5-py3-none-any.whl → firecrawl-3.0.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of firecrawl was flagged by the registry as possibly problematic.
- firecrawl/__init__.py +27 -19
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +183 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
- firecrawl/__tests__/e2e/v2/test_map.py +60 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
- firecrawl/__tests__/e2e/v2/test_search.py +265 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +206 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +241 -0
- firecrawl/{firecrawl.py → firecrawl.backup.py} +17 -15
- firecrawl/types.py +157 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/v1/client.py +4653 -0
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +802 -0
- firecrawl/v2/client_async.py +250 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/batch.py +85 -0
- firecrawl/v2/methods/aio/crawl.py +174 -0
- firecrawl/v2/methods/aio/extract.py +126 -0
- firecrawl/v2/methods/aio/map.py +59 -0
- firecrawl/v2/methods/aio/scrape.py +36 -0
- firecrawl/v2/methods/aio/search.py +58 -0
- firecrawl/v2/methods/aio/usage.py +42 -0
- firecrawl/v2/methods/batch.py +420 -0
- firecrawl/v2/methods/crawl.py +468 -0
- firecrawl/v2/methods/extract.py +131 -0
- firecrawl/v2/methods/map.py +77 -0
- firecrawl/v2/methods/scrape.py +68 -0
- firecrawl/v2/methods/search.py +173 -0
- firecrawl/v2/methods/usage.py +41 -0
- firecrawl/v2/types.py +546 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +153 -0
- firecrawl/v2/utils/http_client_async.py +64 -0
- firecrawl/v2/utils/validation.py +324 -0
- firecrawl/v2/watcher.py +312 -0
- firecrawl/v2/watcher_async.py +245 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/LICENSE +0 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/METADATA +49 -32
- firecrawl-3.0.3.dist-info/RECORD +78 -0
- tests/test_timeout_conversion.py +117 -0
- firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -465
- firecrawl-2.16.5.dist-info/RECORD +0 -12
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/WHEEL +0 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/top_level.txt +0 -0
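The structural story of the 3.x release is visible from the file list alone: the legacy implementation moves under firecrawl/v1/ (and firecrawl.py is kept as firecrawl.backup.py), while a new v2 surface arrives with its own client, types, sync and async method modules, and websocket watchers. As a rough orientation, the new layout in import form; this is a sketch only, the module paths come from the list above and the exported names are inferred from the hunks below:

# Orientation sketch, not from the package docs.
from firecrawl.v2.types import ScrapeOptions, MapOptions, SearchRequest, Document  # names seen in the hunks below
from firecrawl.v2.utils.http_client_async import AsyncHttpClient  # new async HTTP layer
import firecrawl.v2.methods.scrape      # sync method module
import firecrawl.v2.methods.aio.scrape  # asyncio counterpart
import firecrawl.v1.client              # legacy surface, preserved under firecrawl/v1/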
firecrawl/v2/methods/aio/map.py
@@ -0,0 +1,59 @@
+from typing import Optional, Dict, Any
+from ...types import MapOptions, MapData, LinkResult
+from ...utils.http_client_async import AsyncHttpClient
+from ...utils.error_handler import handle_response_error
+
+
+def _prepare_map_request(url: str, options: Optional[MapOptions] = None) -> Dict[str, Any]:
+    if not url or not url.strip():
+        raise ValueError("URL cannot be empty")
+    payload: Dict[str, Any] = {"url": url.strip()}
+    if options is not None:
+        data: Dict[str, Any] = {}
+        if getattr(options, "sitemap", None) is not None:
+            data["sitemap"] = options.sitemap
+        if options.search is not None:
+            data["search"] = options.search
+        if options.include_subdomains is not None:
+            data["includeSubdomains"] = options.include_subdomains
+        if options.limit is not None:
+            data["limit"] = options.limit
+        if options.timeout is not None:
+            data["timeout"] = options.timeout
+        payload.update(data)
+    return payload
+
+
+async def map(client: AsyncHttpClient, url: str, options: Optional[MapOptions] = None) -> MapData:
+    request_data = _prepare_map_request(url, options)
+    response = await client.post("/v2/map", request_data)
+    if response.status_code >= 400:
+        handle_response_error(response, "map")
+    body = response.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error occurred"))
+
+
+    # data = body.get("data", {})
+    # result_links: list[LinkResult] = []
+    # for item in data.get("links", []):
+    #     if isinstance(item, dict):
+    #         result_links.append(
+    #             LinkResult(
+    #                 url=item.get("url", ""),
+    #                 title=item.get("title"),
+    #                 description=item.get("description"),
+    #             )
+    #         )
+    #     elif isinstance(item, str):
+    #         result_links.append(LinkResult(url=item))
+
+    result_links: list[LinkResult] = []
+    for item in body.get("links", []):
+        if isinstance(item, dict):
+            result_links.append(LinkResult(url=item.get("url", ""), title=item.get("title"), description=item.get("description")))
+        elif isinstance(item, str):
+            result_links.append(LinkResult(url=item))
+
+    return MapData(links=result_links)
+
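The first expanded hunk is the async map module. A minimal sketch of driving its request-preparation helper directly; the snake_case keyword arguments are the fields `_prepare_map_request` reads above, but constructing MapOptions this way is an assumption about the model defined in firecrawl/v2/types.py, which is not expanded in this diff:

# Sketch only: MapOptions is assumed to accept the snake_case fields read above.
from firecrawl.v2.types import MapOptions
from firecrawl.v2.methods.aio.map import _prepare_map_request

opts = MapOptions(search="docs", include_subdomains=False, limit=50, timeout=30000)
payload = _prepare_map_request("https://example.com", opts)
# Expected shape (assuming unset fields default to None): the URL is stripped and
# include_subdomains is re-keyed to the camelCase name the API expects:
# {'url': 'https://example.com', 'search': 'docs', 'includeSubdomains': False, 'limit': 50, 'timeout': 30000}
print(payload)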
firecrawl/v2/methods/aio/scrape.py
@@ -0,0 +1,36 @@
+from typing import Optional, Dict, Any
+from ...types import ScrapeOptions, Document
+from ...utils.error_handler import handle_response_error
+from ...utils.validation import prepare_scrape_options, validate_scrape_options
+from ...utils.http_client_async import AsyncHttpClient
+
+
+async def _prepare_scrape_request(url: str, options: Optional[ScrapeOptions] = None) -> Dict[str, Any]:
+    if not url or not url.strip():
+        raise ValueError("URL cannot be empty")
+    payload: Dict[str, Any] = {"url": url.strip()}
+    if options is not None:
+        validated = validate_scrape_options(options)
+        if validated is not None:
+            opts = prepare_scrape_options(validated)
+            if opts:
+                payload.update(opts)
+    return payload
+
+
+async def scrape(client: AsyncHttpClient, url: str, options: Optional[ScrapeOptions] = None) -> Document:
+    payload = await _prepare_scrape_request(url, options)
+    response = await client.post("/v2/scrape", payload)
+    if response.status_code >= 400:
+        handle_response_error(response, "scrape")
+    body = response.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error occurred"))
+    document_data = body.get("data", {})
+    normalized = dict(document_data)
+    if 'rawHtml' in normalized and 'raw_html' not in normalized:
+        normalized['raw_html'] = normalized.pop('rawHtml')
+    if 'changeTracking' in normalized and 'change_tracking' not in normalized:
+        normalized['change_tracking'] = normalized.pop('changeTracking')
+    return Document(**normalized)
+
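The notable detail in the async scrape module is the response normalization: the API replies with camelCase keys (rawHtml, changeTracking) while the Document model expects snake_case. A standalone restatement of that renaming, using a made-up response body so it runs without the API:

# Illustration of the key renaming done in scrape() above; the body dict is invented.
def normalize_document_keys(document_data: dict) -> dict:
    normalized = dict(document_data)
    if 'rawHtml' in normalized and 'raw_html' not in normalized:
        normalized['raw_html'] = normalized.pop('rawHtml')
    if 'changeTracking' in normalized and 'change_tracking' not in normalized:
        normalized['change_tracking'] = normalized.pop('changeTracking')
    return normalized

body = {"markdown": "# Hello", "rawHtml": "<h1>Hello</h1>", "metadata": {"statusCode": 200}}
print(normalize_document_keys(body))
# {'markdown': '# Hello', 'metadata': {'statusCode': 200}, 'raw_html': '<h1>Hello</h1>'}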
firecrawl/v2/methods/aio/search.py
@@ -0,0 +1,58 @@
+from typing import Dict, Any
+from ...types import SearchRequest, SearchData, SearchResult, Document
+from ...utils.http_client_async import AsyncHttpClient
+from ...utils.error_handler import handle_response_error
+from ...utils.validation import prepare_scrape_options, validate_scrape_options
+
+
+def _prepare_search_request(request: SearchRequest) -> Dict[str, Any]:
+    data = request.model_dump(exclude_none=True)
+    if request.ignore_invalid_urls is not None:
+        data["ignoreInvalidURLs"] = request.ignore_invalid_urls
+        data.pop("ignore_invalid_urls", None)
+    if request.scrape_options is not None:
+        validate_scrape_options(request.scrape_options)
+        scrape_data = prepare_scrape_options(request.scrape_options)
+        if scrape_data:
+            data["scrapeOptions"] = scrape_data
+        data.pop("scrape_options", None)
+    return data
+
+
+async def search(client: AsyncHttpClient, request: SearchRequest) -> SearchData:
+    payload = _prepare_search_request(request)
+    response = await client.post("/v2/search", payload)
+    if response.status_code >= 400:
+        handle_response_error(response, "search")
+    body = response.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error occurred"))
+
+    data = body.get("data", {})
+    search_data = SearchData()
+    for source_type, source_documents in data.items():
+        if isinstance(source_documents, list):
+            results = []
+            for doc_data in source_documents:
+                if isinstance(doc_data, dict):
+                    if request.scrape_options is not None and any(
+                        key in doc_data for key in ['markdown', 'html', 'rawHtml', 'links', 'summary', 'screenshot', 'changeTracking']
+                    ):
+                        normalized = dict(doc_data)
+                        if 'rawHtml' in normalized and 'raw_html' not in normalized:
+                            normalized['raw_html'] = normalized.pop('rawHtml')
+                        if 'changeTracking' in normalized and 'change_tracking' not in normalized:
+                            normalized['change_tracking'] = normalized.pop('changeTracking')
+                        results.append(Document(**normalized))
+                    else:
+                        results.append(SearchResult(
+                            url=doc_data.get('url', ''),
+                            title=doc_data.get('title'),
+                            description=doc_data.get('description')
+                        ))
+                elif isinstance(doc_data, str):
+                    results.append(SearchResult(url=doc_data))
+            if hasattr(search_data, source_type):
+                setattr(search_data, source_type, results)
+    return search_data
+
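For the async search module, the request side is the interesting half: snake_case request fields are re-keyed to the camelCase names the API expects. A short sketch of that; SearchRequest's full field set is not shown in this diff, so the query field is an assumption, while ignore_invalid_urls appears above:

# Sketch only: `query` is an assumed SearchRequest field; ignore_invalid_urls is shown above.
from firecrawl.v2.types import SearchRequest
from firecrawl.v2.methods.aio.search import _prepare_search_request

req = SearchRequest(query="firecrawl sdk", ignore_invalid_urls=True)
payload = _prepare_search_request(req)
# The snake_case key is replaced with the API name:
# payload["ignoreInvalidURLs"] is True and "ignore_invalid_urls" is gone.
print(payload)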
firecrawl/v2/methods/aio/usage.py
@@ -0,0 +1,42 @@
+from ...utils.http_client_async import AsyncHttpClient
+from ...utils.error_handler import handle_response_error
+from ...types import ConcurrencyCheck, CreditUsage, TokenUsage
+
+
+async def get_concurrency(client: AsyncHttpClient) -> ConcurrencyCheck:
+    resp = await client.get("/v2/concurrency-check")
+    if resp.status_code >= 400:
+        handle_response_error(resp, "get concurrency")
+    body = resp.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error"))
+    data = body.get("data", body)
+    return ConcurrencyCheck(
+        concurrency=data.get("concurrency"),
+        max_concurrency=data.get("maxConcurrency", data.get("max_concurrency")),
+    )
+
+
+async def get_credit_usage(client: AsyncHttpClient) -> CreditUsage:
+    resp = await client.get("/v2/team/credit-usage")
+    if resp.status_code >= 400:
+        handle_response_error(resp, "get credit usage")
+    body = resp.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error"))
+    data = body.get("data", body)
+    return CreditUsage(remaining_credits=data.get("remainingCredits", data.get("remaining_credits", 0)))
+
+
+async def get_token_usage(client: AsyncHttpClient) -> TokenUsage:
+    resp = await client.get("/v2/team/token-usage")
+    if resp.status_code >= 400:
+        handle_response_error(resp, "get token usage")
+    body = resp.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error"))
+    data = body.get("data", body)
+    return TokenUsage(
+        remaining_tokens=data.get("remainingTokens", 0)
+    )
+
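Tying the usage helpers together, a sketch of an end-to-end call. The AsyncHttpClient constructor arguments are an assumption (firecrawl/v2/utils/http_client_async.py is not expanded in this diff); the attribute names on the returned models match the constructors above:

import asyncio

# Assumed constructor: the api_url/api_key keyword names are a guess, not from this diff.
from firecrawl.v2.utils.http_client_async import AsyncHttpClient
from firecrawl.v2.methods.aio.usage import get_concurrency, get_credit_usage, get_token_usage

async def show_usage() -> None:
    client = AsyncHttpClient(api_url="https://api.firecrawl.dev", api_key="fc-YOUR-KEY")
    concurrency = await get_concurrency(client)
    credits = await get_credit_usage(client)
    tokens = await get_token_usage(client)
    # Field names come from the ConcurrencyCheck/CreditUsage/TokenUsage constructors above.
    print(concurrency.max_concurrency, credits.remaining_credits, tokens.remaining_tokens)

asyncio.run(show_usage())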
firecrawl/v2/methods/batch.py
@@ -0,0 +1,420 @@
+"""
+Batch scraping functionality for Firecrawl v2 API.
+"""
+
+import time
+from typing import Optional, List, Callable, Dict, Any, Union
+from ..types import (
+    BatchScrapeRequest,
+    BatchScrapeResponse,
+    BatchScrapeJob,
+    ScrapeOptions,
+    Document,
+    WebhookConfig,
+)
+from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
+from ..types import CrawlErrorsResponse
+
+
+def start_batch_scrape(
+    client: HttpClient,
+    urls: List[str],
+    *,
+    options: Optional[ScrapeOptions] = None,
+    webhook: Optional[Union[str, WebhookConfig]] = None,
+    append_to_id: Optional[str] = None,
+    ignore_invalid_urls: Optional[bool] = None,
+    max_concurrency: Optional[int] = None,
+    zero_data_retention: Optional[bool] = None,
+    integration: Optional[str] = None,
+    idempotency_key: Optional[str] = None,
+) -> BatchScrapeResponse:
+    """
+    Start a batch scrape job for multiple URLs.
+
+    Args:
+        client: HTTP client instance
+        urls: List of URLs to scrape
+        options: Scraping options
+
+    Returns:
+        BatchScrapeResponse containing job information
+
+    Raises:
+        FirecrawlError: If the batch scrape operation fails to start
+    """
+    # Prepare request data
+    request_data = prepare_batch_scrape_request(
+        urls,
+        options=options,
+        webhook=webhook,
+        append_to_id=append_to_id,
+        ignore_invalid_urls=ignore_invalid_urls,
+        max_concurrency=max_concurrency,
+        zero_data_retention=zero_data_retention,
+        integration=integration,
+    )
+
+    # Make the API request
+    headers = client._prepare_headers(idempotency_key)  # type: ignore[attr-defined]
+    response = client.post("/v2/batch/scrape", request_data, headers=headers)
+
+    # Handle errors
+    if not response.ok:
+        handle_response_error(response, "start batch scrape")
+
+    # Parse response
+    body = response.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error occurred"))
+    return BatchScrapeResponse(
+        id=body.get("id"),
+        url=body.get("url"),
+        invalid_urls=body.get("invalidURLs") or None,
+    )
+
+
+def get_batch_scrape_status(
+    client: HttpClient,
+    job_id: str
+) -> BatchScrapeJob:
+    """
+    Get the status of a batch scrape job.
+
+    Args:
+        client: HTTP client instance
+        job_id: ID of the batch scrape job
+
+    Returns:
+        BatchScrapeStatusResponse containing job status and data
+
+    Raises:
+        FirecrawlError: If the status check fails
+    """
+    # Make the API request
+    response = client.get(f"/v2/batch/scrape/{job_id}")
+
+    # Handle errors
+    if not response.ok:
+        handle_response_error(response, "get batch scrape status")
+
+    # Parse response
+    body = response.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error occurred"))
+
+    # Convert documents
+    documents: List[Document] = []
+    for doc in body.get("data", []) or []:
+        if isinstance(doc, dict):
+            normalized = dict(doc)
+            if 'rawHtml' in normalized and 'raw_html' not in normalized:
+                normalized['raw_html'] = normalized.pop('rawHtml')
+            if 'changeTracking' in normalized and 'change_tracking' not in normalized:
+                normalized['change_tracking'] = normalized.pop('changeTracking')
+            documents.append(Document(**normalized))
+
+    return BatchScrapeJob(
+        status=body.get("status"),
+        completed=body.get("completed", 0),
+        total=body.get("total", 0),
+        credits_used=body.get("creditsUsed"),
+        expires_at=body.get("expiresAt"),
+        next=body.get("next"),
+        data=documents,
+    )
+
+
+def cancel_batch_scrape(
+    client: HttpClient,
+    job_id: str
+) -> bool:
+    """
+    Cancel a running batch scrape job.
+
+    Args:
+        client: HTTP client instance
+        job_id: ID of the batch scrape job to cancel
+
+    Returns:
+        BatchScrapeStatusResponse with updated status
+
+    Raises:
+        FirecrawlError: If the cancellation fails
+    """
+    # Make the API request
+    response = client.delete(f"/v2/batch/scrape/{job_id}")
+
+    # Handle errors
+    if not response.ok:
+        handle_response_error(response, "cancel batch scrape")
+
+    # Parse response
+    body = response.json()
+    return body.get("status") == "cancelled"
+
+
+def wait_for_batch_completion(
+    client: HttpClient,
+    job_id: str,
+    poll_interval: int = 2,
+    timeout: Optional[int] = None
+) -> BatchScrapeJob:
+    """
+    Wait for a batch scrape job to complete, polling for status updates.
+
+    Args:
+        client: HTTP client instance
+        job_id: ID of the batch scrape job
+        poll_interval: Seconds between status checks
+        timeout: Maximum seconds to wait (None for no timeout)
+
+    Returns:
+        BatchScrapeStatusResponse when job completes
+
+    Raises:
+        FirecrawlError: If the job fails or timeout is reached
+        TimeoutError: If timeout is reached
+    """
+    start_time = time.time()
+
+    while True:
+        status_job = get_batch_scrape_status(client, job_id)
+
+        # Check if job is complete
+        if status_job.status in ["completed", "failed", "cancelled"]:
+            return status_job
+
+        # Check timeout
+        if timeout and (time.time() - start_time) > timeout:
+            raise TimeoutError(f"Batch scrape job {job_id} did not complete within {timeout} seconds")
+
+        # Wait before next poll
+        time.sleep(poll_interval)
+
+
+def batch_scrape(
+    client: HttpClient,
+    urls: List[str],
+    *,
+    options: Optional[ScrapeOptions] = None,
+    webhook: Optional[Union[str, WebhookConfig]] = None,
+    append_to_id: Optional[str] = None,
+    ignore_invalid_urls: Optional[bool] = None,
+    max_concurrency: Optional[int] = None,
+    zero_data_retention: Optional[bool] = None,
+    integration: Optional[str] = None,
+    idempotency_key: Optional[str] = None,
+    poll_interval: int = 2,
+    timeout: Optional[int] = None
+) -> BatchScrapeJob:
+    """
+    Start a batch scrape job and wait for it to complete.
+
+    Args:
+        client: HTTP client instance
+        urls: List of URLs to scrape
+        options: Scraping options
+        poll_interval: Seconds between status checks
+        timeout: Maximum seconds to wait (None for no timeout)
+
+    Returns:
+        BatchScrapeStatusResponse when job completes
+
+    Raises:
+        FirecrawlError: If the batch scrape fails to start or complete
+        TimeoutError: If timeout is reached
+    """
+    # Start the batch scrape
+    start = start_batch_scrape(
+        client,
+        urls,
+        options=options,
+        webhook=webhook,
+        append_to_id=append_to_id,
+        ignore_invalid_urls=ignore_invalid_urls,
+        max_concurrency=max_concurrency,
+        zero_data_retention=zero_data_retention,
+        integration=integration,
+        idempotency_key=idempotency_key,
+    )
+
+    job_id = start.id
+
+    # Wait for completion
+    return wait_for_batch_completion(
+        client, job_id, poll_interval, timeout
+    )
+
+
+def validate_batch_urls(urls: List[str]) -> List[str]:
+    """
+    Validate and normalize a list of URLs for batch scraping.
+
+    Args:
+        urls: List of URLs to validate
+
+    Returns:
+        Validated list of URLs
+
+    Raises:
+        ValueError: If URLs are invalid
+    """
+    if not urls:
+        raise ValueError("URLs list cannot be empty")
+
+    if len(urls) > 1000:  # Assuming API limit
+        raise ValueError("Too many URLs (maximum 1000)")
+
+    validated_urls = []
+    for url in urls:
+        if not url or not isinstance(url, str):
+            raise ValueError(f"Invalid URL: {url}")
+
+        # Basic URL validation
+        if not (url.startswith("http://") or url.startswith("https://")):
+            raise ValueError(f"URL must start with http:// or https://: {url}")
+
+        validated_urls.append(url.strip())
+
+    return validated_urls
+
+
+def prepare_batch_scrape_request(
+    urls: List[str],
+    *,
+    options: Optional[ScrapeOptions] = None,
+    webhook: Optional[Union[str, WebhookConfig]] = None,
+    append_to_id: Optional[str] = None,
+    ignore_invalid_urls: Optional[bool] = None,
+    max_concurrency: Optional[int] = None,
+    zero_data_retention: Optional[bool] = None,
+    integration: Optional[str] = None,
+) -> dict:
+    """
+    Prepare a batch scrape request payload.
+
+    Args:
+        urls: List of URLs to scrape
+        options: Scraping options
+
+    Returns:
+        Request payload dictionary
+    """
+    validated_urls = validate_batch_urls(urls)
+    request_data: Dict[str, Any] = {"urls": validated_urls}
+
+    # Flatten scrape options at the top level (v2 behavior)
+    if options:
+        scrape_data = prepare_scrape_options(options)
+        if scrape_data:
+            request_data.update(scrape_data)
+
+    # Batch-specific fields
+    if webhook is not None:
+        if isinstance(webhook, str):
+            request_data["webhook"] = webhook
+        else:
+            request_data["webhook"] = webhook.model_dump(exclude_none=True)
+    if append_to_id is not None:
+        request_data["appendToId"] = append_to_id
+    if ignore_invalid_urls is not None:
+        request_data["ignoreInvalidURLs"] = ignore_invalid_urls
+    if max_concurrency is not None:
+        request_data["maxConcurrency"] = max_concurrency
+    if zero_data_retention is not None:
+        request_data["zeroDataRetention"] = zero_data_retention
+    if integration is not None:
+        request_data["integration"] = integration
+
+    return request_data
+
+
+def chunk_urls(urls: List[str], chunk_size: int = 100) -> List[List[str]]:
+    """
+    Split a large list of URLs into smaller chunks for batch processing.
+
+    Args:
+        urls: List of URLs to chunk
+        chunk_size: Maximum size of each chunk
+
+    Returns:
+        List of URL chunks
+    """
+    chunks = []
+    for i in range(0, len(urls), chunk_size):
+        chunks.append(urls[i:i + chunk_size])
+    return chunks
+
+
+def process_large_batch(
+    client: HttpClient,
+    urls: List[str],
+    options: Optional[ScrapeOptions] = None,
+    chunk_size: int = 100,
+    poll_interval: int = 2,
+    timeout: Optional[int] = None
+) -> List[Document]:
+    """
+    Process a large batch of URLs by splitting into smaller chunks.
+
+    Args:
+        client: HTTP client instance
+        urls: List of URLs to scrape
+        options: Scraping options
+        chunk_size: Size of each batch chunk
+        poll_interval: Seconds between status checks
+        timeout: Maximum seconds to wait per chunk
+
+    Returns:
+        List of all scraped documents
+
+    Raises:
+        FirecrawlError: If any chunk fails
+    """
+    url_chunks = chunk_urls(urls, chunk_size)
+    all_documents = []
+    completed_chunks = 0
+
+    for chunk in url_chunks:
+        # Process this chunk
+        result = batch_scrape(
+            client,
+            chunk,
+            options=options,
+            poll_interval=poll_interval,
+            timeout=timeout,
+        )
+
+        # Add documents from this chunk
+        if result.data:
+            all_documents.extend(result.data)
+
+        completed_chunks += 1
+
+    return all_documents
+
+
+def get_batch_scrape_errors(client: HttpClient, job_id: str) -> CrawlErrorsResponse:
+    """
+    Get errors for a batch scrape job.
+
+    Args:
+        client: HTTP client instance
+        job_id: ID of the batch scrape job
+
+    Returns:
+        CrawlErrorsResponse with errors and robots-blocked URLs
+    """
+    response = client.get(f"/v2/batch/scrape/{job_id}/errors")
+
+    if not response.ok:
+        handle_response_error(response, "get batch scrape errors")
+
+    body = response.json()
+    payload = body.get("data", body)
+    normalized = {
+        "errors": payload.get("errors", []),
+        "robots_blocked": payload.get("robotsBlocked", payload.get("robots_blocked", [])),
+    }
+    return CrawlErrorsResponse(**normalized)
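Finally, a sketch of the batch workflow built from the functions above: start a job, keep its id, poll until it reaches a terminal state, then inspect the documents or the error report. The HttpClient constructor signature is an assumption (firecrawl/v2/utils/http_client.py is not expanded in this diff); everything else is defined in this module:

# Sketch only: HttpClient's constructor keywords are assumed, the batch helpers are from above.
from firecrawl.v2.utils.http_client import HttpClient
from firecrawl.v2.methods.batch import (
    start_batch_scrape,
    wait_for_batch_completion,
    get_batch_scrape_errors,
)

client = HttpClient(api_url="https://api.firecrawl.dev", api_key="fc-YOUR-KEY")

# Start the job and keep its id for status/error lookups.
started = start_batch_scrape(
    client,
    ["https://example.com", "https://firecrawl.dev"],
    max_concurrency=5,
)

# Poll until the job is completed, failed, or cancelled.
job = wait_for_batch_completion(client, started.id, poll_interval=2, timeout=300)
print(job.status, f"{job.completed}/{job.total}", len(job.data))

# If anything went wrong, the errors endpoint reports failures and robots-blocked URLs.
if job.status != "completed":
    report = get_batch_scrape_errors(client, started.id)
    print(report.errors)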