firecrawl 2.16.5__py3-none-any.whl → 3.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of firecrawl has been flagged as potentially problematic.

Files changed (82)
  1. firecrawl/__init__.py +27 -19
  2. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +183 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  10. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  11. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
  13. firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
  14. firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
  15. firecrawl/__tests__/e2e/v2/test_map.py +60 -0
  16. firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
  17. firecrawl/__tests__/e2e/v2/test_search.py +265 -0
  18. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  19. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  20. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  21. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  28. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  29. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  30. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  31. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  32. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
  33. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
  34. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
  35. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +206 -0
  36. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  37. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  38. firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
  39. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  40. firecrawl/client.py +241 -0
  41. firecrawl/{firecrawl.py → firecrawl.backup.py} +17 -15
  42. firecrawl/types.py +157 -0
  43. firecrawl/v1/__init__.py +14 -0
  44. firecrawl/v1/client.py +4653 -0
  45. firecrawl/v2/__init__.py +4 -0
  46. firecrawl/v2/client.py +802 -0
  47. firecrawl/v2/client_async.py +250 -0
  48. firecrawl/v2/methods/aio/__init__.py +1 -0
  49. firecrawl/v2/methods/aio/batch.py +85 -0
  50. firecrawl/v2/methods/aio/crawl.py +174 -0
  51. firecrawl/v2/methods/aio/extract.py +126 -0
  52. firecrawl/v2/methods/aio/map.py +59 -0
  53. firecrawl/v2/methods/aio/scrape.py +36 -0
  54. firecrawl/v2/methods/aio/search.py +58 -0
  55. firecrawl/v2/methods/aio/usage.py +42 -0
  56. firecrawl/v2/methods/batch.py +420 -0
  57. firecrawl/v2/methods/crawl.py +468 -0
  58. firecrawl/v2/methods/extract.py +131 -0
  59. firecrawl/v2/methods/map.py +77 -0
  60. firecrawl/v2/methods/scrape.py +68 -0
  61. firecrawl/v2/methods/search.py +173 -0
  62. firecrawl/v2/methods/usage.py +41 -0
  63. firecrawl/v2/types.py +546 -0
  64. firecrawl/v2/utils/__init__.py +9 -0
  65. firecrawl/v2/utils/error_handler.py +107 -0
  66. firecrawl/v2/utils/get_version.py +15 -0
  67. firecrawl/v2/utils/http_client.py +153 -0
  68. firecrawl/v2/utils/http_client_async.py +64 -0
  69. firecrawl/v2/utils/validation.py +324 -0
  70. firecrawl/v2/watcher.py +312 -0
  71. firecrawl/v2/watcher_async.py +245 -0
  72. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/LICENSE +0 -0
  73. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/METADATA +49 -32
  74. firecrawl-3.0.3.dist-info/RECORD +78 -0
  75. tests/test_timeout_conversion.py +117 -0
  76. firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  77. firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  78. firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  79. firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -465
  80. firecrawl-2.16.5.dist-info/RECORD +0 -12
  81. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/WHEEL +0 -0
  82. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/top_level.txt +0 -0
firecrawl/v2/methods/aio/map.py (new file)
@@ -0,0 +1,59 @@
+ from typing import Optional, Dict, Any
+ from ...types import MapOptions, MapData, LinkResult
+ from ...utils.http_client_async import AsyncHttpClient
+ from ...utils.error_handler import handle_response_error
+
+
+ def _prepare_map_request(url: str, options: Optional[MapOptions] = None) -> Dict[str, Any]:
+     if not url or not url.strip():
+         raise ValueError("URL cannot be empty")
+     payload: Dict[str, Any] = {"url": url.strip()}
+     if options is not None:
+         data: Dict[str, Any] = {}
+         if getattr(options, "sitemap", None) is not None:
+             data["sitemap"] = options.sitemap
+         if options.search is not None:
+             data["search"] = options.search
+         if options.include_subdomains is not None:
+             data["includeSubdomains"] = options.include_subdomains
+         if options.limit is not None:
+             data["limit"] = options.limit
+         if options.timeout is not None:
+             data["timeout"] = options.timeout
+         payload.update(data)
+     return payload
+
+
+ async def map(client: AsyncHttpClient, url: str, options: Optional[MapOptions] = None) -> MapData:
+     request_data = _prepare_map_request(url, options)
+     response = await client.post("/v2/map", request_data)
+     if response.status_code >= 400:
+         handle_response_error(response, "map")
+     body = response.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error occurred"))
+
+
+     # data = body.get("data", {})
+     # result_links: list[LinkResult] = []
+     # for item in data.get("links", []):
+     #     if isinstance(item, dict):
+     #         result_links.append(
+     #             LinkResult(
+     #                 url=item.get("url", ""),
+     #                 title=item.get("title"),
+     #                 description=item.get("description"),
+     #             )
+     #         )
+     #     elif isinstance(item, str):
+     #         result_links.append(LinkResult(url=item))
+
+     result_links: list[LinkResult] = []
+     for item in body.get("links", []):
+         if isinstance(item, dict):
+             result_links.append(LinkResult(url=item.get("url", ""), title=item.get("title"), description=item.get("description")))
+         elif isinstance(item, str):
+             result_links.append(LinkResult(url=item))
+
+     return MapData(links=result_links)
+
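
The helper above maps snake_case MapOptions fields onto the camelCase keys the v2 API expects. A minimal sketch of that payload preparation, assuming firecrawl 3.x is installed and that MapOptions accepts these fields as keyword arguments (implied by the attribute access in the diff, not shown directly):

```python
# Illustrative only: _prepare_map_request is a private helper, the values below
# are made up, and MapOptions may contribute defaults of its own (e.g. sitemap).
from firecrawl.v2.types import MapOptions
from firecrawl.v2.methods.aio.map import _prepare_map_request

options = MapOptions(search="docs", include_subdomains=False, limit=50)
payload = _prepare_map_request("  https://example.com  ", options)

# The URL is stripped and option names are re-keyed to camelCase
# (include_subdomains -> includeSubdomains); expect something like:
# {'url': 'https://example.com', 'search': 'docs', 'includeSubdomains': False, 'limit': 50, ...}
print(payload)
```
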
firecrawl/v2/methods/aio/scrape.py (new file)
@@ -0,0 +1,36 @@
+ from typing import Optional, Dict, Any
+ from ...types import ScrapeOptions, Document
+ from ...utils.error_handler import handle_response_error
+ from ...utils.validation import prepare_scrape_options, validate_scrape_options
+ from ...utils.http_client_async import AsyncHttpClient
+
+
+ async def _prepare_scrape_request(url: str, options: Optional[ScrapeOptions] = None) -> Dict[str, Any]:
+     if not url or not url.strip():
+         raise ValueError("URL cannot be empty")
+     payload: Dict[str, Any] = {"url": url.strip()}
+     if options is not None:
+         validated = validate_scrape_options(options)
+         if validated is not None:
+             opts = prepare_scrape_options(validated)
+             if opts:
+                 payload.update(opts)
+     return payload
+
+
+ async def scrape(client: AsyncHttpClient, url: str, options: Optional[ScrapeOptions] = None) -> Document:
+     payload = await _prepare_scrape_request(url, options)
+     response = await client.post("/v2/scrape", payload)
+     if response.status_code >= 400:
+         handle_response_error(response, "scrape")
+     body = response.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error occurred"))
+     document_data = body.get("data", {})
+     normalized = dict(document_data)
+     if 'rawHtml' in normalized and 'raw_html' not in normalized:
+         normalized['raw_html'] = normalized.pop('rawHtml')
+     if 'changeTracking' in normalized and 'change_tracking' not in normalized:
+         normalized['change_tracking'] = normalized.pop('changeTracking')
+     return Document(**normalized)
+
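
The async scrape wrapper re-keys camelCase response fields to the snake_case names the Document model uses. A standalone sketch of that normalization (the sample payload is illustrative, not a real API response):

```python
# Mirrors the rawHtml/changeTracking normalization in the diff above, without
# requiring the SDK; the input dict is a made-up example.
def normalize_document_keys(raw: dict) -> dict:
    normalized = dict(raw)
    if "rawHtml" in normalized and "raw_html" not in normalized:
        normalized["raw_html"] = normalized.pop("rawHtml")
    if "changeTracking" in normalized and "change_tracking" not in normalized:
        normalized["change_tracking"] = normalized.pop("changeTracking")
    return normalized


print(normalize_document_keys({"markdown": "# Hi", "rawHtml": "<h1>Hi</h1>"}))
# {'markdown': '# Hi', 'raw_html': '<h1>Hi</h1>'}
```
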
firecrawl/v2/methods/aio/search.py (new file)
@@ -0,0 +1,58 @@
+ from typing import Dict, Any
+ from ...types import SearchRequest, SearchData, SearchResult, Document
+ from ...utils.http_client_async import AsyncHttpClient
+ from ...utils.error_handler import handle_response_error
+ from ...utils.validation import prepare_scrape_options, validate_scrape_options
+
+
+ def _prepare_search_request(request: SearchRequest) -> Dict[str, Any]:
+     data = request.model_dump(exclude_none=True)
+     if request.ignore_invalid_urls is not None:
+         data["ignoreInvalidURLs"] = request.ignore_invalid_urls
+     data.pop("ignore_invalid_urls", None)
+     if request.scrape_options is not None:
+         validate_scrape_options(request.scrape_options)
+         scrape_data = prepare_scrape_options(request.scrape_options)
+         if scrape_data:
+             data["scrapeOptions"] = scrape_data
+     data.pop("scrape_options", None)
+     return data
+
+
+ async def search(client: AsyncHttpClient, request: SearchRequest) -> SearchData:
+     payload = _prepare_search_request(request)
+     response = await client.post("/v2/search", payload)
+     if response.status_code >= 400:
+         handle_response_error(response, "search")
+     body = response.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error occurred"))
+
+     data = body.get("data", {})
+     search_data = SearchData()
+     for source_type, source_documents in data.items():
+         if isinstance(source_documents, list):
+             results = []
+             for doc_data in source_documents:
+                 if isinstance(doc_data, dict):
+                     if request.scrape_options is not None and any(
+                         key in doc_data for key in ['markdown', 'html', 'rawHtml', 'links', 'summary', 'screenshot', 'changeTracking']
+                     ):
+                         normalized = dict(doc_data)
+                         if 'rawHtml' in normalized and 'raw_html' not in normalized:
+                             normalized['raw_html'] = normalized.pop('rawHtml')
+                         if 'changeTracking' in normalized and 'change_tracking' not in normalized:
+                             normalized['change_tracking'] = normalized.pop('changeTracking')
+                         results.append(Document(**normalized))
+                     else:
+                         results.append(SearchResult(
+                             url=doc_data.get('url', ''),
+                             title=doc_data.get('title'),
+                             description=doc_data.get('description')
+                         ))
+                 elif isinstance(doc_data, str):
+                     results.append(SearchResult(url=doc_data))
+             if hasattr(search_data, source_type):
+                 setattr(search_data, source_type, results)
+     return search_data
+
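
The async search helper decides per result whether to build a full Document (when scrape options were requested and the item carries scraped fields) or a lightweight SearchResult. A self-contained sketch of that branching, with SimpleResult and SimpleDocument standing in for the SDK's SearchResult and Document models:

```python
# Mirrors the per-item branching in the diff above; the dataclasses and sample
# items are stand-ins, not the SDK's own types or real API data.
from dataclasses import dataclass
from typing import Optional, Union

SCRAPED_KEYS = {"markdown", "html", "rawHtml", "links", "summary", "screenshot", "changeTracking"}


@dataclass
class SimpleResult:
    url: str
    title: Optional[str] = None
    description: Optional[str] = None


@dataclass
class SimpleDocument:
    markdown: Optional[str] = None
    raw_html: Optional[str] = None


def classify(item: Union[dict, str], scrape_requested: bool):
    # Bare string results are plain links.
    if isinstance(item, str):
        return SimpleResult(url=item)
    # Scraped payloads become documents only when scraping was requested.
    if scrape_requested and SCRAPED_KEYS & item.keys():
        return SimpleDocument(markdown=item.get("markdown"), raw_html=item.get("rawHtml"))
    return SimpleResult(url=item.get("url", ""), title=item.get("title"), description=item.get("description"))


print(classify({"url": "https://example.com", "title": "Example"}, scrape_requested=False))
print(classify({"url": "https://example.com", "markdown": "# Example"}, scrape_requested=True))
```
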
firecrawl/v2/methods/aio/usage.py (new file)
@@ -0,0 +1,42 @@
+ from ...utils.http_client_async import AsyncHttpClient
+ from ...utils.error_handler import handle_response_error
+ from ...types import ConcurrencyCheck, CreditUsage, TokenUsage
+
+
+ async def get_concurrency(client: AsyncHttpClient) -> ConcurrencyCheck:
+     resp = await client.get("/v2/concurrency-check")
+     if resp.status_code >= 400:
+         handle_response_error(resp, "get concurrency")
+     body = resp.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error"))
+     data = body.get("data", body)
+     return ConcurrencyCheck(
+         concurrency=data.get("concurrency"),
+         max_concurrency=data.get("maxConcurrency", data.get("max_concurrency")),
+     )
+
+
+ async def get_credit_usage(client: AsyncHttpClient) -> CreditUsage:
+     resp = await client.get("/v2/team/credit-usage")
+     if resp.status_code >= 400:
+         handle_response_error(resp, "get credit usage")
+     body = resp.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error"))
+     data = body.get("data", body)
+     return CreditUsage(remaining_credits=data.get("remainingCredits", data.get("remaining_credits", 0)))
+
+
+ async def get_token_usage(client: AsyncHttpClient) -> TokenUsage:
+     resp = await client.get("/v2/team/token-usage")
+     if resp.status_code >= 400:
+         handle_response_error(resp, "get token usage")
+     body = resp.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error"))
+     data = body.get("data", body)
+     return TokenUsage(
+         remaining_tokens=data.get("remainingTokens", 0)
+     )
+
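
The usage helpers tolerate two response shapes: fields either nested under a "data" envelope or at the top level, in camelCase or snake_case. A tiny standalone sketch of that fallback (sample bodies are made up):

```python
# Mirrors the body.get("data", body) / camelCase-or-snake_case fallback used by
# get_credit_usage above; the response dicts are illustrative.
def remaining_credits(body: dict) -> int:
    data = body.get("data", body)
    return data.get("remainingCredits", data.get("remaining_credits", 0))


print(remaining_credits({"success": True, "data": {"remainingCredits": 1200}}))  # 1200
print(remaining_credits({"success": True, "remaining_credits": 1200}))           # 1200
```
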
firecrawl/v2/methods/batch.py (new file)
@@ -0,0 +1,420 @@
+ """
+ Batch scraping functionality for Firecrawl v2 API.
+ """
+
+ import time
+ from typing import Optional, List, Callable, Dict, Any, Union
+ from ..types import (
+     BatchScrapeRequest,
+     BatchScrapeResponse,
+     BatchScrapeJob,
+     ScrapeOptions,
+     Document,
+     WebhookConfig,
+ )
+ from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
+ from ..types import CrawlErrorsResponse
+
+
+ def start_batch_scrape(
+     client: HttpClient,
+     urls: List[str],
+     *,
+     options: Optional[ScrapeOptions] = None,
+     webhook: Optional[Union[str, WebhookConfig]] = None,
+     append_to_id: Optional[str] = None,
+     ignore_invalid_urls: Optional[bool] = None,
+     max_concurrency: Optional[int] = None,
+     zero_data_retention: Optional[bool] = None,
+     integration: Optional[str] = None,
+     idempotency_key: Optional[str] = None,
+ ) -> BatchScrapeResponse:
+     """
+     Start a batch scrape job for multiple URLs.
+
+     Args:
+         client: HTTP client instance
+         urls: List of URLs to scrape
+         options: Scraping options
+
+     Returns:
+         BatchScrapeResponse containing job information
+
+     Raises:
+         FirecrawlError: If the batch scrape operation fails to start
+     """
+     # Prepare request data
+     request_data = prepare_batch_scrape_request(
+         urls,
+         options=options,
+         webhook=webhook,
+         append_to_id=append_to_id,
+         ignore_invalid_urls=ignore_invalid_urls,
+         max_concurrency=max_concurrency,
+         zero_data_retention=zero_data_retention,
+         integration=integration,
+     )
+
+     # Make the API request
+     headers = client._prepare_headers(idempotency_key)  # type: ignore[attr-defined]
+     response = client.post("/v2/batch/scrape", request_data, headers=headers)
+
+     # Handle errors
+     if not response.ok:
+         handle_response_error(response, "start batch scrape")
+
+     # Parse response
+     body = response.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error occurred"))
+     return BatchScrapeResponse(
+         id=body.get("id"),
+         url=body.get("url"),
+         invalid_urls=body.get("invalidURLs") or None,
+     )
+
+
+ def get_batch_scrape_status(
+     client: HttpClient,
+     job_id: str
+ ) -> BatchScrapeJob:
+     """
+     Get the status of a batch scrape job.
+
+     Args:
+         client: HTTP client instance
+         job_id: ID of the batch scrape job
+
+     Returns:
+         BatchScrapeStatusResponse containing job status and data
+
+     Raises:
+         FirecrawlError: If the status check fails
+     """
+     # Make the API request
+     response = client.get(f"/v2/batch/scrape/{job_id}")
+
+     # Handle errors
+     if not response.ok:
+         handle_response_error(response, "get batch scrape status")
+
+     # Parse response
+     body = response.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error occurred"))
+
+     # Convert documents
+     documents: List[Document] = []
+     for doc in body.get("data", []) or []:
+         if isinstance(doc, dict):
+             normalized = dict(doc)
+             if 'rawHtml' in normalized and 'raw_html' not in normalized:
+                 normalized['raw_html'] = normalized.pop('rawHtml')
+             if 'changeTracking' in normalized and 'change_tracking' not in normalized:
+                 normalized['change_tracking'] = normalized.pop('changeTracking')
+             documents.append(Document(**normalized))
+
+     return BatchScrapeJob(
+         status=body.get("status"),
+         completed=body.get("completed", 0),
+         total=body.get("total", 0),
+         credits_used=body.get("creditsUsed"),
+         expires_at=body.get("expiresAt"),
+         next=body.get("next"),
+         data=documents,
+     )
+
+
+ def cancel_batch_scrape(
+     client: HttpClient,
+     job_id: str
+ ) -> bool:
+     """
+     Cancel a running batch scrape job.
+
+     Args:
+         client: HTTP client instance
+         job_id: ID of the batch scrape job to cancel
+
+     Returns:
+         BatchScrapeStatusResponse with updated status
+
+     Raises:
+         FirecrawlError: If the cancellation fails
+     """
+     # Make the API request
+     response = client.delete(f"/v2/batch/scrape/{job_id}")
+
+     # Handle errors
+     if not response.ok:
+         handle_response_error(response, "cancel batch scrape")
+
+     # Parse response
+     body = response.json()
+     return body.get("status") == "cancelled"
+
+
+ def wait_for_batch_completion(
+     client: HttpClient,
+     job_id: str,
+     poll_interval: int = 2,
+     timeout: Optional[int] = None
+ ) -> BatchScrapeJob:
+     """
+     Wait for a batch scrape job to complete, polling for status updates.
+
+     Args:
+         client: HTTP client instance
+         job_id: ID of the batch scrape job
+         poll_interval: Seconds between status checks
+         timeout: Maximum seconds to wait (None for no timeout)
+
+     Returns:
+         BatchScrapeStatusResponse when job completes
+
+     Raises:
+         FirecrawlError: If the job fails or timeout is reached
+         TimeoutError: If timeout is reached
+     """
+     start_time = time.time()
+
+     while True:
+         status_job = get_batch_scrape_status(client, job_id)
+
+         # Check if job is complete
+         if status_job.status in ["completed", "failed", "cancelled"]:
+             return status_job
+
+         # Check timeout
+         if timeout and (time.time() - start_time) > timeout:
+             raise TimeoutError(f"Batch scrape job {job_id} did not complete within {timeout} seconds")
+
+         # Wait before next poll
+         time.sleep(poll_interval)
+
+
+ def batch_scrape(
+     client: HttpClient,
+     urls: List[str],
+     *,
+     options: Optional[ScrapeOptions] = None,
+     webhook: Optional[Union[str, WebhookConfig]] = None,
+     append_to_id: Optional[str] = None,
+     ignore_invalid_urls: Optional[bool] = None,
+     max_concurrency: Optional[int] = None,
+     zero_data_retention: Optional[bool] = None,
+     integration: Optional[str] = None,
+     idempotency_key: Optional[str] = None,
+     poll_interval: int = 2,
+     timeout: Optional[int] = None
+ ) -> BatchScrapeJob:
+     """
+     Start a batch scrape job and wait for it to complete.
+
+     Args:
+         client: HTTP client instance
+         urls: List of URLs to scrape
+         options: Scraping options
+         poll_interval: Seconds between status checks
+         timeout: Maximum seconds to wait (None for no timeout)
+
+     Returns:
+         BatchScrapeStatusResponse when job completes
+
+     Raises:
+         FirecrawlError: If the batch scrape fails to start or complete
+         TimeoutError: If timeout is reached
+     """
+     # Start the batch scrape
+     start = start_batch_scrape(
+         client,
+         urls,
+         options=options,
+         webhook=webhook,
+         append_to_id=append_to_id,
+         ignore_invalid_urls=ignore_invalid_urls,
+         max_concurrency=max_concurrency,
+         zero_data_retention=zero_data_retention,
+         integration=integration,
+         idempotency_key=idempotency_key,
+     )
+
+     job_id = start.id
+
+     # Wait for completion
+     return wait_for_batch_completion(
+         client, job_id, poll_interval, timeout
+     )
+
+
+ def validate_batch_urls(urls: List[str]) -> List[str]:
+     """
+     Validate and normalize a list of URLs for batch scraping.
+
+     Args:
+         urls: List of URLs to validate
+
+     Returns:
+         Validated list of URLs
+
+     Raises:
+         ValueError: If URLs are invalid
+     """
+     if not urls:
+         raise ValueError("URLs list cannot be empty")
+
+     if len(urls) > 1000:  # Assuming API limit
+         raise ValueError("Too many URLs (maximum 1000)")
+
+     validated_urls = []
+     for url in urls:
+         if not url or not isinstance(url, str):
+             raise ValueError(f"Invalid URL: {url}")
+
+         # Basic URL validation
+         if not (url.startswith("http://") or url.startswith("https://")):
+             raise ValueError(f"URL must start with http:// or https://: {url}")
+
+         validated_urls.append(url.strip())
+
+     return validated_urls
+
+
+ def prepare_batch_scrape_request(
+     urls: List[str],
+     *,
+     options: Optional[ScrapeOptions] = None,
+     webhook: Optional[Union[str, WebhookConfig]] = None,
+     append_to_id: Optional[str] = None,
+     ignore_invalid_urls: Optional[bool] = None,
+     max_concurrency: Optional[int] = None,
+     zero_data_retention: Optional[bool] = None,
+     integration: Optional[str] = None,
+ ) -> dict:
+     """
+     Prepare a batch scrape request payload.
+
+     Args:
+         urls: List of URLs to scrape
+         options: Scraping options
+
+     Returns:
+         Request payload dictionary
+     """
+     validated_urls = validate_batch_urls(urls)
+     request_data: Dict[str, Any] = {"urls": validated_urls}
+
+     # Flatten scrape options at the top level (v2 behavior)
+     if options:
+         scrape_data = prepare_scrape_options(options)
+         if scrape_data:
+             request_data.update(scrape_data)
+
+     # Batch-specific fields
+     if webhook is not None:
+         if isinstance(webhook, str):
+             request_data["webhook"] = webhook
+         else:
+             request_data["webhook"] = webhook.model_dump(exclude_none=True)
+     if append_to_id is not None:
+         request_data["appendToId"] = append_to_id
+     if ignore_invalid_urls is not None:
+         request_data["ignoreInvalidURLs"] = ignore_invalid_urls
+     if max_concurrency is not None:
+         request_data["maxConcurrency"] = max_concurrency
+     if zero_data_retention is not None:
+         request_data["zeroDataRetention"] = zero_data_retention
+     if integration is not None:
+         request_data["integration"] = integration
+
+     return request_data
+
+
+ def chunk_urls(urls: List[str], chunk_size: int = 100) -> List[List[str]]:
+     """
+     Split a large list of URLs into smaller chunks for batch processing.
+
+     Args:
+         urls: List of URLs to chunk
+         chunk_size: Maximum size of each chunk
+
+     Returns:
+         List of URL chunks
+     """
+     chunks = []
+     for i in range(0, len(urls), chunk_size):
+         chunks.append(urls[i:i + chunk_size])
+     return chunks
+
+
+ def process_large_batch(
+     client: HttpClient,
+     urls: List[str],
+     options: Optional[ScrapeOptions] = None,
+     chunk_size: int = 100,
+     poll_interval: int = 2,
+     timeout: Optional[int] = None
+ ) -> List[Document]:
+     """
+     Process a large batch of URLs by splitting into smaller chunks.
+
+     Args:
+         client: HTTP client instance
+         urls: List of URLs to scrape
+         options: Scraping options
+         chunk_size: Size of each batch chunk
+         poll_interval: Seconds between status checks
+         timeout: Maximum seconds to wait per chunk
+
+     Returns:
+         List of all scraped documents
+
+     Raises:
+         FirecrawlError: If any chunk fails
+     """
+     url_chunks = chunk_urls(urls, chunk_size)
+     all_documents = []
+     completed_chunks = 0
+
+     for chunk in url_chunks:
+         # Process this chunk
+         result = batch_scrape(
+             client,
+             chunk,
+             options=options,
+             poll_interval=poll_interval,
+             timeout=timeout,
+         )
+
+         # Add documents from this chunk
+         if result.data:
+             all_documents.extend(result.data)
+
+         completed_chunks += 1
+
+     return all_documents
+
+
+ def get_batch_scrape_errors(client: HttpClient, job_id: str) -> CrawlErrorsResponse:
+     """
+     Get errors for a batch scrape job.
+
+     Args:
+         client: HTTP client instance
+         job_id: ID of the batch scrape job
+
+     Returns:
+         CrawlErrorsResponse with errors and robots-blocked URLs
+     """
+     response = client.get(f"/v2/batch/scrape/{job_id}/errors")
+
+     if not response.ok:
+         handle_response_error(response, "get batch scrape errors")
+
+     body = response.json()
+     payload = body.get("data", body)
+     normalized = {
+         "errors": payload.get("errors", []),
+         "robots_blocked": payload.get("robotsBlocked", payload.get("robots_blocked", [])),
+     }
+     return CrawlErrorsResponse(**normalized)
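
The new batch module splits into payload preparation (prepare_batch_scrape_request), job start (start_batch_scrape), and a poll loop (wait_for_batch_completion) that batch_scrape and process_large_batch build on. A minimal sketch of the payload shape, assuming firecrawl 3.x is installed (the URLs and limits here are illustrative):

```python
# Payload preparation only; no HTTP client is needed for this step.
from firecrawl.v2.methods.batch import prepare_batch_scrape_request

payload = prepare_batch_scrape_request(
    ["https://example.com", "https://example.org"],
    ignore_invalid_urls=True,
    max_concurrency=5,
)

# Snake_case keyword arguments become the camelCase fields the API expects:
# {'urls': ['https://example.com', 'https://example.org'],
#  'ignoreInvalidURLs': True, 'maxConcurrency': 5}
print(payload)
```

batch_scrape then posts this payload via start_batch_scrape and polls get_batch_scrape_status every poll_interval seconds until the job reports completed, failed, or cancelled; process_large_batch applies the same flow to each chunk produced by chunk_urls.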