firecrawl-4.12.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- firecrawl/__init__.py +87 -0
- firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
- firecrawl/__tests__/e2e/v2/test_map.py +61 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
- firecrawl/__tests__/e2e/v2/test_search.py +270 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
- firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
- firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
- firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +281 -0
- firecrawl/firecrawl.backup.py +4635 -0
- firecrawl/types.py +167 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/v1/client.py +5164 -0
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +967 -0
- firecrawl/v2/client_async.py +408 -0
- firecrawl/v2/methods/agent.py +144 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/agent.py +137 -0
- firecrawl/v2/methods/aio/batch.py +188 -0
- firecrawl/v2/methods/aio/crawl.py +351 -0
- firecrawl/v2/methods/aio/extract.py +133 -0
- firecrawl/v2/methods/aio/map.py +65 -0
- firecrawl/v2/methods/aio/scrape.py +33 -0
- firecrawl/v2/methods/aio/search.py +176 -0
- firecrawl/v2/methods/aio/usage.py +89 -0
- firecrawl/v2/methods/batch.py +499 -0
- firecrawl/v2/methods/crawl.py +592 -0
- firecrawl/v2/methods/extract.py +161 -0
- firecrawl/v2/methods/map.py +83 -0
- firecrawl/v2/methods/scrape.py +64 -0
- firecrawl/v2/methods/search.py +215 -0
- firecrawl/v2/methods/usage.py +84 -0
- firecrawl/v2/types.py +1143 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +178 -0
- firecrawl/v2/utils/http_client_async.py +69 -0
- firecrawl/v2/utils/normalize.py +125 -0
- firecrawl/v2/utils/validation.py +692 -0
- firecrawl/v2/watcher.py +301 -0
- firecrawl/v2/watcher_async.py +243 -0
- firecrawl-4.12.0.dist-info/METADATA +234 -0
- firecrawl-4.12.0.dist-info/RECORD +92 -0
- firecrawl-4.12.0.dist-info/WHEEL +5 -0
- firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
- firecrawl-4.12.0.dist-info/top_level.txt +2 -0
- tests/test_agent_integration.py +277 -0
- tests/test_api_key_handling.py +44 -0
- tests/test_change_tracking.py +98 -0
- tests/test_timeout_conversion.py +117 -0
@@ -0,0 +1,499 @@
+"""
+Batch scraping functionality for Firecrawl v2 API.
+"""
+
+import time
+from typing import Optional, List, Callable, Dict, Any, Union
+from ..types import (
+    BatchScrapeRequest,
+    BatchScrapeResponse,
+    BatchScrapeJob,
+    ScrapeOptions,
+    Document,
+    WebhookConfig,
+    PaginationConfig,
+)
+from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
+from ..utils.normalize import normalize_document_input
+from ..types import CrawlErrorsResponse
+
+
+def start_batch_scrape(
+    client: HttpClient,
+    urls: List[str],
+    *,
+    options: Optional[ScrapeOptions] = None,
+    webhook: Optional[Union[str, WebhookConfig]] = None,
+    append_to_id: Optional[str] = None,
+    ignore_invalid_urls: Optional[bool] = None,
+    max_concurrency: Optional[int] = None,
+    zero_data_retention: Optional[bool] = None,
+    integration: Optional[str] = None,
+    idempotency_key: Optional[str] = None,
+) -> BatchScrapeResponse:
+    """
+    Start a batch scrape job for multiple URLs.
+
+    Args:
+        client: HTTP client instance
+        urls: List of URLs to scrape
+        options: Scraping options
+
+    Returns:
+        BatchScrapeResponse containing job information
+
+    Raises:
+        FirecrawlError: If the batch scrape operation fails to start
+    """
+    # Prepare request data
+    request_data = prepare_batch_scrape_request(
+        urls,
+        options=options,
+        webhook=webhook,
+        append_to_id=append_to_id,
+        ignore_invalid_urls=ignore_invalid_urls,
+        max_concurrency=max_concurrency,
+        zero_data_retention=zero_data_retention,
+        integration=integration,
+    )
+
+    # Make the API request
+    headers = client._prepare_headers(idempotency_key)  # type: ignore[attr-defined]
+    response = client.post("/v2/batch/scrape", request_data, headers=headers)
+
+    # Handle errors
+    if not response.ok:
+        handle_response_error(response, "start batch scrape")
+
+    # Parse response
+    body = response.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error occurred"))
+    return BatchScrapeResponse(
+        id=body.get("id"),
+        url=body.get("url"),
+        invalid_urls=body.get("invalidURLs") or None,
+    )
+
+
+def get_batch_scrape_status(
+    client: HttpClient,
+    job_id: str,
+    pagination_config: Optional[PaginationConfig] = None
+) -> BatchScrapeJob:
+    """
+    Get the status of a batch scrape job.
+
+    Args:
+        client: HTTP client instance
+        job_id: ID of the batch scrape job
+        pagination_config: Optional configuration for pagination behavior
+
+    Returns:
+        BatchScrapeJob containing job status and data
+
+    Raises:
+        FirecrawlError: If the status check fails
+    """
+    # Make the API request
+    response = client.get(f"/v2/batch/scrape/{job_id}")
+
+    # Handle errors
+    if not response.ok:
+        handle_response_error(response, "get batch scrape status")
+
+    # Parse response
+    body = response.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error occurred"))
+
+    # Convert documents
+    documents: List[Document] = []
+    for doc in body.get("data", []) or []:
+        if isinstance(doc, dict):
+            normalized = normalize_document_input(doc)
+            documents.append(Document(**normalized))
+
+    # Handle pagination if requested
+    auto_paginate = pagination_config.auto_paginate if pagination_config else True
+    if auto_paginate and body.get("next"):
+        documents = _fetch_all_batch_pages(
+            client,
+            body.get("next"),
+            documents,
+            pagination_config
+        )
+
+    return BatchScrapeJob(
+        status=body.get("status"),
+        completed=body.get("completed", 0),
+        total=body.get("total", 0),
+        credits_used=body.get("creditsUsed"),
+        expires_at=body.get("expiresAt"),
+        next=body.get("next") if not auto_paginate else None,
+        data=documents,
+    )
+
+
+def _fetch_all_batch_pages(
+    client: HttpClient,
+    next_url: str,
+    initial_documents: List[Document],
+    pagination_config: Optional[PaginationConfig] = None
+) -> List[Document]:
+    """
+    Fetch all pages of batch scrape results.
+
+    Args:
+        client: HTTP client instance
+        next_url: URL for the next page
+        initial_documents: Documents from the first page
+        pagination_config: Optional configuration for pagination limits
+
+    Returns:
+        List of all documents from all pages
+    """
+    documents = initial_documents.copy()
+    current_url = next_url
+    page_count = 0
+
+    # Apply pagination limits
+    max_pages = pagination_config.max_pages if pagination_config else None
+    max_results = pagination_config.max_results if pagination_config else None
+    max_wait_time = pagination_config.max_wait_time if pagination_config else None
+
+    start_time = time.monotonic()
+
+    while current_url:
+        # Check pagination limits (treat 0 as a valid limit)
+        if (max_pages is not None) and page_count >= max_pages:
+            break
+
+        if (max_wait_time is not None) and (time.monotonic() - start_time) > max_wait_time:
+            break
+
+        # Fetch next page
+        response = client.get(current_url)
+
+        if not response.ok:
+            # Log error but continue with what we have
+            import logging
+            logger = logging.getLogger("firecrawl")
+            logger.warning("Failed to fetch next page", extra={"status_code": response.status_code})
+            break
+
+        page_data = response.json()
+
+        if not page_data.get("success"):
+            break
+
+        # Add documents from this page
+        for doc in page_data.get("data", []) or []:
+            if isinstance(doc, dict):
+                # Check max_results limit
+                if max_results is not None and len(documents) >= max_results:
+                    break
+                normalized = normalize_document_input(doc)
+                documents.append(Document(**normalized))
+
+        # Check if we hit max_results limit after adding all docs from this page
+        if max_results is not None and len(documents) >= max_results:
+            break
+
+        # Get next URL
+        current_url = page_data.get("next")
+        page_count += 1
+
+    return documents
+
+
+def cancel_batch_scrape(
+    client: HttpClient,
+    job_id: str
+) -> bool:
+    """
+    Cancel a running batch scrape job.
+
+    Args:
+        client: HTTP client instance
+        job_id: ID of the batch scrape job to cancel
+
+    Returns:
+        True if the job was cancelled, False otherwise
+
+    Raises:
+        FirecrawlError: If the cancellation fails
+    """
+    # Make the API request
+    response = client.delete(f"/v2/batch/scrape/{job_id}")
+
+    # Handle errors
+    if not response.ok:
+        handle_response_error(response, "cancel batch scrape")
+
+    # Parse response
+    body = response.json()
+    return body.get("status") == "cancelled"
+
+
+def wait_for_batch_completion(
+    client: HttpClient,
+    job_id: str,
+    poll_interval: int = 2,
+    timeout: Optional[int] = None
+) -> BatchScrapeJob:
+    """
+    Wait for a batch scrape job to complete, polling for status updates.
+
+    Args:
+        client: HTTP client instance
+        job_id: ID of the batch scrape job
+        poll_interval: Seconds between status checks
+        timeout: Maximum seconds to wait (None for no timeout)
+
+    Returns:
+        BatchScrapeJob when the job reaches a terminal state
+
+    Raises:
+        FirecrawlError: If a status check fails
+        TimeoutError: If timeout is reached
+    """
+    start_time = time.monotonic()
+
+    while True:
+        status_job = get_batch_scrape_status(client, job_id)
+
+        # Check if job is complete
+        if status_job.status in ["completed", "failed", "cancelled"]:
+            return status_job
+
+        # Check timeout
+        if timeout and (time.monotonic() - start_time) > timeout:
+            raise TimeoutError(f"Batch scrape job {job_id} did not complete within {timeout} seconds")
+
+        # Wait before next poll
+        time.sleep(poll_interval)
+
+
+def batch_scrape(
+    client: HttpClient,
+    urls: List[str],
+    *,
+    options: Optional[ScrapeOptions] = None,
+    webhook: Optional[Union[str, WebhookConfig]] = None,
+    append_to_id: Optional[str] = None,
+    ignore_invalid_urls: Optional[bool] = None,
+    max_concurrency: Optional[int] = None,
+    zero_data_retention: Optional[bool] = None,
+    integration: Optional[str] = None,
+    idempotency_key: Optional[str] = None,
+    poll_interval: int = 2,
+    timeout: Optional[int] = None
+) -> BatchScrapeJob:
+    """
+    Start a batch scrape job and wait for it to complete.
+
+    Args:
+        client: HTTP client instance
+        urls: List of URLs to scrape
+        options: Scraping options
+        poll_interval: Seconds between status checks
+        timeout: Maximum seconds to wait (None for no timeout)
+
+    Returns:
+        BatchScrapeJob when the job completes
+
+    Raises:
+        FirecrawlError: If the batch scrape fails to start or complete
+        TimeoutError: If timeout is reached
+    """
+    # Start the batch scrape
+    start = start_batch_scrape(
+        client,
+        urls,
+        options=options,
+        webhook=webhook,
+        append_to_id=append_to_id,
+        ignore_invalid_urls=ignore_invalid_urls,
+        max_concurrency=max_concurrency,
+        zero_data_retention=zero_data_retention,
+        integration=integration,
+        idempotency_key=idempotency_key,
+    )
+
+    job_id = start.id
+
+    # Wait for completion
+    return wait_for_batch_completion(
+        client, job_id, poll_interval, timeout
+    )
+
+
+def validate_batch_urls(urls: List[str]) -> List[str]:
+    """
+    Validate and normalize a list of URLs for batch scraping.
+
+    Args:
+        urls: List of URLs to validate
+
+    Returns:
+        Validated list of URLs
+
+    Raises:
+        ValueError: If URLs are invalid
+    """
+    if not urls:
+        raise ValueError("URLs list cannot be empty")
+
+    validated_urls = []
+    for url in urls:
+        if not url or not isinstance(url, str):
+            raise ValueError(f"Invalid URL: {url}")
+
+        # Basic URL validation
+        if not (url.startswith("http://") or url.startswith("https://")):
+            raise ValueError(f"URL must start with http:// or https://: {url}")
+
+        validated_urls.append(url.strip())
+
+    return validated_urls
+
+
+def prepare_batch_scrape_request(
+    urls: List[str],
+    *,
+    options: Optional[ScrapeOptions] = None,
+    webhook: Optional[Union[str, WebhookConfig]] = None,
+    append_to_id: Optional[str] = None,
+    ignore_invalid_urls: Optional[bool] = None,
+    max_concurrency: Optional[int] = None,
+    zero_data_retention: Optional[bool] = None,
+    integration: Optional[str] = None,
+) -> dict:
+    """
+    Prepare a batch scrape request payload.
+
+    Args:
+        urls: List of URLs to scrape
+        options: Scraping options
+
+    Returns:
+        Request payload dictionary
+    """
+    validated_urls = validate_batch_urls(urls)
+    request_data: Dict[str, Any] = {"urls": validated_urls}
+
+    # Flatten scrape options at the top level (v2 behavior)
+    if options:
+        scrape_data = prepare_scrape_options(options)
+        if scrape_data:
+            request_data.update(scrape_data)
+
+    # Batch-specific fields
+    if webhook is not None:
+        if isinstance(webhook, str):
+            request_data["webhook"] = webhook
+        else:
+            request_data["webhook"] = webhook.model_dump(exclude_none=True)
+    if append_to_id is not None:
+        request_data["appendToId"] = append_to_id
+    if ignore_invalid_urls is not None:
+        request_data["ignoreInvalidURLs"] = ignore_invalid_urls
+    if max_concurrency is not None:
+        request_data["maxConcurrency"] = max_concurrency
+    if zero_data_retention is not None:
+        request_data["zeroDataRetention"] = zero_data_retention
+    if integration is not None:
+        request_data["integration"] = str(integration).strip()
+
+    return request_data
+
+
+def chunk_urls(urls: List[str], chunk_size: int = 100) -> List[List[str]]:
+    """
+    Split a large list of URLs into smaller chunks for batch processing.
+
+    Args:
+        urls: List of URLs to chunk
+        chunk_size: Maximum size of each chunk
+
+    Returns:
+        List of URL chunks
+    """
+    chunks = []
+    for i in range(0, len(urls), chunk_size):
+        chunks.append(urls[i:i + chunk_size])
+    return chunks
+
+
+def process_large_batch(
+    client: HttpClient,
+    urls: List[str],
+    options: Optional[ScrapeOptions] = None,
+    chunk_size: int = 100,
+    poll_interval: int = 2,
+    timeout: Optional[int] = None
+) -> List[Document]:
+    """
+    Process a large batch of URLs by splitting into smaller chunks.
+
+    Args:
+        client: HTTP client instance
+        urls: List of URLs to scrape
+        options: Scraping options
+        chunk_size: Size of each batch chunk
+        poll_interval: Seconds between status checks
+        timeout: Maximum seconds to wait per chunk
+
+    Returns:
+        List of all scraped documents
+
+    Raises:
+        FirecrawlError: If any chunk fails
+    """
+    url_chunks = chunk_urls(urls, chunk_size)
+    all_documents = []
+    completed_chunks = 0
+
+    for chunk in url_chunks:
+        # Process this chunk
+        result = batch_scrape(
+            client,
+            chunk,
+            options=options,
+            poll_interval=poll_interval,
+            timeout=timeout,
+        )
+
+        # Add documents from this chunk
+        if result.data:
+            all_documents.extend(result.data)
+
+        completed_chunks += 1
+
+    return all_documents
+
+
+def get_batch_scrape_errors(client: HttpClient, job_id: str) -> CrawlErrorsResponse:
+    """
+    Get errors for a batch scrape job.
+
+    Args:
+        client: HTTP client instance
+        job_id: ID of the batch scrape job
+
+    Returns:
+        CrawlErrorsResponse with errors and robots-blocked URLs
+    """
+    response = client.get(f"/v2/batch/scrape/{job_id}/errors")
+
+    if not response.ok:
+        handle_response_error(response, "get batch scrape errors")
+
+    body = response.json()
+    payload = body.get("data", body)
+    normalized = {
+        "errors": payload.get("errors", []),
+        "robots_blocked": payload.get("robotsBlocked", payload.get("robots_blocked", [])),
+    }
+    return CrawlErrorsResponse(**normalized)
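For orientation, a minimal usage sketch of the module above (not part of the package diff): it calls only functions defined in this file, plus HttpClient from firecrawl/v2/utils. HttpClient's constructor lives in firecrawl/v2/utils/http_client.py, which is not shown here, so the api_key/api_url keyword arguments below are assumptions rather than the documented signature.

# Hypothetical usage sketch; HttpClient's constructor arguments are assumed.
from firecrawl.v2.utils import HttpClient
from firecrawl.v2.methods.batch import (
    start_batch_scrape,
    wait_for_batch_completion,
    get_batch_scrape_errors,
)

client = HttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")  # assumed kwargs

# Start the job, then poll until it reaches a terminal state.
started = start_batch_scrape(client, ["https://example.com", "https://docs.firecrawl.dev"])
job = wait_for_batch_completion(client, started.id, poll_interval=2, timeout=300)
print(f"{job.status}: {job.completed}/{job.total} completed, {len(job.data or [])} documents")

# Per-URL failures and robots.txt blocks are reported by a separate endpoint.
errors = get_batch_scrape_errors(client, started.id)
print(errors.errors, errors.robots_blocked)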