firecrawl-4.12.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. firecrawl/__init__.py +87 -0
  2. firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
  10. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  11. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  13. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
  14. firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
  15. firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
  16. firecrawl/__tests__/e2e/v2/test_map.py +61 -0
  17. firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
  18. firecrawl/__tests__/e2e/v2/test_search.py +270 -0
  19. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  20. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  21. firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
  28. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  29. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  30. firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
  31. firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
  32. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  33. firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
  34. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  35. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  36. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  37. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
  38. firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
  39. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
  40. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
  41. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
  42. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  43. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  44. firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
  45. firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
  46. firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
  47. firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
  48. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  49. firecrawl/client.py +281 -0
  50. firecrawl/firecrawl.backup.py +4635 -0
  51. firecrawl/types.py +167 -0
  52. firecrawl/v1/__init__.py +14 -0
  53. firecrawl/v1/client.py +5164 -0
  54. firecrawl/v2/__init__.py +4 -0
  55. firecrawl/v2/client.py +967 -0
  56. firecrawl/v2/client_async.py +408 -0
  57. firecrawl/v2/methods/agent.py +144 -0
  58. firecrawl/v2/methods/aio/__init__.py +1 -0
  59. firecrawl/v2/methods/aio/agent.py +137 -0
  60. firecrawl/v2/methods/aio/batch.py +188 -0
  61. firecrawl/v2/methods/aio/crawl.py +351 -0
  62. firecrawl/v2/methods/aio/extract.py +133 -0
  63. firecrawl/v2/methods/aio/map.py +65 -0
  64. firecrawl/v2/methods/aio/scrape.py +33 -0
  65. firecrawl/v2/methods/aio/search.py +176 -0
  66. firecrawl/v2/methods/aio/usage.py +89 -0
  67. firecrawl/v2/methods/batch.py +499 -0
  68. firecrawl/v2/methods/crawl.py +592 -0
  69. firecrawl/v2/methods/extract.py +161 -0
  70. firecrawl/v2/methods/map.py +83 -0
  71. firecrawl/v2/methods/scrape.py +64 -0
  72. firecrawl/v2/methods/search.py +215 -0
  73. firecrawl/v2/methods/usage.py +84 -0
  74. firecrawl/v2/types.py +1143 -0
  75. firecrawl/v2/utils/__init__.py +9 -0
  76. firecrawl/v2/utils/error_handler.py +107 -0
  77. firecrawl/v2/utils/get_version.py +15 -0
  78. firecrawl/v2/utils/http_client.py +178 -0
  79. firecrawl/v2/utils/http_client_async.py +69 -0
  80. firecrawl/v2/utils/normalize.py +125 -0
  81. firecrawl/v2/utils/validation.py +692 -0
  82. firecrawl/v2/watcher.py +301 -0
  83. firecrawl/v2/watcher_async.py +243 -0
  84. firecrawl-4.12.0.dist-info/METADATA +234 -0
  85. firecrawl-4.12.0.dist-info/RECORD +92 -0
  86. firecrawl-4.12.0.dist-info/WHEEL +5 -0
  87. firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
  88. firecrawl-4.12.0.dist-info/top_level.txt +2 -0
  89. tests/test_agent_integration.py +277 -0
  90. tests/test_api_key_handling.py +44 -0
  91. tests/test_change_tracking.py +98 -0
  92. tests/test_timeout_conversion.py +117 -0
firecrawl/v2/methods/batch.py (new file)
@@ -0,0 +1,499 @@
"""
Batch scraping functionality for Firecrawl v2 API.
"""

import time
from typing import Optional, List, Callable, Dict, Any, Union
from ..types import (
    BatchScrapeRequest,
    BatchScrapeResponse,
    BatchScrapeJob,
    ScrapeOptions,
    Document,
    WebhookConfig,
    PaginationConfig,
)
from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
from ..utils.normalize import normalize_document_input
from ..types import CrawlErrorsResponse


def start_batch_scrape(
    client: HttpClient,
    urls: List[str],
    *,
    options: Optional[ScrapeOptions] = None,
    webhook: Optional[Union[str, WebhookConfig]] = None,
    append_to_id: Optional[str] = None,
    ignore_invalid_urls: Optional[bool] = None,
    max_concurrency: Optional[int] = None,
    zero_data_retention: Optional[bool] = None,
    integration: Optional[str] = None,
    idempotency_key: Optional[str] = None,
) -> BatchScrapeResponse:
    """
    Start a batch scrape job for multiple URLs.

    Args:
        client: HTTP client instance
        urls: List of URLs to scrape
        options: Scraping options applied to every URL
        webhook: Webhook URL or WebhookConfig for job notifications
        append_to_id: ID of an existing batch job to append these URLs to
        ignore_invalid_urls: Whether to ignore invalid URLs instead of failing
        max_concurrency: Maximum number of concurrent scrapes
        zero_data_retention: Whether to enable zero data retention for this job
        integration: Integration identifier tag
        idempotency_key: Optional idempotency key sent with the request headers

    Returns:
        BatchScrapeResponse containing job information

    Raises:
        FirecrawlError: If the batch scrape operation fails to start
    """
    # Prepare request data
    request_data = prepare_batch_scrape_request(
        urls,
        options=options,
        webhook=webhook,
        append_to_id=append_to_id,
        ignore_invalid_urls=ignore_invalid_urls,
        max_concurrency=max_concurrency,
        zero_data_retention=zero_data_retention,
        integration=integration,
    )

    # Make the API request
    headers = client._prepare_headers(idempotency_key)  # type: ignore[attr-defined]
    response = client.post("/v2/batch/scrape", request_data, headers=headers)

    # Handle errors
    if not response.ok:
        handle_response_error(response, "start batch scrape")

    # Parse response
    body = response.json()
    if not body.get("success"):
        raise Exception(body.get("error", "Unknown error occurred"))
    return BatchScrapeResponse(
        id=body.get("id"),
        url=body.get("url"),
        invalid_urls=body.get("invalidURLs") or None,
    )

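For reference, a minimal fire-and-forget call might look like the sketch below. It assumes `client` is an already configured v2 HttpClient (API key and base URL set up elsewhere), and the webhook endpoint is a hypothetical placeholder.

from firecrawl.v2.methods.batch import start_batch_scrape

client = ...  # assumed: a configured firecrawl.v2 HttpClient (construction omitted)

job = start_batch_scrape(
    client,
    ["https://example.com", "https://docs.firecrawl.dev"],
    webhook="https://example.com/hooks/firecrawl",  # hypothetical receiver
    ignore_invalid_urls=True,
    max_concurrency=5,
)
print(job.id, job.url, job.invalid_urls)
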
def get_batch_scrape_status(
    client: HttpClient,
    job_id: str,
    pagination_config: Optional[PaginationConfig] = None
) -> BatchScrapeJob:
    """
    Get the status of a batch scrape job.

    Args:
        client: HTTP client instance
        job_id: ID of the batch scrape job
        pagination_config: Optional configuration for pagination behavior

    Returns:
        BatchScrapeJob containing job status and data

    Raises:
        FirecrawlError: If the status check fails
    """
    # Make the API request
    response = client.get(f"/v2/batch/scrape/{job_id}")

    # Handle errors
    if not response.ok:
        handle_response_error(response, "get batch scrape status")

    # Parse response
    body = response.json()
    if not body.get("success"):
        raise Exception(body.get("error", "Unknown error occurred"))

    # Convert documents
    documents: List[Document] = []
    for doc in body.get("data", []) or []:
        if isinstance(doc, dict):
            normalized = normalize_document_input(doc)
            documents.append(Document(**normalized))

    # Handle pagination if requested
    auto_paginate = pagination_config.auto_paginate if pagination_config else True
    if auto_paginate and body.get("next"):
        documents = _fetch_all_batch_pages(
            client,
            body.get("next"),
            documents,
            pagination_config
        )

    return BatchScrapeJob(
        status=body.get("status"),
        completed=body.get("completed", 0),
        total=body.get("total", 0),
        credits_used=body.get("creditsUsed"),
        expires_at=body.get("expiresAt"),
        next=body.get("next") if not auto_paginate else None,
        data=documents,
    )

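A sketch of how pagination limits might be applied when checking status. It assumes PaginationConfig (from firecrawl/v2/types.py) can be constructed by keyword; the fields used (auto_paginate, max_pages, max_results, max_wait_time) are exactly the ones read by _fetch_all_batch_pages below, and the job ID is a placeholder.

from firecrawl.v2.types import PaginationConfig
from firecrawl.v2.methods.batch import get_batch_scrape_status

# Cap auto-pagination instead of pulling every page of a large job.
limits = PaginationConfig(auto_paginate=True, max_pages=3, max_results=500, max_wait_time=30)
job = get_batch_scrape_status(client, "JOB_ID", pagination_config=limits)  # placeholder job ID
print(job.status, job.completed, job.total, len(job.data))

# With auto_paginate=False only the first page is returned and job.next holds the cursor URL.
first_page = get_batch_scrape_status(
    client, "JOB_ID", pagination_config=PaginationConfig(auto_paginate=False)
)
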
def _fetch_all_batch_pages(
    client: HttpClient,
    next_url: str,
    initial_documents: List[Document],
    pagination_config: Optional[PaginationConfig] = None
) -> List[Document]:
    """
    Fetch all pages of batch scrape results.

    Args:
        client: HTTP client instance
        next_url: URL for the next page
        initial_documents: Documents from the first page
        pagination_config: Optional configuration for pagination limits

    Returns:
        List of all documents from all pages
    """
    documents = initial_documents.copy()
    current_url = next_url
    page_count = 0

    # Apply pagination limits
    max_pages = pagination_config.max_pages if pagination_config else None
    max_results = pagination_config.max_results if pagination_config else None
    max_wait_time = pagination_config.max_wait_time if pagination_config else None

    start_time = time.monotonic()

    while current_url:
        # Check pagination limits (treat 0 as a valid limit)
        if (max_pages is not None) and page_count >= max_pages:
            break

        if (max_wait_time is not None) and (time.monotonic() - start_time) > max_wait_time:
            break

        # Fetch next page
        response = client.get(current_url)

        if not response.ok:
            # Log error but continue with what we have
            import logging
            logger = logging.getLogger("firecrawl")
            logger.warning("Failed to fetch next page", extra={"status_code": response.status_code})
            break

        page_data = response.json()

        if not page_data.get("success"):
            break

        # Add documents from this page
        for doc in page_data.get("data", []) or []:
            if isinstance(doc, dict):
                # Check max_results limit
                if max_results is not None and len(documents) >= max_results:
                    break
                normalized = normalize_document_input(doc)
                documents.append(Document(**normalized))

        # Check if we hit max_results limit after adding all docs from this page
        if max_results is not None and len(documents) >= max_results:
            break

        # Get next URL
        current_url = page_data.get("next")
        page_count += 1

    return documents


def cancel_batch_scrape(
    client: HttpClient,
    job_id: str
) -> bool:
    """
    Cancel a running batch scrape job.

    Args:
        client: HTTP client instance
        job_id: ID of the batch scrape job to cancel

    Returns:
        True if the job was cancelled, False otherwise

    Raises:
        FirecrawlError: If the cancellation fails
    """
    # Make the API request
    response = client.delete(f"/v2/batch/scrape/{job_id}")

    # Handle errors
    if not response.ok:
        handle_response_error(response, "cancel batch scrape")

    # Parse response
    body = response.json()
    return body.get("status") == "cancelled"


def wait_for_batch_completion(
    client: HttpClient,
    job_id: str,
    poll_interval: int = 2,
    timeout: Optional[int] = None
) -> BatchScrapeJob:
    """
    Wait for a batch scrape job to complete, polling for status updates.

    Args:
        client: HTTP client instance
        job_id: ID of the batch scrape job
        poll_interval: Seconds between status checks
        timeout: Maximum seconds to wait (None for no timeout)

    Returns:
        BatchScrapeJob when the job reaches a terminal state

    Raises:
        FirecrawlError: If a status check fails
        TimeoutError: If the timeout is reached before the job finishes
    """
    start_time = time.monotonic()

    while True:
        status_job = get_batch_scrape_status(client, job_id)

        # Check if job is complete
        if status_job.status in ["completed", "failed", "cancelled"]:
            return status_job

        # Check timeout
        if timeout and (time.monotonic() - start_time) > timeout:
            raise TimeoutError(f"Batch scrape job {job_id} did not complete within {timeout} seconds")

        # Wait before next poll
        time.sleep(poll_interval)

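The start/wait pair supports a manual workflow in which the caller keeps hold of the job ID. A sketch, again assuming a configured `client`, that cancels the job server-side if the deadline passes:

from firecrawl.v2.methods.batch import (
    start_batch_scrape,
    wait_for_batch_completion,
    cancel_batch_scrape,
)

started = start_batch_scrape(client, ["https://example.com/a", "https://example.com/b"])
try:
    job = wait_for_batch_completion(client, started.id, poll_interval=5, timeout=300)
    print(job.status, f"{job.completed}/{job.total}")
except TimeoutError:
    # Give up locally and ask the API to cancel the job as well.
    cancelled = cancel_batch_scrape(client, started.id)
    print("cancelled:", cancelled)
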
def batch_scrape(
    client: HttpClient,
    urls: List[str],
    *,
    options: Optional[ScrapeOptions] = None,
    webhook: Optional[Union[str, WebhookConfig]] = None,
    append_to_id: Optional[str] = None,
    ignore_invalid_urls: Optional[bool] = None,
    max_concurrency: Optional[int] = None,
    zero_data_retention: Optional[bool] = None,
    integration: Optional[str] = None,
    idempotency_key: Optional[str] = None,
    poll_interval: int = 2,
    timeout: Optional[int] = None
) -> BatchScrapeJob:
    """
    Start a batch scrape job and wait for it to complete.

    Args:
        client: HTTP client instance
        urls: List of URLs to scrape
        options: Scraping options
        poll_interval: Seconds between status checks
        timeout: Maximum seconds to wait (None for no timeout)

        Remaining keyword arguments are forwarded to start_batch_scrape.

    Returns:
        BatchScrapeJob when the job completes

    Raises:
        FirecrawlError: If the batch scrape fails to start or complete
        TimeoutError: If the timeout is reached
    """
    # Start the batch scrape
    start = start_batch_scrape(
        client,
        urls,
        options=options,
        webhook=webhook,
        append_to_id=append_to_id,
        ignore_invalid_urls=ignore_invalid_urls,
        max_concurrency=max_concurrency,
        zero_data_retention=zero_data_retention,
        integration=integration,
        idempotency_key=idempotency_key,
    )

    job_id = start.id

    # Wait for completion
    return wait_for_batch_completion(
        client, job_id, poll_interval, timeout
    )

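As a rough sketch of the blocking convenience path, assuming the same configured `client`; only fields defined on BatchScrapeJob in this module are read back:

from firecrawl.v2.methods.batch import batch_scrape

# Blocks until the job reaches a terminal state (completed/failed/cancelled).
job = batch_scrape(
    client,
    ["https://example.com", "https://docs.firecrawl.dev"],
    max_concurrency=10,
    poll_interval=3,
    timeout=600,
)
if job.status == "completed":
    print(f"scraped {len(job.data or [])} documents, credits used: {job.credits_used}")
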
def validate_batch_urls(urls: List[str]) -> List[str]:
    """
    Validate and normalize a list of URLs for batch scraping.

    Args:
        urls: List of URLs to validate

    Returns:
        Validated list of URLs

    Raises:
        ValueError: If URLs are invalid
    """
    if not urls:
        raise ValueError("URLs list cannot be empty")

    validated_urls = []
    for url in urls:
        if not url or not isinstance(url, str):
            raise ValueError(f"Invalid URL: {url}")

        # Basic URL validation
        if not (url.startswith("http://") or url.startswith("https://")):
            raise ValueError(f"URL must start with http:// or https://: {url}")

        validated_urls.append(url.strip())

    return validated_urls

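A quick sketch of the validation behavior: it strips whitespace but fails fast on the first bad entry rather than filtering it out.

from firecrawl.v2.methods.batch import validate_batch_urls

validate_batch_urls(["https://example.com "])  # -> ["https://example.com"] (trailing whitespace stripped)
try:
    validate_batch_urls(["example.com"])       # missing scheme
except ValueError as exc:
    print(exc)                                 # URL must start with http:// or https://: example.com
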
def prepare_batch_scrape_request(
    urls: List[str],
    *,
    options: Optional[ScrapeOptions] = None,
    webhook: Optional[Union[str, WebhookConfig]] = None,
    append_to_id: Optional[str] = None,
    ignore_invalid_urls: Optional[bool] = None,
    max_concurrency: Optional[int] = None,
    zero_data_retention: Optional[bool] = None,
    integration: Optional[str] = None,
) -> dict:
    """
    Prepare a batch scrape request payload.

    Args:
        urls: List of URLs to scrape
        options: Scraping options

    Returns:
        Request payload dictionary
    """
    validated_urls = validate_batch_urls(urls)
    request_data: Dict[str, Any] = {"urls": validated_urls}

    # Flatten scrape options at the top level (v2 behavior)
    if options:
        scrape_data = prepare_scrape_options(options)
        if scrape_data:
            request_data.update(scrape_data)

    # Batch-specific fields
    if webhook is not None:
        if isinstance(webhook, str):
            request_data["webhook"] = webhook
        else:
            request_data["webhook"] = webhook.model_dump(exclude_none=True)
    if append_to_id is not None:
        request_data["appendToId"] = append_to_id
    if ignore_invalid_urls is not None:
        request_data["ignoreInvalidURLs"] = ignore_invalid_urls
    if max_concurrency is not None:
        request_data["maxConcurrency"] = max_concurrency
    if zero_data_retention is not None:
        request_data["zeroDataRetention"] = zero_data_retention
    if integration is not None:
        request_data["integration"] = str(integration).strip()

    return request_data

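To make the wire format concrete, a sketch of the payload this helper produces for the batch-specific fields; scrape options, when given, are flattened into the same dict by prepare_scrape_options and are omitted here. The webhook URL is a placeholder.

from firecrawl.v2.methods.batch import prepare_batch_scrape_request

payload = prepare_batch_scrape_request(
    ["https://example.com"],
    webhook="https://example.com/hooks/firecrawl",  # hypothetical receiver
    ignore_invalid_urls=True,
    max_concurrency=5,
    integration=" my-integration ",
)
# payload == {
#     "urls": ["https://example.com"],
#     "webhook": "https://example.com/hooks/firecrawl",
#     "ignoreInvalidURLs": True,
#     "maxConcurrency": 5,
#     "integration": "my-integration",
# }
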
def chunk_urls(urls: List[str], chunk_size: int = 100) -> List[List[str]]:
    """
    Split a large list of URLs into smaller chunks for batch processing.

    Args:
        urls: List of URLs to chunk
        chunk_size: Maximum size of each chunk

    Returns:
        List of URL chunks
    """
    chunks = []
    for i in range(0, len(urls), chunk_size):
        chunks.append(urls[i:i + chunk_size])
    return chunks


def process_large_batch(
    client: HttpClient,
    urls: List[str],
    options: Optional[ScrapeOptions] = None,
    chunk_size: int = 100,
    poll_interval: int = 2,
    timeout: Optional[int] = None
) -> List[Document]:
    """
    Process a large batch of URLs by splitting into smaller chunks.

    Args:
        client: HTTP client instance
        urls: List of URLs to scrape
        options: Scraping options
        chunk_size: Size of each batch chunk
        poll_interval: Seconds between status checks
        timeout: Maximum seconds to wait per chunk

    Returns:
        List of all scraped documents

    Raises:
        FirecrawlError: If any chunk fails
    """
    url_chunks = chunk_urls(urls, chunk_size)
    all_documents = []
    completed_chunks = 0

    for chunk in url_chunks:
        # Process this chunk
        result = batch_scrape(
            client,
            chunk,
            options=options,
            poll_interval=poll_interval,
            timeout=timeout,
        )

        # Add documents from this chunk
        if result.data:
            all_documents.extend(result.data)

        completed_chunks += 1

    return all_documents

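A sketch of the chunked path for a large URL list, assuming `client` is configured and `all_urls` is an in-memory list; each chunk is submitted as its own batch job and polled to completion before the next one starts.

from firecrawl.v2.methods.batch import process_large_batch

all_urls = [f"https://example.com/page/{i}" for i in range(250)]

# 250 URLs with chunk_size=100 -> three sequential batch jobs (100 + 100 + 50 URLs).
documents = process_large_batch(
    client,
    all_urls,
    chunk_size=100,
    poll_interval=5,
    timeout=900,  # per-chunk deadline
)
print(f"collected {len(documents)} documents")
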
def get_batch_scrape_errors(client: HttpClient, job_id: str) -> CrawlErrorsResponse:
    """
    Get errors for a batch scrape job.

    Args:
        client: HTTP client instance
        job_id: ID of the batch scrape job

    Returns:
        CrawlErrorsResponse with errors and robots-blocked URLs
    """
    response = client.get(f"/v2/batch/scrape/{job_id}/errors")

    if not response.ok:
        handle_response_error(response, "get batch scrape errors")

    body = response.json()
    payload = body.get("data", body)
    normalized = {
        "errors": payload.get("errors", []),
        "robots_blocked": payload.get("robotsBlocked", payload.get("robots_blocked", [])),
    }
    return CrawlErrorsResponse(**normalized)
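Finally, a sketch of inspecting failures after a job finishes. Only the two fields normalized above (errors, robots_blocked) are assumed on CrawlErrorsResponse; the exact shape of each error entry is defined in firecrawl/v2/types.py.

from firecrawl.v2.methods.batch import (
    start_batch_scrape,
    wait_for_batch_completion,
    get_batch_scrape_errors,
)

started = start_batch_scrape(client, ["https://example.com", "https://example.com/missing"])
wait_for_batch_completion(client, started.id, timeout=600)

report = get_batch_scrape_errors(client, started.id)
print(f"{len(report.errors)} errors, {len(report.robots_blocked)} URLs blocked by robots.txt")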