firecrawl-py 3.2.1__py3-none-any.whl → 3.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of firecrawl-py might be problematic.

Files changed (85)
  1. build/lib/firecrawl/__init__.py +87 -0
  2. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
  3. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +188 -0
  4. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
  5. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
  6. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
  7. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +248 -0
  8. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
  9. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  10. build/lib/firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  11. build/lib/firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  12. build/lib/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
  13. build/lib/firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
  14. build/lib/firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
  15. build/lib/firecrawl/__tests__/e2e/v2/test_map.py +60 -0
  16. build/lib/firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
  17. build/lib/firecrawl/__tests__/e2e/v2/test_search.py +269 -0
  18. build/lib/firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  19. build/lib/firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  20. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  21. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
  22. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  23. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
  24. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  25. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
  26. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  27. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  28. build/lib/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  29. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  30. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  31. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  32. build/lib/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
  33. build/lib/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
  34. build/lib/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
  35. build/lib/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
  36. build/lib/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  37. build/lib/firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  38. build/lib/firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
  39. build/lib/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  40. build/lib/firecrawl/client.py +242 -0
  41. build/lib/firecrawl/firecrawl.backup.py +4635 -0
  42. build/lib/firecrawl/types.py +161 -0
  43. build/lib/firecrawl/v1/__init__.py +14 -0
  44. build/lib/firecrawl/v1/client.py +4653 -0
  45. build/lib/firecrawl/v2/__init__.py +4 -0
  46. build/lib/firecrawl/v2/client.py +802 -0
  47. build/lib/firecrawl/v2/client_async.py +250 -0
  48. build/lib/firecrawl/v2/methods/aio/__init__.py +1 -0
  49. build/lib/firecrawl/v2/methods/aio/batch.py +85 -0
  50. build/lib/firecrawl/v2/methods/aio/crawl.py +171 -0
  51. build/lib/firecrawl/v2/methods/aio/extract.py +126 -0
  52. build/lib/firecrawl/v2/methods/aio/map.py +59 -0
  53. build/lib/firecrawl/v2/methods/aio/scrape.py +33 -0
  54. build/lib/firecrawl/v2/methods/aio/search.py +172 -0
  55. build/lib/firecrawl/v2/methods/aio/usage.py +42 -0
  56. build/lib/firecrawl/v2/methods/batch.py +417 -0
  57. build/lib/firecrawl/v2/methods/crawl.py +469 -0
  58. build/lib/firecrawl/v2/methods/extract.py +131 -0
  59. build/lib/firecrawl/v2/methods/map.py +77 -0
  60. build/lib/firecrawl/v2/methods/scrape.py +64 -0
  61. build/lib/firecrawl/v2/methods/search.py +197 -0
  62. build/lib/firecrawl/v2/methods/usage.py +41 -0
  63. build/lib/firecrawl/v2/types.py +665 -0
  64. build/lib/firecrawl/v2/utils/__init__.py +9 -0
  65. build/lib/firecrawl/v2/utils/error_handler.py +107 -0
  66. build/lib/firecrawl/v2/utils/get_version.py +15 -0
  67. build/lib/firecrawl/v2/utils/http_client.py +153 -0
  68. build/lib/firecrawl/v2/utils/http_client_async.py +65 -0
  69. build/lib/firecrawl/v2/utils/normalize.py +107 -0
  70. build/lib/firecrawl/v2/utils/validation.py +324 -0
  71. build/lib/firecrawl/v2/watcher.py +301 -0
  72. build/lib/firecrawl/v2/watcher_async.py +242 -0
  73. build/lib/tests/test_change_tracking.py +98 -0
  74. build/lib/tests/test_timeout_conversion.py +117 -0
  75. firecrawl/__init__.py +1 -1
  76. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +2 -2
  77. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +6 -6
  78. firecrawl/v2/methods/search.py +11 -0
  79. firecrawl/v2/types.py +30 -1
  80. {firecrawl_py-3.2.1.dist-info/licenses → firecrawl_py-3.3.0.dist-info}/LICENSE +0 -0
  81. {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.0.dist-info}/METADATA +3 -7
  82. firecrawl_py-3.3.0.dist-info/RECORD +153 -0
  83. {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.0.dist-info}/WHEEL +1 -1
  84. {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.0.dist-info}/top_level.txt +2 -0
  85. firecrawl_py-3.2.1.dist-info/RECORD +0 -79
@@ -0,0 +1,417 @@
+"""
+Batch scraping functionality for Firecrawl v2 API.
+"""
+
+import time
+from typing import Optional, List, Callable, Dict, Any, Union
+from ..types import (
+    BatchScrapeRequest,
+    BatchScrapeResponse,
+    BatchScrapeJob,
+    ScrapeOptions,
+    Document,
+    WebhookConfig,
+)
+from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
+from ..utils.normalize import normalize_document_input
+from ..types import CrawlErrorsResponse
+
+
+def start_batch_scrape(
+    client: HttpClient,
+    urls: List[str],
+    *,
+    options: Optional[ScrapeOptions] = None,
+    webhook: Optional[Union[str, WebhookConfig]] = None,
+    append_to_id: Optional[str] = None,
+    ignore_invalid_urls: Optional[bool] = None,
+    max_concurrency: Optional[int] = None,
+    zero_data_retention: Optional[bool] = None,
+    integration: Optional[str] = None,
+    idempotency_key: Optional[str] = None,
+) -> BatchScrapeResponse:
+    """
+    Start a batch scrape job for multiple URLs.
+
+    Args:
+        client: HTTP client instance
+        urls: List of URLs to scrape
+        options: Scraping options
+
+    Returns:
+        BatchScrapeResponse containing job information
+
+    Raises:
+        FirecrawlError: If the batch scrape operation fails to start
+    """
+    # Prepare request data
+    request_data = prepare_batch_scrape_request(
+        urls,
+        options=options,
+        webhook=webhook,
+        append_to_id=append_to_id,
+        ignore_invalid_urls=ignore_invalid_urls,
+        max_concurrency=max_concurrency,
+        zero_data_retention=zero_data_retention,
+        integration=integration,
+    )
+
+    # Make the API request
+    headers = client._prepare_headers(idempotency_key)  # type: ignore[attr-defined]
+    response = client.post("/v2/batch/scrape", request_data, headers=headers)
+
+    # Handle errors
+    if not response.ok:
+        handle_response_error(response, "start batch scrape")
+
+    # Parse response
+    body = response.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error occurred"))
+    return BatchScrapeResponse(
+        id=body.get("id"),
+        url=body.get("url"),
+        invalid_urls=body.get("invalidURLs") or None,
+    )
+
+
+def get_batch_scrape_status(
+    client: HttpClient,
+    job_id: str
+) -> BatchScrapeJob:
+    """
+    Get the status of a batch scrape job.
+
+    Args:
+        client: HTTP client instance
+        job_id: ID of the batch scrape job
+
+    Returns:
+        BatchScrapeJob containing job status and data
+
+    Raises:
+        FirecrawlError: If the status check fails
+    """
+    # Make the API request
+    response = client.get(f"/v2/batch/scrape/{job_id}")
+
+    # Handle errors
+    if not response.ok:
+        handle_response_error(response, "get batch scrape status")
+
+    # Parse response
+    body = response.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error occurred"))
+
+    # Convert documents
+    documents: List[Document] = []
+    for doc in body.get("data", []) or []:
+        if isinstance(doc, dict):
+            normalized = normalize_document_input(doc)
+            documents.append(Document(**normalized))
+
+    return BatchScrapeJob(
+        status=body.get("status"),
+        completed=body.get("completed", 0),
+        total=body.get("total", 0),
+        credits_used=body.get("creditsUsed"),
+        expires_at=body.get("expiresAt"),
+        next=body.get("next"),
+        data=documents,
+    )
+
+
+def cancel_batch_scrape(
+    client: HttpClient,
+    job_id: str
+) -> bool:
+    """
+    Cancel a running batch scrape job.
+
+    Args:
+        client: HTTP client instance
+        job_id: ID of the batch scrape job to cancel
+
+    Returns:
+        True if the job was cancelled successfully
+
+    Raises:
+        FirecrawlError: If the cancellation fails
+    """
+    # Make the API request
+    response = client.delete(f"/v2/batch/scrape/{job_id}")
+
+    # Handle errors
+    if not response.ok:
+        handle_response_error(response, "cancel batch scrape")
+
+    # Parse response
+    body = response.json()
+    return body.get("status") == "cancelled"
+
+
+def wait_for_batch_completion(
+    client: HttpClient,
+    job_id: str,
+    poll_interval: int = 2,
+    timeout: Optional[int] = None
+) -> BatchScrapeJob:
+    """
+    Wait for a batch scrape job to complete, polling for status updates.
+
+    Args:
+        client: HTTP client instance
+        job_id: ID of the batch scrape job
+        poll_interval: Seconds between status checks
+        timeout: Maximum seconds to wait (None for no timeout)
+
+    Returns:
+        BatchScrapeJob when the job completes
+
+    Raises:
+        FirecrawlError: If a status check fails
+        TimeoutError: If timeout is reached
+    """
+    start_time = time.time()
+
+    while True:
+        status_job = get_batch_scrape_status(client, job_id)
+
+        # Check if job is complete
+        if status_job.status in ["completed", "failed", "cancelled"]:
+            return status_job
+
+        # Check timeout
+        if timeout and (time.time() - start_time) > timeout:
+            raise TimeoutError(f"Batch scrape job {job_id} did not complete within {timeout} seconds")
+
+        # Wait before next poll
+        time.sleep(poll_interval)
+
+
+def batch_scrape(
+    client: HttpClient,
+    urls: List[str],
+    *,
+    options: Optional[ScrapeOptions] = None,
+    webhook: Optional[Union[str, WebhookConfig]] = None,
+    append_to_id: Optional[str] = None,
+    ignore_invalid_urls: Optional[bool] = None,
+    max_concurrency: Optional[int] = None,
+    zero_data_retention: Optional[bool] = None,
+    integration: Optional[str] = None,
+    idempotency_key: Optional[str] = None,
+    poll_interval: int = 2,
+    timeout: Optional[int] = None
+) -> BatchScrapeJob:
+    """
+    Start a batch scrape job and wait for it to complete.
+
+    Args:
+        client: HTTP client instance
+        urls: List of URLs to scrape
+        options: Scraping options
+        poll_interval: Seconds between status checks
+        timeout: Maximum seconds to wait (None for no timeout)
+
+    Returns:
+        BatchScrapeJob when the job completes
+
+    Raises:
+        FirecrawlError: If the batch scrape fails to start or complete
+        TimeoutError: If timeout is reached
+    """
+    # Start the batch scrape
+    start = start_batch_scrape(
+        client,
+        urls,
+        options=options,
+        webhook=webhook,
+        append_to_id=append_to_id,
+        ignore_invalid_urls=ignore_invalid_urls,
+        max_concurrency=max_concurrency,
+        zero_data_retention=zero_data_retention,
+        integration=integration,
+        idempotency_key=idempotency_key,
+    )
+
+    job_id = start.id
+
+    # Wait for completion
+    return wait_for_batch_completion(
+        client, job_id, poll_interval, timeout
+    )
+
+
+def validate_batch_urls(urls: List[str]) -> List[str]:
+    """
+    Validate and normalize a list of URLs for batch scraping.
+
+    Args:
+        urls: List of URLs to validate
+
+    Returns:
+        Validated list of URLs
+
+    Raises:
+        ValueError: If URLs are invalid
+    """
+    if not urls:
+        raise ValueError("URLs list cannot be empty")
+
+    if len(urls) > 1000:  # Assuming API limit
+        raise ValueError("Too many URLs (maximum 1000)")
+
+    validated_urls = []
+    for url in urls:
+        if not url or not isinstance(url, str):
+            raise ValueError(f"Invalid URL: {url}")
+
+        # Basic URL validation
+        if not (url.startswith("http://") or url.startswith("https://")):
+            raise ValueError(f"URL must start with http:// or https://: {url}")
+
+        validated_urls.append(url.strip())
+
+    return validated_urls
+
+
+def prepare_batch_scrape_request(
+    urls: List[str],
+    *,
+    options: Optional[ScrapeOptions] = None,
+    webhook: Optional[Union[str, WebhookConfig]] = None,
+    append_to_id: Optional[str] = None,
+    ignore_invalid_urls: Optional[bool] = None,
+    max_concurrency: Optional[int] = None,
+    zero_data_retention: Optional[bool] = None,
+    integration: Optional[str] = None,
+) -> dict:
+    """
+    Prepare a batch scrape request payload.
+
+    Args:
+        urls: List of URLs to scrape
+        options: Scraping options
+
+    Returns:
+        Request payload dictionary
+    """
+    validated_urls = validate_batch_urls(urls)
+    request_data: Dict[str, Any] = {"urls": validated_urls}
+
+    # Flatten scrape options at the top level (v2 behavior)
+    if options:
+        scrape_data = prepare_scrape_options(options)
+        if scrape_data:
+            request_data.update(scrape_data)
+
+    # Batch-specific fields
+    if webhook is not None:
+        if isinstance(webhook, str):
+            request_data["webhook"] = webhook
+        else:
+            request_data["webhook"] = webhook.model_dump(exclude_none=True)
+    if append_to_id is not None:
+        request_data["appendToId"] = append_to_id
+    if ignore_invalid_urls is not None:
+        request_data["ignoreInvalidURLs"] = ignore_invalid_urls
+    if max_concurrency is not None:
+        request_data["maxConcurrency"] = max_concurrency
+    if zero_data_retention is not None:
+        request_data["zeroDataRetention"] = zero_data_retention
+    if integration is not None:
+        request_data["integration"] = integration
+
+    return request_data
+
+
+def chunk_urls(urls: List[str], chunk_size: int = 100) -> List[List[str]]:
+    """
+    Split a large list of URLs into smaller chunks for batch processing.
+
+    Args:
+        urls: List of URLs to chunk
+        chunk_size: Maximum size of each chunk
+
+    Returns:
+        List of URL chunks
+    """
+    chunks = []
+    for i in range(0, len(urls), chunk_size):
+        chunks.append(urls[i:i + chunk_size])
+    return chunks
+
+
+def process_large_batch(
+    client: HttpClient,
+    urls: List[str],
+    options: Optional[ScrapeOptions] = None,
+    chunk_size: int = 100,
+    poll_interval: int = 2,
+    timeout: Optional[int] = None
+) -> List[Document]:
+    """
+    Process a large batch of URLs by splitting into smaller chunks.
+
+    Args:
+        client: HTTP client instance
+        urls: List of URLs to scrape
+        options: Scraping options
+        chunk_size: Size of each batch chunk
+        poll_interval: Seconds between status checks
+        timeout: Maximum seconds to wait per chunk
+
+    Returns:
+        List of all scraped documents
+
+    Raises:
+        FirecrawlError: If any chunk fails
+    """
+    url_chunks = chunk_urls(urls, chunk_size)
+    all_documents = []
+    completed_chunks = 0
+
+    for chunk in url_chunks:
+        # Process this chunk
+        result = batch_scrape(
+            client,
+            chunk,
+            options=options,
+            poll_interval=poll_interval,
+            timeout=timeout,
+        )
+
+        # Add documents from this chunk
+        if result.data:
+            all_documents.extend(result.data)
+
+        completed_chunks += 1
+
+    return all_documents
+
+
+def get_batch_scrape_errors(client: HttpClient, job_id: str) -> CrawlErrorsResponse:
+    """
+    Get errors for a batch scrape job.
+
+    Args:
+        client: HTTP client instance
+        job_id: ID of the batch scrape job
+
+    Returns:
+        CrawlErrorsResponse with errors and robots-blocked URLs
+    """
+    response = client.get(f"/v2/batch/scrape/{job_id}/errors")
+
+    if not response.ok:
+        handle_response_error(response, "get batch scrape errors")
+
+    body = response.json()
+    payload = body.get("data", body)
+    normalized = {
+        "errors": payload.get("errors", []),
+        "robots_blocked": payload.get("robotsBlocked", payload.get("robots_blocked", [])),
+    }
+    return CrawlErrorsResponse(**normalized)
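
For orientation, below is a minimal usage sketch of the new batch module shown in the diff above (build/lib/firecrawl/v2/methods/batch.py, per entry 56 in the file list). It is not part of the release: the import paths are inferred from the package layout, the HttpClient constructor arguments (api_key, api_url) are assumptions for illustration, and in normal use the SDK's client classes build the HTTP client for you. Only functions and fields that appear in the diff are exercised.

# Usage sketch only; not taken from the published diff.
from firecrawl.v2.utils import HttpClient
from firecrawl.v2.methods import batch

# Hypothetical construction: the real constructor signature is not shown in this diff.
client = HttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")

# One-call form: start the job, then poll until it reaches a terminal state.
job = batch.batch_scrape(
    client,
    ["https://example.com", "https://docs.firecrawl.dev"],
    poll_interval=2,   # seconds between status checks
    timeout=120,       # raise TimeoutError after two minutes
)
print(job.status, f"{job.completed}/{job.total}", len(job.data or []))

# Manual lifecycle: start, inspect status and errors, optionally cancel.
started = batch.start_batch_scrape(client, ["https://example.com"], max_concurrency=5)
status = batch.get_batch_scrape_status(client, started.id)
errors = batch.get_batch_scrape_errors(client, started.id)
if status.status not in ("completed", "failed", "cancelled"):
    batch.cancel_batch_scrape(client, started.id)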