firecrawl 2.16.5__py3-none-any.whl → 3.0.3__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Note: this release of firecrawl is marked as a potentially problematic release.
Files changed (82)
  1. firecrawl/__init__.py +27 -19
  2. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +183 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  10. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  11. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
  13. firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
  14. firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
  15. firecrawl/__tests__/e2e/v2/test_map.py +60 -0
  16. firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
  17. firecrawl/__tests__/e2e/v2/test_search.py +265 -0
  18. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  19. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  20. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  21. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  28. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  29. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  30. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  31. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  32. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
  33. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
  34. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
  35. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +206 -0
  36. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  37. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  38. firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
  39. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  40. firecrawl/client.py +241 -0
  41. firecrawl/{firecrawl.py → firecrawl.backup.py} +17 -15
  42. firecrawl/types.py +157 -0
  43. firecrawl/v1/__init__.py +14 -0
  44. firecrawl/v1/client.py +4653 -0
  45. firecrawl/v2/__init__.py +4 -0
  46. firecrawl/v2/client.py +802 -0
  47. firecrawl/v2/client_async.py +250 -0
  48. firecrawl/v2/methods/aio/__init__.py +1 -0
  49. firecrawl/v2/methods/aio/batch.py +85 -0
  50. firecrawl/v2/methods/aio/crawl.py +174 -0
  51. firecrawl/v2/methods/aio/extract.py +126 -0
  52. firecrawl/v2/methods/aio/map.py +59 -0
  53. firecrawl/v2/methods/aio/scrape.py +36 -0
  54. firecrawl/v2/methods/aio/search.py +58 -0
  55. firecrawl/v2/methods/aio/usage.py +42 -0
  56. firecrawl/v2/methods/batch.py +420 -0
  57. firecrawl/v2/methods/crawl.py +468 -0
  58. firecrawl/v2/methods/extract.py +131 -0
  59. firecrawl/v2/methods/map.py +77 -0
  60. firecrawl/v2/methods/scrape.py +68 -0
  61. firecrawl/v2/methods/search.py +173 -0
  62. firecrawl/v2/methods/usage.py +41 -0
  63. firecrawl/v2/types.py +546 -0
  64. firecrawl/v2/utils/__init__.py +9 -0
  65. firecrawl/v2/utils/error_handler.py +107 -0
  66. firecrawl/v2/utils/get_version.py +15 -0
  67. firecrawl/v2/utils/http_client.py +153 -0
  68. firecrawl/v2/utils/http_client_async.py +64 -0
  69. firecrawl/v2/utils/validation.py +324 -0
  70. firecrawl/v2/watcher.py +312 -0
  71. firecrawl/v2/watcher_async.py +245 -0
  72. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/LICENSE +0 -0
  73. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/METADATA +49 -32
  74. firecrawl-3.0.3.dist-info/RECORD +78 -0
  75. tests/test_timeout_conversion.py +117 -0
  76. firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  77. firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  78. firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  79. firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -465
  80. firecrawl-2.16.5.dist-info/RECORD +0 -12
  81. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/WHEEL +0 -0
  82. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/top_level.txt +0 -0
firecrawl/v2/methods/crawl.py
@@ -0,0 +1,468 @@
+"""
+Crawling functionality for Firecrawl v2 API.
+"""
+
+import time
+from typing import Optional, Dict, Any
+from ..types import (
+    CrawlRequest,
+    CrawlJob,
+    CrawlResponse, Document, CrawlParamsRequest, CrawlParamsResponse, CrawlParamsData,
+    WebhookConfig, CrawlErrorsResponse, ActiveCrawlsResponse, ActiveCrawl
+)
+from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
+
+
+def _validate_crawl_request(request: CrawlRequest) -> None:
+    """
+    Validate crawl request parameters.
+
+    Args:
+        request: CrawlRequest to validate
+
+    Raises:
+        ValueError: If request is invalid
+    """
+    if not request.url or not request.url.strip():
+        raise ValueError("URL cannot be empty")
+
+    if request.limit is not None and request.limit <= 0:
+        raise ValueError("Limit must be positive")
+
+    # Validate scrape_options (if provided)
+    if request.scrape_options is not None:
+        validate_scrape_options(request.scrape_options)
+
+
+def _prepare_crawl_request(request: CrawlRequest) -> dict:
+    """
+    Prepare crawl request for API submission.
+
+    Args:
+        request: CrawlRequest to prepare
+
+    Returns:
+        Dictionary ready for API submission
+    """
+    # Validate request
+    _validate_crawl_request(request)
+
+    # Start with basic data
+    data = {"url": request.url}
+
+    # Add prompt if present
+    if request.prompt:
+        data["prompt"] = request.prompt
+
+    # Handle scrape_options conversion first (before model_dump)
+    if request.scrape_options is not None:
+        scrape_data = prepare_scrape_options(request.scrape_options)
+        if scrape_data:
+            data["scrapeOptions"] = scrape_data
+
+    # Convert request to dict
+    request_data = request.model_dump(exclude_none=True, exclude_unset=True)
+
+    # Remove url, prompt, and scrape_options (already handled)
+    request_data.pop("url", None)
+    request_data.pop("prompt", None)
+    request_data.pop("scrape_options", None)
+
+    # Handle webhook conversion first (before model_dump)
+    if request.webhook is not None:
+        if isinstance(request.webhook, str):
+            data["webhook"] = request.webhook
+        else:
+            # Convert WebhookConfig to dict
+            data["webhook"] = request.webhook.model_dump(exclude_none=True)
+
+    # Convert other snake_case fields to camelCase
+    field_mappings = {
+        "include_paths": "includePaths",
+        "exclude_paths": "excludePaths",
+        "max_discovery_depth": "maxDiscoveryDepth",
+        "sitemap": "sitemap",
+        "ignore_query_parameters": "ignoreQueryParameters",
+        "crawl_entire_domain": "crawlEntireDomain",
+        "allow_external_links": "allowExternalLinks",
+        "allow_subdomains": "allowSubdomains",
+        "delay": "delay",
+        "max_concurrency": "maxConcurrency",
+        "zero_data_retention": "zeroDataRetention"
+    }
+
+    # Apply field mappings
+    for snake_case, camel_case in field_mappings.items():
+        if snake_case in request_data:
+            data[camel_case] = request_data.pop(snake_case)
+
+    # Add any remaining fields that don't need conversion (like limit)
+    data.update(request_data)
+
+    return data
+
+
+def start_crawl(client: HttpClient, request: CrawlRequest) -> CrawlResponse:
+    """
+    Start a crawl job for a website.
+
+    Args:
+        client: HTTP client instance
+        request: CrawlRequest containing URL and options
+
+    Returns:
+        CrawlResponse with job information
+
+    Raises:
+        ValueError: If request is invalid
+        Exception: If the crawl operation fails to start
+    """
+    request_data = _prepare_crawl_request(request)
+
+    response = client.post("/v2/crawl", request_data)
+
+    if not response.ok:
+        handle_response_error(response, "start crawl")
+
+    response_data = response.json()
+
+    if response_data.get("success"):
+        job_data = {
+            "id": response_data.get("id"),
+            "url": response_data.get("url")
+        }
+
+        return CrawlResponse(**job_data)
+    else:
+        raise Exception(response_data.get("error", "Unknown error occurred"))
+
+
+def get_crawl_status(client: HttpClient, job_id: str) -> CrawlJob:
+    """
+    Get the status of a crawl job.
+
+    Args:
+        client: HTTP client instance
+        job_id: ID of the crawl job
+
+    Returns:
+        CrawlJob with current status and data
+
+    Raises:
+        Exception: If the status check fails
+    """
+    # Make the API request
+    response = client.get(f"/v2/crawl/{job_id}")
+
+    # Handle errors
+    if not response.ok:
+        handle_response_error(response, "get crawl status")
+
+    # Parse response
+    response_data = response.json()
+
+    if response_data.get("success"):
+        # The API returns status fields at the top level, not in a data field
+
+        # Convert documents
+        documents = []
+        data_list = response_data.get("data", [])
+        for doc_data in data_list:
+            if isinstance(doc_data, str):
+                # Handle case where API returns just URLs - this shouldn't happen for crawl
+                # but we'll handle it gracefully
+                continue
+            else:
+                documents.append(Document(**doc_data))
+
+        # Create CrawlJob with current status and data
+        return CrawlJob(
+            status=response_data.get("status"),
+            completed=response_data.get("completed", 0),
+            total=response_data.get("total", 0),
+            credits_used=response_data.get("creditsUsed", 0),
+            expires_at=response_data.get("expiresAt"),
+            next=response_data.get("next", None),
+            data=documents
+        )
+    else:
+        raise Exception(response_data.get("error", "Unknown error occurred"))
+
+
+def cancel_crawl(client: HttpClient, job_id: str) -> bool:
+    """
+    Cancel a running crawl job.
+
+    Args:
+        client: HTTP client instance
+        job_id: ID of the crawl job to cancel
+
+    Returns:
+        bool: True if the crawl was cancelled, False otherwise
+
+    Raises:
+        Exception: If the cancellation fails
+    """
+    response = client.delete(f"/v2/crawl/{job_id}")
+
+    if not response.ok:
+        handle_response_error(response, "cancel crawl")
+
+    response_data = response.json()
+
+    return response_data.get("status") == "cancelled"
+
+def wait_for_crawl_completion(
+    client: HttpClient,
+    job_id: str,
+    poll_interval: int = 2,
+    timeout: Optional[int] = None
+) -> CrawlJob:
+    """
+    Wait for a crawl job to complete, polling for status updates.
+
+    Args:
+        client: HTTP client instance
+        job_id: ID of the crawl job
+        poll_interval: Seconds between status checks
+        timeout: Maximum seconds to wait (None for no timeout)
+
+    Returns:
+        CrawlJob when job completes
+
+    Raises:
+        Exception: If the job fails
+        TimeoutError: If timeout is reached
+    """
+    start_time = time.time()
+
+    while True:
+        crawl_job = get_crawl_status(client, job_id)
+
+        # Check if job is complete
+        if crawl_job.status in ["completed", "failed"]:
+            return crawl_job
+
+        # Check timeout
+        if timeout and (time.time() - start_time) > timeout:
+            raise TimeoutError(f"Crawl job {job_id} did not complete within {timeout} seconds")
+
+        # Wait before next poll
+        time.sleep(poll_interval)
+
+
+def crawl(
+    client: HttpClient,
+    request: CrawlRequest,
+    poll_interval: int = 2,
+    timeout: Optional[int] = None
+) -> CrawlJob:
+    """
+    Start a crawl job and wait for it to complete.
+
+    Args:
+        client: HTTP client instance
+        request: CrawlRequest containing URL and options
+        poll_interval: Seconds between status checks
+        timeout: Maximum seconds to wait (None for no timeout)
+
+    Returns:
+        CrawlJob when job completes
+
+    Raises:
+        ValueError: If request is invalid
+        Exception: If the crawl fails to start or complete
+        TimeoutError: If timeout is reached
+    """
+    # Start the crawl
+    crawl_job = start_crawl(client, request)
+    job_id = crawl_job.id
+
+    # Wait for completion
+    return wait_for_crawl_completion(
+        client, job_id, poll_interval, timeout
+    )
+
+
+def crawl_params_preview(client: HttpClient, request: CrawlParamsRequest) -> CrawlParamsData:
+    """
+    Get crawl parameters from LLM based on URL and prompt.
+
+    Args:
+        client: HTTP client instance
+        request: CrawlParamsRequest containing URL and prompt
+
+    Returns:
+        CrawlParamsData containing suggested crawl options
+
+    Raises:
+        ValueError: If request is invalid
+        Exception: If the operation fails
+    """
+    # Validate request
+    if not request.url or not request.url.strip():
+        raise ValueError("URL cannot be empty")
+
+    if not request.prompt or not request.prompt.strip():
+        raise ValueError("Prompt cannot be empty")
+
+    # Prepare request data
+    request_data = {
+        "url": request.url,
+        "prompt": request.prompt
+    }
+
+    # Make the API request
+    response = client.post("/v2/crawl/params-preview", request_data)
+
+    # Handle errors
+    if not response.ok:
+        handle_response_error(response, "crawl params preview")
+
+    # Parse response
+    response_data = response.json()
+
+    if response_data.get("success"):
+        params_data = response_data.get("data", {})
+
+        # Convert camelCase to snake_case for CrawlParamsData
+        converted_params = {}
+        field_mappings = {
+            "includePaths": "include_paths",
+            "excludePaths": "exclude_paths",
+            "maxDiscoveryDepth": "max_discovery_depth",
+            "sitemap": "sitemap",
+            "ignoreQueryParameters": "ignore_query_parameters",
+            "crawlEntireDomain": "crawl_entire_domain",
+            "allowExternalLinks": "allow_external_links",
+            "allowSubdomains": "allow_subdomains",
+            "maxConcurrency": "max_concurrency",
+            "scrapeOptions": "scrape_options",
+            "zeroDataRetention": "zero_data_retention"
+        }
+
+        # Handle webhook conversion
+        if "webhook" in params_data:
+            webhook_data = params_data["webhook"]
+            if isinstance(webhook_data, dict):
+                converted_params["webhook"] = WebhookConfig(**webhook_data)
+            else:
+                converted_params["webhook"] = webhook_data
+
+        for camel_case, snake_case in field_mappings.items():
+            if camel_case in params_data:
+                if camel_case == "scrapeOptions" and params_data[camel_case] is not None:
+                    # Handle nested scrapeOptions conversion
+                    scrape_opts_data = params_data[camel_case]
+                    converted_scrape_opts = {}
+                    scrape_field_mappings = {
+                        "includeTags": "include_tags",
+                        "excludeTags": "exclude_tags",
+                        "onlyMainContent": "only_main_content",
+                        "waitFor": "wait_for",
+                        "skipTlsVerification": "skip_tls_verification",
+                        "removeBase64Images": "remove_base64_images"
+                    }
+
+                    for scrape_camel, scrape_snake in scrape_field_mappings.items():
+                        if scrape_camel in scrape_opts_data:
+                            converted_scrape_opts[scrape_snake] = scrape_opts_data[scrape_camel]
+
+                    # Handle formats field - if it's a list, convert to ScrapeFormats
+                    if "formats" in scrape_opts_data:
+                        formats_data = scrape_opts_data["formats"]
+                        if isinstance(formats_data, list):
+                            # Convert list to ScrapeFormats object
+                            from ..types import ScrapeFormats
+                            converted_scrape_opts["formats"] = ScrapeFormats(formats=formats_data)
+                        else:
+                            converted_scrape_opts["formats"] = formats_data
+
+                    # Add fields that don't need conversion
+                    for key, value in scrape_opts_data.items():
+                        if key not in scrape_field_mappings and key != "formats":
+                            converted_scrape_opts[key] = value
+
+                    converted_params[snake_case] = converted_scrape_opts
+                else:
+                    converted_params[snake_case] = params_data[camel_case]
+
+        # Add fields that don't need conversion
+        for key, value in params_data.items():
+            if key not in field_mappings:
+                converted_params[key] = value
+
+        # Add warning if present
+        if "warning" in response_data:
+            converted_params["warning"] = response_data["warning"]
+
+        return CrawlParamsData(**converted_params)
+    else:
+        raise Exception(response_data.get("error", "Unknown error occurred"))
+
+
+def get_crawl_errors(http_client: HttpClient, crawl_id: str) -> CrawlErrorsResponse:
+    """
+    Get errors from a crawl job.
+
+    Args:
+        http_client: HTTP client for making requests
+        crawl_id: The ID of the crawl job
+
+    Returns:
+        CrawlErrorsResponse containing errors and robots blocked URLs
+
+    Raises:
+        Exception: If the request fails
+    """
+    response = http_client.get(f"/v2/crawl/{crawl_id}/errors")
+
+    if not response.ok:
+        handle_response_error(response, "check crawl errors")
+
+    try:
+        body = response.json()
+        payload = body.get("data", body)
+        # Manual key normalization since we avoid Pydantic aliases
+        normalized = {
+            "errors": payload.get("errors", []),
+            "robots_blocked": payload.get("robotsBlocked", payload.get("robots_blocked", [])),
+        }
+        return CrawlErrorsResponse(**normalized)
+    except Exception as e:
+        raise Exception(f"Failed to parse crawl errors response: {e}")
+
+
+def get_active_crawls(client: HttpClient) -> ActiveCrawlsResponse:
+    """
+    Get a list of currently active crawl jobs.
+
+    Args:
+        client: HTTP client instance
+
+    Returns:
+        ActiveCrawlsResponse containing a list of active crawl jobs
+
+    Raises:
+        Exception: If the request fails
+    """
+    response = client.get("/v2/crawl/active")
+
+    if not response.ok:
+        handle_response_error(response, "get active crawls")
+
+    body = response.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error occurred"))
+
+    crawls_in = body.get("crawls", [])
+    normalized_crawls = []
+    for c in crawls_in:
+        if isinstance(c, dict):
+            normalized_crawls.append({
+                "id": c.get("id"),
+                "team_id": c.get("teamId", c.get("team_id")),
+                "url": c.get("url"),
+                "options": c.get("options"),
+            })
+    return ActiveCrawlsResponse(success=True, crawls=[ActiveCrawl(**nc) for nc in normalized_crawls])
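
For orientation, a minimal sketch of how these new module-level crawl helpers might be driven directly. The HttpClient constructor arguments are an assumption (the diff only shows that the class is exported from firecrawl.v2.utils), and in normal use the top-level client added in firecrawl/client.py would presumably wire this up for you.

from firecrawl.v2.utils import HttpClient
from firecrawl.v2.types import CrawlRequest, CrawlParamsRequest
from firecrawl.v2.methods.crawl import (
    start_crawl,
    wait_for_crawl_completion,
    get_crawl_errors,
    crawl_params_preview,
)

# Assumed constructor signature; not shown in this diff.
client = HttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")

# Ask the params-preview endpoint for LLM-suggested crawl options.
params = crawl_params_preview(
    client, CrawlParamsRequest(url="https://example.com", prompt="Grab the blog posts")
)

# Kick off a crawl, then poll /v2/crawl/{id} every 2 seconds until it finishes.
started = start_crawl(client, CrawlRequest(url="https://example.com", limit=10))
job = wait_for_crawl_completion(client, started.id, poll_interval=2, timeout=300)
print(job.status, f"{job.completed}/{job.total}", job.credits_used)

# Per-URL failures and robots.txt blocks are available after the run.
errors = get_crawl_errors(client, started.id)
print(errors.robots_blocked)
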
firecrawl/v2/methods/extract.py
@@ -0,0 +1,131 @@
+from typing import Any, Dict, List, Optional
+import time
+
+from ..types import ExtractResponse, ScrapeOptions
+from ..utils.http_client import HttpClient
+from ..utils.validation import prepare_scrape_options
+from ..utils.error_handler import handle_response_error
+
+
+def _prepare_extract_request(
+    urls: Optional[List[str]],
+    *,
+    prompt: Optional[str] = None,
+    schema: Optional[Dict[str, Any]] = None,
+    system_prompt: Optional[str] = None,
+    allow_external_links: Optional[bool] = None,
+    enable_web_search: Optional[bool] = None,
+    show_sources: Optional[bool] = None,
+    scrape_options: Optional[ScrapeOptions] = None,
+    ignore_invalid_urls: Optional[bool] = None,
+) -> Dict[str, Any]:
+    body: Dict[str, Any] = {}
+    if urls is not None:
+        body["urls"] = urls
+    if prompt is not None:
+        body["prompt"] = prompt
+    if schema is not None:
+        body["schema"] = schema
+    if system_prompt is not None:
+        body["systemPrompt"] = system_prompt
+    if allow_external_links is not None:
+        body["allowExternalLinks"] = allow_external_links
+    if enable_web_search is not None:
+        body["enableWebSearch"] = enable_web_search
+    if show_sources is not None:
+        body["showSources"] = show_sources
+    if ignore_invalid_urls is not None:
+        body["ignoreInvalidURLs"] = ignore_invalid_urls
+    if scrape_options is not None:
+        prepared = prepare_scrape_options(scrape_options)
+        if prepared:
+            body["scrapeOptions"] = prepared
+    return body
+
+
+def start_extract(
+    client: HttpClient,
+    urls: Optional[List[str]],
+    *,
+    prompt: Optional[str] = None,
+    schema: Optional[Dict[str, Any]] = None,
+    system_prompt: Optional[str] = None,
+    allow_external_links: Optional[bool] = None,
+    enable_web_search: Optional[bool] = None,
+    show_sources: Optional[bool] = None,
+    scrape_options: Optional[ScrapeOptions] = None,
+    ignore_invalid_urls: Optional[bool] = None,
+) -> ExtractResponse:
+    body = _prepare_extract_request(
+        urls,
+        prompt=prompt,
+        schema=schema,
+        system_prompt=system_prompt,
+        allow_external_links=allow_external_links,
+        enable_web_search=enable_web_search,
+        show_sources=show_sources,
+        scrape_options=scrape_options,
+        ignore_invalid_urls=ignore_invalid_urls,
+    )
+    resp = client.post("/v2/extract", body)
+    if not resp.ok:
+        handle_response_error(resp, "extract")
+    return ExtractResponse(**resp.json())
+
+
+def get_extract_status(client: HttpClient, job_id: str) -> ExtractResponse:
+    resp = client.get(f"/v2/extract/{job_id}")
+    if not resp.ok:
+        handle_response_error(resp, "extract-status")
+    return ExtractResponse(**resp.json())
+
+
+def wait_extract(
+    client: HttpClient,
+    job_id: str,
+    *,
+    poll_interval: int = 2,
+    timeout: Optional[int] = None,
+) -> ExtractResponse:
+    start_ts = time.time()
+    while True:
+        status = get_extract_status(client, job_id)
+        if status.status in ("completed", "failed", "cancelled"):
+            return status
+        if timeout is not None and (time.time() - start_ts) > timeout:
+            return status
+        time.sleep(max(1, poll_interval))
+
+
+def extract(
+    client: HttpClient,
+    urls: Optional[List[str]],
+    *,
+    prompt: Optional[str] = None,
+    schema: Optional[Dict[str, Any]] = None,
+    system_prompt: Optional[str] = None,
+    allow_external_links: Optional[bool] = None,
+    enable_web_search: Optional[bool] = None,
+    show_sources: Optional[bool] = None,
+    scrape_options: Optional[ScrapeOptions] = None,
+    ignore_invalid_urls: Optional[bool] = None,
+    poll_interval: int = 2,
+    timeout: Optional[int] = None,
+) -> ExtractResponse:
+    started = start_extract(
+        client,
+        urls,
+        prompt=prompt,
+        schema=schema,
+        system_prompt=system_prompt,
+        allow_external_links=allow_external_links,
+        enable_web_search=enable_web_search,
+        show_sources=show_sources,
+        scrape_options=scrape_options,
+        ignore_invalid_urls=ignore_invalid_urls,
+    )
+    job_id = getattr(started, "id", None)
+    if not job_id:
+        return started
+    return wait_extract(client, job_id, poll_interval=poll_interval, timeout=timeout)
+
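
A similar sketch for the extract flow: extract() starts a job via POST /v2/extract and then polls /v2/extract/{job_id} until a terminal status or the timeout. The client construction and the presence of a data attribute on ExtractResponse are assumptions; neither is shown in this hunk.

from firecrawl.v2.utils.http_client import HttpClient
from firecrawl.v2.methods.extract import extract

client = HttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")  # assumed signature

# A JSON Schema describing the structure to pull out of each page.
schema = {
    "type": "object",
    "properties": {"title": {"type": "string"}, "price": {"type": "number"}},
    "required": ["title"],
}

result = extract(
    client,
    ["https://example.com/product"],
    prompt="Extract the product title and price",
    schema=schema,
    poll_interval=2,
    timeout=120,
)
print(result.status, getattr(result, "data", None))  # "data" attribute assumed
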
firecrawl/v2/methods/map.py
@@ -0,0 +1,77 @@
+"""
+Mapping functionality for Firecrawl v2 API.
+"""
+
+from typing import Optional, Dict, Any
+from ..types import MapOptions, MapData, LinkResult
+from ..utils import HttpClient, handle_response_error
+
+
+def _prepare_map_request(url: str, options: Optional[MapOptions] = None) -> Dict[str, Any]:
+    if not url or not url.strip():
+        raise ValueError("URL cannot be empty")
+
+    payload: Dict[str, Any] = {"url": url.strip()}
+
+    if options is not None:
+        # Unified sitemap parameter already provided in options
+        data: Dict[str, Any] = {}
+        if getattr(options, "sitemap", None) is not None:
+            data["sitemap"] = options.sitemap
+
+        if options.search is not None:
+            data["search"] = options.search
+        if options.include_subdomains is not None:
+            data["includeSubdomains"] = options.include_subdomains
+        if options.limit is not None:
+            data["limit"] = options.limit
+        if options.timeout is not None:
+            data["timeout"] = options.timeout
+        payload.update(data)
+
+    return payload
+
+
+def map(client: HttpClient, url: str, options: Optional[MapOptions] = None) -> MapData:
+    """
+    Map a URL and return MapData (links list with optional titles/descriptions).
+    """
+    request_data = _prepare_map_request(url, options)
+    response = client.post("/v2/map", request_data)
+    if not response.ok:
+        handle_response_error(response, "map")
+
+    body = response.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error occurred"))
+
+    # shouldn't this return inside "data"?
+    # data = body.get("data", {})
+    # result_links: list[LinkResult] = []
+    # for item in data.get("links", []):
+    #     if isinstance(item, dict):
+    #         result_links.append(
+    #             LinkResult(
+    #                 url=item.get("url", ""),
+    #                 title=item.get("title"),
+    #                 description=item.get("description"),
+    #             )
+    #         )
+    #     elif isinstance(item, str):
+    #         result_links.append(LinkResult(url=item))
+
+    result_links: list[LinkResult] = []
+    for item in body.get("links", []):
+        if isinstance(item, dict):
+            result_links.append(
+                LinkResult(
+                    url=item.get("url", ""),
+                    title=item.get("title"),
+                    description=item.get("description"),
+                )
+            )
+        elif isinstance(item, str):
+            result_links.append(LinkResult(url=item))
+
+    return MapData(links=result_links)
+
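
And a sketch for the map helper, which parses the top-level links array into MapData with LinkResult entries. Aliasing the import avoids shadowing Python's built-in map; the MapOptions field names follow _prepare_map_request above, while the client construction is again an assumption.

from firecrawl.v2.utils import HttpClient
from firecrawl.v2.types import MapOptions
from firecrawl.v2.methods.map import map as map_url

client = HttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")  # assumed signature

# Discover up to 50 URLs on the site whose paths match the search term.
result = map_url(client, "https://docs.firecrawl.dev", MapOptions(search="crawl", limit=50))
for link in result.links:
    print(link.url, link.title or "")
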