firecrawl-py 3.2.1-py3-none-any.whl → 3.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of firecrawl-py might be problematic.

Files changed (85)
  1. build/lib/firecrawl/__init__.py +87 -0
  2. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
  3. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +188 -0
  4. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
  5. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
  6. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
  7. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +248 -0
  8. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
  9. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  10. build/lib/firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  11. build/lib/firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  12. build/lib/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
  13. build/lib/firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
  14. build/lib/firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
  15. build/lib/firecrawl/__tests__/e2e/v2/test_map.py +60 -0
  16. build/lib/firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
  17. build/lib/firecrawl/__tests__/e2e/v2/test_search.py +269 -0
  18. build/lib/firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  19. build/lib/firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  20. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  21. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
  22. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  23. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
  24. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  25. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
  26. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  27. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  28. build/lib/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  29. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  30. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  31. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  32. build/lib/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
  33. build/lib/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
  34. build/lib/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
  35. build/lib/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
  36. build/lib/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  37. build/lib/firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  38. build/lib/firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
  39. build/lib/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  40. build/lib/firecrawl/client.py +242 -0
  41. build/lib/firecrawl/firecrawl.backup.py +4635 -0
  42. build/lib/firecrawl/types.py +161 -0
  43. build/lib/firecrawl/v1/__init__.py +14 -0
  44. build/lib/firecrawl/v1/client.py +4653 -0
  45. build/lib/firecrawl/v2/__init__.py +4 -0
  46. build/lib/firecrawl/v2/client.py +802 -0
  47. build/lib/firecrawl/v2/client_async.py +250 -0
  48. build/lib/firecrawl/v2/methods/aio/__init__.py +1 -0
  49. build/lib/firecrawl/v2/methods/aio/batch.py +85 -0
  50. build/lib/firecrawl/v2/methods/aio/crawl.py +171 -0
  51. build/lib/firecrawl/v2/methods/aio/extract.py +126 -0
  52. build/lib/firecrawl/v2/methods/aio/map.py +59 -0
  53. build/lib/firecrawl/v2/methods/aio/scrape.py +33 -0
  54. build/lib/firecrawl/v2/methods/aio/search.py +172 -0
  55. build/lib/firecrawl/v2/methods/aio/usage.py +42 -0
  56. build/lib/firecrawl/v2/methods/batch.py +417 -0
  57. build/lib/firecrawl/v2/methods/crawl.py +469 -0
  58. build/lib/firecrawl/v2/methods/extract.py +131 -0
  59. build/lib/firecrawl/v2/methods/map.py +77 -0
  60. build/lib/firecrawl/v2/methods/scrape.py +64 -0
  61. build/lib/firecrawl/v2/methods/search.py +197 -0
  62. build/lib/firecrawl/v2/methods/usage.py +41 -0
  63. build/lib/firecrawl/v2/types.py +665 -0
  64. build/lib/firecrawl/v2/utils/__init__.py +9 -0
  65. build/lib/firecrawl/v2/utils/error_handler.py +107 -0
  66. build/lib/firecrawl/v2/utils/get_version.py +15 -0
  67. build/lib/firecrawl/v2/utils/http_client.py +153 -0
  68. build/lib/firecrawl/v2/utils/http_client_async.py +65 -0
  69. build/lib/firecrawl/v2/utils/normalize.py +107 -0
  70. build/lib/firecrawl/v2/utils/validation.py +324 -0
  71. build/lib/firecrawl/v2/watcher.py +301 -0
  72. build/lib/firecrawl/v2/watcher_async.py +242 -0
  73. build/lib/tests/test_change_tracking.py +98 -0
  74. build/lib/tests/test_timeout_conversion.py +117 -0
  75. firecrawl/__init__.py +1 -1
  76. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +2 -2
  77. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +6 -6
  78. firecrawl/v2/methods/search.py +11 -0
  79. firecrawl/v2/types.py +30 -1
  80. {firecrawl_py-3.2.1.dist-info/licenses → firecrawl_py-3.3.0.dist-info}/LICENSE +0 -0
  81. {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.0.dist-info}/METADATA +3 -7
  82. firecrawl_py-3.3.0.dist-info/RECORD +153 -0
  83. {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.0.dist-info}/WHEEL +1 -1
  84. {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.0.dist-info}/top_level.txt +2 -0
  85. firecrawl_py-3.2.1.dist-info/RECORD +0 -79
build/lib/firecrawl/v2/methods/crawl.py
@@ -0,0 +1,469 @@
+ """
+ Crawling functionality for Firecrawl v2 API.
+ """
+
+ import time
+ from typing import Optional, Dict, Any
+ from ..types import (
+     CrawlRequest,
+     CrawlJob,
+     CrawlResponse, Document, CrawlParamsRequest, CrawlParamsResponse, CrawlParamsData,
+     WebhookConfig, CrawlErrorsResponse, ActiveCrawlsResponse, ActiveCrawl
+ )
+ from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
+ from ..utils.normalize import normalize_document_input
+
+
+ def _validate_crawl_request(request: CrawlRequest) -> None:
+     """
+     Validate crawl request parameters.
+
+     Args:
+         request: CrawlRequest to validate
+
+     Raises:
+         ValueError: If request is invalid
+     """
+     if not request.url or not request.url.strip():
+         raise ValueError("URL cannot be empty")
+
+     if request.limit is not None and request.limit <= 0:
+         raise ValueError("Limit must be positive")
+
+     # Validate scrape_options (if provided)
+     if request.scrape_options is not None:
+         validate_scrape_options(request.scrape_options)
+
+
+ def _prepare_crawl_request(request: CrawlRequest) -> dict:
+     """
+     Prepare crawl request for API submission.
+
+     Args:
+         request: CrawlRequest to prepare
+
+     Returns:
+         Dictionary ready for API submission
+     """
+     # Validate request
+     _validate_crawl_request(request)
+
+     # Start with basic data
+     data = {"url": request.url}
+
+     # Add prompt if present
+     if request.prompt:
+         data["prompt"] = request.prompt
+
+     # Handle scrape_options conversion first (before model_dump)
+     if request.scrape_options is not None:
+         scrape_data = prepare_scrape_options(request.scrape_options)
+         if scrape_data:
+             data["scrapeOptions"] = scrape_data
+
+     # Convert request to dict
+     request_data = request.model_dump(exclude_none=True, exclude_unset=True)
+
+     # Remove url, prompt, and scrape_options (already handled)
+     request_data.pop("url", None)
+     request_data.pop("prompt", None)
+     request_data.pop("scrape_options", None)
+
+     # Handle webhook conversion first (before model_dump)
+     if request.webhook is not None:
+         if isinstance(request.webhook, str):
+             data["webhook"] = request.webhook
+         else:
+             # Convert WebhookConfig to dict
+             data["webhook"] = request.webhook.model_dump(exclude_none=True)
+
+     # Convert other snake_case fields to camelCase
+     field_mappings = {
+         "include_paths": "includePaths",
+         "exclude_paths": "excludePaths",
+         "max_discovery_depth": "maxDiscoveryDepth",
+         "sitemap": "sitemap",
+         "ignore_query_parameters": "ignoreQueryParameters",
+         "crawl_entire_domain": "crawlEntireDomain",
+         "allow_external_links": "allowExternalLinks",
+         "allow_subdomains": "allowSubdomains",
+         "delay": "delay",
+         "max_concurrency": "maxConcurrency",
+         "zero_data_retention": "zeroDataRetention"
+     }
+
+     # Apply field mappings
+     for snake_case, camel_case in field_mappings.items():
+         if snake_case in request_data:
+             data[camel_case] = request_data.pop(snake_case)
+
+     # Add any remaining fields that don't need conversion (like limit)
+     data.update(request_data)
+
+     return data
+
+
+ def start_crawl(client: HttpClient, request: CrawlRequest) -> CrawlResponse:
+     """
+     Start a crawl job for a website.
+
+     Args:
+         client: HTTP client instance
+         request: CrawlRequest containing URL and options
+
+     Returns:
+         CrawlResponse with job information
+
+     Raises:
+         ValueError: If request is invalid
+         Exception: If the crawl operation fails to start
+     """
+     request_data = _prepare_crawl_request(request)
+
+     response = client.post("/v2/crawl", request_data)
+
+     if not response.ok:
+         handle_response_error(response, "start crawl")
+
+     response_data = response.json()
+
+     if response_data.get("success"):
+         job_data = {
+             "id": response_data.get("id"),
+             "url": response_data.get("url")
+         }
+
+         return CrawlResponse(**job_data)
+     else:
+         raise Exception(response_data.get("error", "Unknown error occurred"))
+
+
+ def get_crawl_status(client: HttpClient, job_id: str) -> CrawlJob:
+     """
+     Get the status of a crawl job.
+
+     Args:
+         client: HTTP client instance
+         job_id: ID of the crawl job
+
+     Returns:
+         CrawlJob with current status and data
+
+     Raises:
+         Exception: If the status check fails
+     """
+     # Make the API request
+     response = client.get(f"/v2/crawl/{job_id}")
+
+     # Handle errors
+     if not response.ok:
+         handle_response_error(response, "get crawl status")
+
+     # Parse response
+     response_data = response.json()
+
+     if response_data.get("success"):
+         # The API returns status fields at the top level, not in a data field
+
+         # Convert documents
+         documents = []
+         data_list = response_data.get("data", [])
+         for doc_data in data_list:
+             if isinstance(doc_data, str):
+                 # Handle case where API returns just URLs - this shouldn't happen for crawl
+                 # but we'll handle it gracefully
+                 continue
+             else:
+                 documents.append(Document(**normalize_document_input(doc_data)))
+
+         # Create CrawlJob with current status and data
+         return CrawlJob(
+             status=response_data.get("status"),
+             completed=response_data.get("completed", 0),
+             total=response_data.get("total", 0),
+             credits_used=response_data.get("creditsUsed", 0),
+             expires_at=response_data.get("expiresAt"),
+             next=response_data.get("next", None),
+             data=documents
+         )
+     else:
+         raise Exception(response_data.get("error", "Unknown error occurred"))
+
+
+ def cancel_crawl(client: HttpClient, job_id: str) -> bool:
+     """
+     Cancel a running crawl job.
+
+     Args:
+         client: HTTP client instance
+         job_id: ID of the crawl job to cancel
+
+     Returns:
+         bool: True if the crawl was cancelled, False otherwise
+
+     Raises:
+         Exception: If the cancellation fails
+     """
+     response = client.delete(f"/v2/crawl/{job_id}")
+
+     if not response.ok:
+         handle_response_error(response, "cancel crawl")
+
+     response_data = response.json()
+
+     return response_data.get("status") == "cancelled"
+
+ def wait_for_crawl_completion(
+     client: HttpClient,
+     job_id: str,
+     poll_interval: int = 2,
+     timeout: Optional[int] = None
+ ) -> CrawlJob:
+     """
+     Wait for a crawl job to complete, polling for status updates.
+
+     Args:
+         client: HTTP client instance
+         job_id: ID of the crawl job
+         poll_interval: Seconds between status checks
+         timeout: Maximum seconds to wait (None for no timeout)
+
+     Returns:
+         CrawlJob when job completes
+
+     Raises:
+         Exception: If the job fails
+         TimeoutError: If timeout is reached
+     """
+     start_time = time.time()
+
+     while True:
+         crawl_job = get_crawl_status(client, job_id)
+
+         # Check if job is complete
+         if crawl_job.status in ["completed", "failed"]:
+             return crawl_job
+
+         # Check timeout
+         if timeout and (time.time() - start_time) > timeout:
+             raise TimeoutError(f"Crawl job {job_id} did not complete within {timeout} seconds")
+
+         # Wait before next poll
+         time.sleep(poll_interval)
+
+
+ def crawl(
+     client: HttpClient,
+     request: CrawlRequest,
+     poll_interval: int = 2,
+     timeout: Optional[int] = None
+ ) -> CrawlJob:
+     """
+     Start a crawl job and wait for it to complete.
+
+     Args:
+         client: HTTP client instance
+         request: CrawlRequest containing URL and options
+         poll_interval: Seconds between status checks
+         timeout: Maximum seconds to wait (None for no timeout)
+
+     Returns:
+         CrawlJob when job completes
+
+     Raises:
+         ValueError: If request is invalid
+         Exception: If the crawl fails to start or complete
+         TimeoutError: If timeout is reached
+     """
+     # Start the crawl
+     crawl_job = start_crawl(client, request)
+     job_id = crawl_job.id
+
+     # Wait for completion
+     return wait_for_crawl_completion(
+         client, job_id, poll_interval, timeout
+     )
+
+
+ def crawl_params_preview(client: HttpClient, request: CrawlParamsRequest) -> CrawlParamsData:
+     """
+     Get crawl parameters from LLM based on URL and prompt.
+
+     Args:
+         client: HTTP client instance
+         request: CrawlParamsRequest containing URL and prompt
+
+     Returns:
+         CrawlParamsData containing suggested crawl options
+
+     Raises:
+         ValueError: If request is invalid
+         Exception: If the operation fails
+     """
+     # Validate request
+     if not request.url or not request.url.strip():
+         raise ValueError("URL cannot be empty")
+
+     if not request.prompt or not request.prompt.strip():
+         raise ValueError("Prompt cannot be empty")
+
+     # Prepare request data
+     request_data = {
+         "url": request.url,
+         "prompt": request.prompt
+     }
+
+     # Make the API request
+     response = client.post("/v2/crawl/params-preview", request_data)
+
+     # Handle errors
+     if not response.ok:
+         handle_response_error(response, "crawl params preview")
+
+     # Parse response
+     response_data = response.json()
+
+     if response_data.get("success"):
+         params_data = response_data.get("data", {})
+
+         # Convert camelCase to snake_case for CrawlParamsData
+         converted_params = {}
+         field_mappings = {
+             "includePaths": "include_paths",
+             "excludePaths": "exclude_paths",
+             "maxDiscoveryDepth": "max_discovery_depth",
+             "sitemap": "sitemap",
+             "ignoreQueryParameters": "ignore_query_parameters",
+             "crawlEntireDomain": "crawl_entire_domain",
+             "allowExternalLinks": "allow_external_links",
+             "allowSubdomains": "allow_subdomains",
+             "maxConcurrency": "max_concurrency",
+             "scrapeOptions": "scrape_options",
+             "zeroDataRetention": "zero_data_retention"
+         }
+
+         # Handle webhook conversion
+         if "webhook" in params_data:
+             webhook_data = params_data["webhook"]
+             if isinstance(webhook_data, dict):
+                 converted_params["webhook"] = WebhookConfig(**webhook_data)
+             else:
+                 converted_params["webhook"] = webhook_data
+
+         for camel_case, snake_case in field_mappings.items():
+             if camel_case in params_data:
+                 if camel_case == "scrapeOptions" and params_data[camel_case] is not None:
+                     # Handle nested scrapeOptions conversion
+                     scrape_opts_data = params_data[camel_case]
+                     converted_scrape_opts = {}
+                     scrape_field_mappings = {
+                         "includeTags": "include_tags",
+                         "excludeTags": "exclude_tags",
+                         "onlyMainContent": "only_main_content",
+                         "waitFor": "wait_for",
+                         "skipTlsVerification": "skip_tls_verification",
+                         "removeBase64Images": "remove_base64_images"
+                     }
+
+                     for scrape_camel, scrape_snake in scrape_field_mappings.items():
+                         if scrape_camel in scrape_opts_data:
+                             converted_scrape_opts[scrape_snake] = scrape_opts_data[scrape_camel]
+
+                     # Handle formats field - if it's a list, convert to ScrapeFormats
+                     if "formats" in scrape_opts_data:
+                         formats_data = scrape_opts_data["formats"]
+                         if isinstance(formats_data, list):
+                             # Convert list to ScrapeFormats object
+                             from ..types import ScrapeFormats
+                             converted_scrape_opts["formats"] = ScrapeFormats(formats=formats_data)
+                         else:
+                             converted_scrape_opts["formats"] = formats_data
+
+                     # Add fields that don't need conversion
+                     for key, value in scrape_opts_data.items():
+                         if key not in scrape_field_mappings and key != "formats":
+                             converted_scrape_opts[key] = value
+
+                     converted_params[snake_case] = converted_scrape_opts
+                 else:
+                     converted_params[snake_case] = params_data[camel_case]
+
+         # Add fields that don't need conversion
+         for key, value in params_data.items():
+             if key not in field_mappings:
+                 converted_params[key] = value
+
+         # Add warning if present
+         if "warning" in response_data:
+             converted_params["warning"] = response_data["warning"]
+
+         return CrawlParamsData(**converted_params)
+     else:
+         raise Exception(response_data.get("error", "Unknown error occurred"))
+
+
+ def get_crawl_errors(http_client: HttpClient, crawl_id: str) -> CrawlErrorsResponse:
+     """
+     Get errors from a crawl job.
+
+     Args:
+         http_client: HTTP client for making requests
+         crawl_id: The ID of the crawl job
+
+     Returns:
+         CrawlErrorsResponse containing errors and robots blocked URLs
+
+     Raises:
+         Exception: If the request fails
+     """
+     response = http_client.get(f"/v2/crawl/{crawl_id}/errors")
+
+     if not response.ok:
+         handle_response_error(response, "check crawl errors")
+
+     try:
+         body = response.json()
+         payload = body.get("data", body)
+         # Manual key normalization since we avoid Pydantic aliases
+         normalized = {
+             "errors": payload.get("errors", []),
+             "robots_blocked": payload.get("robotsBlocked", payload.get("robots_blocked", [])),
+         }
+         return CrawlErrorsResponse(**normalized)
+     except Exception as e:
+         raise Exception(f"Failed to parse crawl errors response: {e}")
+
+
+ def get_active_crawls(client: HttpClient) -> ActiveCrawlsResponse:
+     """
+     Get a list of currently active crawl jobs.
+
+     Args:
+         client: HTTP client instance
+
+     Returns:
+         ActiveCrawlsResponse containing a list of active crawl jobs
+
+     Raises:
+         Exception: If the request fails
+     """
+     response = client.get("/v2/crawl/active")
+
+     if not response.ok:
+         handle_response_error(response, "get active crawls")
+
+     body = response.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error occurred"))
+
+     crawls_in = body.get("crawls", [])
+     normalized_crawls = []
+     for c in crawls_in:
+         if isinstance(c, dict):
+             normalized_crawls.append({
+                 "id": c.get("id"),
+                 "team_id": c.get("teamId", c.get("team_id")),
+                 "url": c.get("url"),
+                 "options": c.get("options"),
+             })
+     return ActiveCrawlsResponse(success=True, crawls=[ActiveCrawl(**nc) for nc in normalized_crawls])
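
For orientation, here is a minimal sketch of how the crawl helpers above might be driven directly. The HttpClient constructor and the full CrawlRequest field set live in v2/utils/http_client.py and v2/types.py, which are outside this hunk, so the api_key/api_url keywords and the include_paths field below are assumptions inferred from the field-mapping code rather than confirmed signatures.

from firecrawl.v2.utils.http_client import HttpClient
from firecrawl.v2.types import CrawlRequest
from firecrawl.v2.methods.crawl import start_crawl, wait_for_crawl_completion, get_crawl_errors

# Assumed constructor arguments; HttpClient is defined outside this hunk.
client = HttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")

# limit and include_paths mirror the snake_case fields that _prepare_crawl_request
# converts to "limit" / "includePaths" in the request body.
request = CrawlRequest(url="https://example.com", limit=10, include_paths=["/blog/*"])

started = start_crawl(client, request)            # CrawlResponse carrying the job id
job = wait_for_crawl_completion(client, started.id, poll_interval=2, timeout=300)
print(job.status, f"{job.completed}/{job.total} pages, {job.credits_used} credits")

errors = get_crawl_errors(client, started.id)     # per-URL errors plus robots-blocked URLs
print(errors.errors, errors.robots_blocked)
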
build/lib/firecrawl/v2/methods/extract.py
@@ -0,0 +1,131 @@
+ from typing import Any, Dict, List, Optional
+ import time
+
+ from ..types import ExtractResponse, ScrapeOptions
+ from ..utils.http_client import HttpClient
+ from ..utils.validation import prepare_scrape_options
+ from ..utils.error_handler import handle_response_error
+
+
+ def _prepare_extract_request(
+     urls: Optional[List[str]],
+     *,
+     prompt: Optional[str] = None,
+     schema: Optional[Dict[str, Any]] = None,
+     system_prompt: Optional[str] = None,
+     allow_external_links: Optional[bool] = None,
+     enable_web_search: Optional[bool] = None,
+     show_sources: Optional[bool] = None,
+     scrape_options: Optional[ScrapeOptions] = None,
+     ignore_invalid_urls: Optional[bool] = None,
+ ) -> Dict[str, Any]:
+     body: Dict[str, Any] = {}
+     if urls is not None:
+         body["urls"] = urls
+     if prompt is not None:
+         body["prompt"] = prompt
+     if schema is not None:
+         body["schema"] = schema
+     if system_prompt is not None:
+         body["systemPrompt"] = system_prompt
+     if allow_external_links is not None:
+         body["allowExternalLinks"] = allow_external_links
+     if enable_web_search is not None:
+         body["enableWebSearch"] = enable_web_search
+     if show_sources is not None:
+         body["showSources"] = show_sources
+     if ignore_invalid_urls is not None:
+         body["ignoreInvalidURLs"] = ignore_invalid_urls
+     if scrape_options is not None:
+         prepared = prepare_scrape_options(scrape_options)
+         if prepared:
+             body["scrapeOptions"] = prepared
+     return body
+
+
+ def start_extract(
+     client: HttpClient,
+     urls: Optional[List[str]],
+     *,
+     prompt: Optional[str] = None,
+     schema: Optional[Dict[str, Any]] = None,
+     system_prompt: Optional[str] = None,
+     allow_external_links: Optional[bool] = None,
+     enable_web_search: Optional[bool] = None,
+     show_sources: Optional[bool] = None,
+     scrape_options: Optional[ScrapeOptions] = None,
+     ignore_invalid_urls: Optional[bool] = None,
+ ) -> ExtractResponse:
+     body = _prepare_extract_request(
+         urls,
+         prompt=prompt,
+         schema=schema,
+         system_prompt=system_prompt,
+         allow_external_links=allow_external_links,
+         enable_web_search=enable_web_search,
+         show_sources=show_sources,
+         scrape_options=scrape_options,
+         ignore_invalid_urls=ignore_invalid_urls,
+     )
+     resp = client.post("/v2/extract", body)
+     if not resp.ok:
+         handle_response_error(resp, "extract")
+     return ExtractResponse(**resp.json())
+
+
+ def get_extract_status(client: HttpClient, job_id: str) -> ExtractResponse:
+     resp = client.get(f"/v2/extract/{job_id}")
+     if not resp.ok:
+         handle_response_error(resp, "extract-status")
+     return ExtractResponse(**resp.json())
+
+
+ def wait_extract(
+     client: HttpClient,
+     job_id: str,
+     *,
+     poll_interval: int = 2,
+     timeout: Optional[int] = None,
+ ) -> ExtractResponse:
+     start_ts = time.time()
+     while True:
+         status = get_extract_status(client, job_id)
+         if status.status in ("completed", "failed", "cancelled"):
+             return status
+         if timeout is not None and (time.time() - start_ts) > timeout:
+             return status
+         time.sleep(max(1, poll_interval))
+
+
+ def extract(
+     client: HttpClient,
+     urls: Optional[List[str]],
+     *,
+     prompt: Optional[str] = None,
+     schema: Optional[Dict[str, Any]] = None,
+     system_prompt: Optional[str] = None,
+     allow_external_links: Optional[bool] = None,
+     enable_web_search: Optional[bool] = None,
+     show_sources: Optional[bool] = None,
+     scrape_options: Optional[ScrapeOptions] = None,
+     ignore_invalid_urls: Optional[bool] = None,
+     poll_interval: int = 2,
+     timeout: Optional[int] = None,
+ ) -> ExtractResponse:
+     started = start_extract(
+         client,
+         urls,
+         prompt=prompt,
+         schema=schema,
+         system_prompt=system_prompt,
+         allow_external_links=allow_external_links,
+         enable_web_search=enable_web_search,
+         show_sources=show_sources,
+         scrape_options=scrape_options,
+         ignore_invalid_urls=ignore_invalid_urls,
+     )
+     job_id = getattr(started, "id", None)
+     if not job_id:
+         return started
+     return wait_extract(client, job_id, poll_interval=poll_interval, timeout=timeout)
+
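
Similarly, a hedged sketch of the extract flow above, which starts a job and then polls wait_extract until it reaches a terminal state or the timeout elapses. Only the request preparation and polling logic are visible in this hunk, so the HttpClient construction and the data field read at the end are assumptions about types defined elsewhere in the package.

from firecrawl.v2.utils.http_client import HttpClient
from firecrawl.v2.methods.extract import extract

# Assumed constructor arguments; HttpClient is defined outside this hunk.
client = HttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")

result = extract(
    client,
    urls=["https://example.com/pricing"],
    prompt="List each plan name with its monthly price.",
    schema={
        "type": "object",
        "properties": {"plans": {"type": "array", "items": {"type": "string"}}},
    },
    poll_interval=2,   # forwarded to wait_extract
    timeout=120,       # wait_extract returns the last status once this elapses
)
print(result.status)   # "completed", "failed", or "cancelled" per wait_extract
print(result.data)     # assumed ExtractResponse field; not shown in this hunk
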
build/lib/firecrawl/v2/methods/map.py
@@ -0,0 +1,77 @@
+ """
+ Mapping functionality for Firecrawl v2 API.
+ """
+
+ from typing import Optional, Dict, Any
+ from ..types import MapOptions, MapData, LinkResult
+ from ..utils import HttpClient, handle_response_error
+
+
+ def _prepare_map_request(url: str, options: Optional[MapOptions] = None) -> Dict[str, Any]:
+     if not url or not url.strip():
+         raise ValueError("URL cannot be empty")
+
+     payload: Dict[str, Any] = {"url": url.strip()}
+
+     if options is not None:
+         # Unified sitemap parameter already provided in options
+         data: Dict[str, Any] = {}
+         if getattr(options, "sitemap", None) is not None:
+             data["sitemap"] = options.sitemap
+
+         if options.search is not None:
+             data["search"] = options.search
+         if options.include_subdomains is not None:
+             data["includeSubdomains"] = options.include_subdomains
+         if options.limit is not None:
+             data["limit"] = options.limit
+         if options.timeout is not None:
+             data["timeout"] = options.timeout
+         payload.update(data)
+
+     return payload
+
+
+ def map(client: HttpClient, url: str, options: Optional[MapOptions] = None) -> MapData:
+     """
+     Map a URL and return MapData (links list with optional titles/descriptions).
+     """
+     request_data = _prepare_map_request(url, options)
+     response = client.post("/v2/map", request_data)
+     if not response.ok:
+         handle_response_error(response, "map")
+
+     body = response.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error occurred"))
+
+     # shouldnt return inside data?
+     # data = body.get("data", {})
+     # result_links: list[LinkResult] = []
+     # for item in data.get("links", []):
+     #     if isinstance(item, dict):
+     #         result_links.append(
+     #             LinkResult(
+     #                 url=item.get("url", ""),
+     #                 title=item.get("title"),
+     #                 description=item.get("description"),
+     #             )
+     #         )
+     #     elif isinstance(item, str):
+     #         result_links.append(LinkResult(url=item))
+
+     result_links: list[LinkResult] = []
+     for item in body.get("links", []):
+         if isinstance(item, dict):
+             result_links.append(
+                 LinkResult(
+                     url=item.get("url", ""),
+                     title=item.get("title"),
+                     description=item.get("description"),
+                 )
+             )
+         elif isinstance(item, str):
+             result_links.append(LinkResult(url=item))
+
+     return MapData(links=result_links)
+
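
Finally, a sketch of the map helper above. The MapOptions field names (search, include_subdomains, limit) are taken from the attribute accesses in _prepare_map_request, while the HttpClient construction and the MapOptions defaults remain assumptions.

from firecrawl.v2.utils.http_client import HttpClient
from firecrawl.v2.types import MapOptions
from firecrawl.v2.methods.map import map as map_site  # module-level map() shadows the builtin, so alias it

# Assumed constructor arguments; HttpClient is defined outside this hunk.
client = HttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")

# Assumes the unspecified MapOptions fields (sitemap, timeout) default to None.
options = MapOptions(search="docs", include_subdomains=False, limit=100)
result = map_site(client, "https://example.com", options)

# MapData.links is a list of LinkResult(url, title, description).
for link in result.links:
    print(link.url, link.title or "", link.description or "")
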