firecrawl-4.12.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. firecrawl/__init__.py +87 -0
  2. firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
  10. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  11. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  13. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
  14. firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
  15. firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
  16. firecrawl/__tests__/e2e/v2/test_map.py +61 -0
  17. firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
  18. firecrawl/__tests__/e2e/v2/test_search.py +270 -0
  19. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  20. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  21. firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
  28. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  29. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  30. firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
  31. firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
  32. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  33. firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
  34. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  35. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  36. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  37. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
  38. firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
  39. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
  40. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
  41. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
  42. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  43. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  44. firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
  45. firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
  46. firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
  47. firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
  48. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  49. firecrawl/client.py +281 -0
  50. firecrawl/firecrawl.backup.py +4635 -0
  51. firecrawl/types.py +167 -0
  52. firecrawl/v1/__init__.py +14 -0
  53. firecrawl/v1/client.py +5164 -0
  54. firecrawl/v2/__init__.py +4 -0
  55. firecrawl/v2/client.py +967 -0
  56. firecrawl/v2/client_async.py +408 -0
  57. firecrawl/v2/methods/agent.py +144 -0
  58. firecrawl/v2/methods/aio/__init__.py +1 -0
  59. firecrawl/v2/methods/aio/agent.py +137 -0
  60. firecrawl/v2/methods/aio/batch.py +188 -0
  61. firecrawl/v2/methods/aio/crawl.py +351 -0
  62. firecrawl/v2/methods/aio/extract.py +133 -0
  63. firecrawl/v2/methods/aio/map.py +65 -0
  64. firecrawl/v2/methods/aio/scrape.py +33 -0
  65. firecrawl/v2/methods/aio/search.py +176 -0
  66. firecrawl/v2/methods/aio/usage.py +89 -0
  67. firecrawl/v2/methods/batch.py +499 -0
  68. firecrawl/v2/methods/crawl.py +592 -0
  69. firecrawl/v2/methods/extract.py +161 -0
  70. firecrawl/v2/methods/map.py +83 -0
  71. firecrawl/v2/methods/scrape.py +64 -0
  72. firecrawl/v2/methods/search.py +215 -0
  73. firecrawl/v2/methods/usage.py +84 -0
  74. firecrawl/v2/types.py +1143 -0
  75. firecrawl/v2/utils/__init__.py +9 -0
  76. firecrawl/v2/utils/error_handler.py +107 -0
  77. firecrawl/v2/utils/get_version.py +15 -0
  78. firecrawl/v2/utils/http_client.py +178 -0
  79. firecrawl/v2/utils/http_client_async.py +69 -0
  80. firecrawl/v2/utils/normalize.py +125 -0
  81. firecrawl/v2/utils/validation.py +692 -0
  82. firecrawl/v2/watcher.py +301 -0
  83. firecrawl/v2/watcher_async.py +243 -0
  84. firecrawl-4.12.0.dist-info/METADATA +234 -0
  85. firecrawl-4.12.0.dist-info/RECORD +92 -0
  86. firecrawl-4.12.0.dist-info/WHEEL +5 -0
  87. firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
  88. firecrawl-4.12.0.dist-info/top_level.txt +2 -0
  89. tests/test_agent_integration.py +277 -0
  90. tests/test_api_key_handling.py +44 -0
  91. tests/test_change_tracking.py +98 -0
  92. tests/test_timeout_conversion.py +117 -0
@@ -0,0 +1,4635 @@
+"""
+FirecrawlApp Module
+
+This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
+It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
+and check the status of these jobs. The module uses requests for HTTP communication
+and handles retries for certain HTTP status codes.
+
+Classes:
+    - FirecrawlApp: Main class for interacting with the Firecrawl API.
+"""
+import logging
+import os
+import time
+from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
+import json
+from datetime import datetime
+import re
+import warnings
+import requests
+import pydantic
+import websockets
+import aiohttp
+import asyncio
+from pydantic import Field
+
+
+def get_version():
+    try:
+        from pathlib import Path
+        package_path = os.path.dirname(__file__)
+        version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
+        version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
+        if version_match:
+            return version_match.group(1).strip()
+    except Exception:
+        print("Failed to get version from __init__.py")
+    return None
+
+version = get_version()
+
+logger : logging.Logger = logging.getLogger("firecrawl")
+
+T = TypeVar('T')
+
+# class FirecrawlDocumentMetadata(pydantic.BaseModel):
+#     """Metadata for a Firecrawl document."""
+#     title: Optional[str] = None
+#     description: Optional[str] = None
+#     language: Optional[str] = None
+#     keywords: Optional[str] = None
+#     robots: Optional[str] = None
+#     ogTitle: Optional[str] = None
+#     ogDescription: Optional[str] = None
+#     ogUrl: Optional[str] = None
+#     ogImage: Optional[str] = None
+#     ogAudio: Optional[str] = None
+#     ogDeterminer: Optional[str] = None
+#     ogLocale: Optional[str] = None
+#     ogLocaleAlternate: Optional[List[str]] = None
+#     ogSiteName: Optional[str] = None
+#     ogVideo: Optional[str] = None
+#     dctermsCreated: Optional[str] = None
+#     dcDateCreated: Optional[str] = None
+#     dcDate: Optional[str] = None
+#     dctermsType: Optional[str] = None
+#     dcType: Optional[str] = None
+#     dctermsAudience: Optional[str] = None
+#     dctermsSubject: Optional[str] = None
+#     dcSubject: Optional[str] = None
+#     dcDescription: Optional[str] = None
+#     dctermsKeywords: Optional[str] = None
+#     modifiedTime: Optional[str] = None
+#     publishedTime: Optional[str] = None
+#     articleTag: Optional[str] = None
+#     articleSection: Optional[str] = None
+#     sourceURL: Optional[str] = None
+#     statusCode: Optional[int] = None
+#     error: Optional[str] = None
+
+class AgentOptions(pydantic.BaseModel):
+    """Configuration for the agent."""
+    model: Literal["FIRE-1"] = "FIRE-1"
+    prompt: Optional[str] = None
+
+class AgentOptionsExtract(pydantic.BaseModel):
+    """Configuration for the agent in extract operations."""
+    model: Literal["FIRE-1"] = "FIRE-1"
+
+class ActionsResult(pydantic.BaseModel):
+    """Result of actions performed during scraping."""
+    screenshots: List[str]
+    pdfs: List[str]
+
+class ChangeTrackingData(pydantic.BaseModel):
+    """
+    Data for the change tracking format.
+    """
+    previousScrapeAt: Optional[str] = None
+    changeStatus: str # "new" | "same" | "changed" | "removed"
+    visibility: str # "visible" | "hidden"
+    diff: Optional[Dict[str, Any]] = None
+    json_field: Optional[Any] = pydantic.Field(None, alias='json')
+
+class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
+    """Document retrieved or processed by Firecrawl."""
+    url: Optional[str] = None
+    markdown: Optional[str] = None
+    html: Optional[str] = None
+    rawHtml: Optional[str] = None
+    links: Optional[List[str]] = None
+    extract: Optional[T] = None
+    json_field: Optional[T] = pydantic.Field(None, alias='json')
+    screenshot: Optional[str] = None
+    metadata: Optional[Any] = None
+    actions: Optional[ActionsResult] = None
+    title: Optional[str] = None # v1 search only
+    description: Optional[str] = None # v1 search only
+    changeTracking: Optional[ChangeTrackingData] = None
+
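As a quick orientation aid (not part of the package itself), a minimal sketch of how an API-style payload maps onto the FirecrawlDocument model above; the `json` key is consumed through the pydantic alias on `json_field`, and the values are illustrative only:

doc = FirecrawlDocument(**{
    "url": "https://example.com",
    "markdown": "# Example",
    "json": {"title": "Example"},    # populates json_field via the alias
    "metadata": {"statusCode": 200},
})
print(doc.json_field)                # {'title': 'Example'}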
+class LocationConfig(pydantic.BaseModel):
+    """Location configuration for scraping."""
+    country: Optional[str] = None
+    languages: Optional[List[str]] = None
+
+class WebhookConfig(pydantic.BaseModel):
+    """Configuration for webhooks."""
+    url: str
+    headers: Optional[Dict[str, str]] = None
+    metadata: Optional[Dict[str, str]] = None
+    events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
+
+class ChangeTrackingOptions(pydantic.BaseModel):
+    """Configuration for change tracking."""
+    modes: Optional[List[Literal["git-diff", "json"]]] = None
+    schema_field: Optional[Any] = pydantic.Field(None, alias='schema')
+    prompt: Optional[str] = None
+    tag: Optional[str] = None
+
+class ScrapeOptions(pydantic.BaseModel):
+    """Parameters for scraping operations."""
+    formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
+    headers: Optional[Dict[str, str]] = None
+    includeTags: Optional[List[str]] = None
+    excludeTags: Optional[List[str]] = None
+    onlyMainContent: Optional[bool] = None
+    waitFor: Optional[int] = None
+    timeout: Optional[int] = 30000
+    location: Optional[LocationConfig] = None
+    mobile: Optional[bool] = None
+    skipTlsVerification: Optional[bool] = None
+    removeBase64Images: Optional[bool] = None
+    blockAds: Optional[bool] = None
+    proxy: Optional[Literal["basic", "stealth", "auto"]] = None
+    changeTrackingOptions: Optional[ChangeTrackingOptions] = None
+    maxAge: Optional[int] = None
+    storeInCache: Optional[bool] = None
+    parsePDF: Optional[bool] = None
+
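For illustration, these options are serialized with pydantic's `.dict(by_alias=True, exclude_none=True)` before being sent, mirroring the calls made later in this module; a minimal sketch with made-up values:

opts = ScrapeOptions(formats=["markdown", "links"], onlyMainContent=True, timeout=15000)
payload = opts.dict(by_alias=True, exclude_none=True)
# payload == {'formats': ['markdown', 'links'], 'onlyMainContent': True, 'timeout': 15000}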
+class WaitAction(pydantic.BaseModel):
+    """Wait action to perform during scraping."""
+    type: Literal["wait"]
+    milliseconds: Optional[int] = None
+    selector: Optional[str] = None
+
+class ScreenshotAction(pydantic.BaseModel):
+    """Screenshot action to perform during scraping."""
+    type: Literal["screenshot"]
+    fullPage: Optional[bool] = None
+    quality: Optional[int] = None
+
+class ClickAction(pydantic.BaseModel):
+    """Click action to perform during scraping."""
+    type: Literal["click"]
+    selector: str
+
+class WriteAction(pydantic.BaseModel):
+    """Write action to perform during scraping."""
+    type: Literal["write"]
+    text: str
+
+class PressAction(pydantic.BaseModel):
+    """Press action to perform during scraping."""
+    type: Literal["press"]
+    key: str
+
+class ScrollAction(pydantic.BaseModel):
+    """Scroll action to perform during scraping."""
+    type: Literal["scroll"]
+    direction: Literal["up", "down"]
+    selector: Optional[str] = None
+
+class ScrapeAction(pydantic.BaseModel):
+    """Scrape action to perform during scraping."""
+    type: Literal["scrape"]
+
+class ExecuteJavascriptAction(pydantic.BaseModel):
+    """Execute javascript action to perform during scraping."""
+    type: Literal["executeJavascript"]
+    script: str
+
+class PDFAction(pydantic.BaseModel):
+    """PDF action to perform during scraping."""
+    type: Literal["pdf"]
+    format: Optional[Literal["A0", "A1", "A2", "A3", "A4", "A5", "A6", "Letter", "Legal", "Tabloid", "Ledger"]] = None
+    landscape: Optional[bool] = None
+    scale: Optional[float] = None
+
+class ExtractAgent(pydantic.BaseModel):
+    """Configuration for the agent in extract operations."""
+    model: Literal["FIRE-1"] = "FIRE-1"
+
+class JsonConfig(pydantic.BaseModel):
+    """Configuration for extraction."""
+    prompt: Optional[str] = None
+    schema_field: Optional[Any] = pydantic.Field(None, alias='schema')
+    systemPrompt: Optional[str] = None
+    agent: Optional[ExtractAgent] = None
+
+class ScrapeParams(ScrapeOptions):
+    """Parameters for scraping operations."""
+    extract: Optional[JsonConfig] = None
+    jsonOptions: Optional[JsonConfig] = None
+    actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None
+    agent: Optional[AgentOptions] = None
+    webhook: Optional[WebhookConfig] = None
+
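A hedged composition sketch (illustrative selectors and prompt) showing how the action models and JsonConfig defined above slot into ScrapeParams:

params = ScrapeParams(
    formats=["markdown", "json"],
    actions=[
        WaitAction(type="wait", milliseconds=1000),
        ClickAction(type="click", selector="#load-more"),
        ScreenshotAction(type="screenshot", fullPage=True),
    ],
    jsonOptions=JsonConfig(prompt="Extract the page title and author."),
)
print(params.dict(by_alias=True, exclude_none=True)["actions"][0])  # {'type': 'wait', 'milliseconds': 1000}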
+class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
+    """Response from scraping operations."""
+    success: bool = True
+    warning: Optional[str] = None
+    error: Optional[str] = None
+
+class BatchScrapeResponse(pydantic.BaseModel):
+    """Response from batch scrape operations."""
+    id: Optional[str] = None
+    url: Optional[str] = None
+    success: bool = True
+    error: Optional[str] = None
+    invalidURLs: Optional[List[str]] = None
+
+class BatchScrapeStatusResponse(pydantic.BaseModel):
+    """Response from batch scrape status checks."""
+    success: bool = True
+    status: Literal["scraping", "completed", "failed", "cancelled"]
+    completed: int
+    total: int
+    creditsUsed: int
+    expiresAt: datetime
+    next: Optional[str] = None
+    data: List[FirecrawlDocument]
+
+class CrawlParams(pydantic.BaseModel):
+    """Parameters for crawling operations."""
+    includePaths: Optional[List[str]] = None
+    excludePaths: Optional[List[str]] = None
+    maxDepth: Optional[int] = None
+    maxDiscoveryDepth: Optional[int] = None
+    limit: Optional[int] = None
+    allowBackwardLinks: Optional[bool] = None
+    crawlEntireDomain: Optional[bool] = None
+    allowExternalLinks: Optional[bool] = None
+    ignoreSitemap: Optional[bool] = None
+    scrapeOptions: Optional[ScrapeOptions] = None
+    webhook: Optional[Union[str, WebhookConfig]] = None
+    deduplicateSimilarURLs: Optional[bool] = None
+    ignoreQueryParameters: Optional[bool] = None
+    regexOnFullURL: Optional[bool] = None
+    delay: Optional[int] = None # Delay in seconds between scrapes
+    maxConcurrency: Optional[int] = None
+    allowSubdomains: Optional[bool] = None
+
+class CrawlResponse(pydantic.BaseModel):
+    """Response from crawling operations."""
+    id: Optional[str] = None
+    url: Optional[str] = None
+    success: bool = True
+    error: Optional[str] = None
+
+class CrawlStatusResponse(pydantic.BaseModel):
+    """Response from crawl status checks."""
+    success: bool = True
+    status: Literal["scraping", "completed", "failed", "cancelled"]
+    completed: int
+    total: int
+    creditsUsed: int
+    expiresAt: datetime
+    next: Optional[str] = None
+    data: List[FirecrawlDocument]
+
+class CrawlErrorsResponse(pydantic.BaseModel):
+    """Response from crawl/batch scrape error monitoring."""
+    errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str}
+    robotsBlocked: List[str]
+
+class MapParams(pydantic.BaseModel):
+    """Parameters for mapping operations."""
+    search: Optional[str] = None
+    ignoreSitemap: Optional[bool] = None
+    includeSubdomains: Optional[bool] = None
+    sitemapOnly: Optional[bool] = None
+    limit: Optional[int] = None
+    timeout: Optional[int] = 30000
+    useIndex: Optional[bool] = None
+
+class MapResponse(pydantic.BaseModel):
+    """Response from mapping operations."""
+    success: bool = True
+    links: Optional[List[str]] = None
+    error: Optional[str] = None
+
+class ExtractParams(pydantic.BaseModel):
+    """Parameters for extracting information from URLs."""
+    prompt: Optional[str] = None
+    schema_field: Optional[Any] = pydantic.Field(None, alias='schema')
+    systemPrompt: Optional[str] = None
+    allowExternalLinks: Optional[bool] = None
+    enableWebSearch: Optional[bool] = None
+    includeSubdomains: Optional[bool] = None
+    origin: Optional[str] = None
+    showSources: Optional[bool] = None
+    scrapeOptions: Optional[ScrapeOptions] = None
+
+class ExtractResponse(pydantic.BaseModel, Generic[T]):
+    """Response from extract operations."""
+    id: Optional[str] = None
+    status: Optional[Literal["processing", "completed", "failed"]] = None
+    expiresAt: Optional[datetime] = None
+    success: bool = True
+    data: Optional[T] = None
+    error: Optional[str] = None
+    warning: Optional[str] = None
+    sources: Optional[Dict[Any, Any]] = None
+
+class SearchParams(pydantic.BaseModel):
+    query: str
+    limit: Optional[int] = 5
+    tbs: Optional[str] = None
+    filter: Optional[str] = None
+    lang: Optional[str] = "en"
+    country: Optional[str] = "us"
+    location: Optional[str] = None
+    origin: Optional[str] = "api"
+    timeout: Optional[int] = 60000
+    scrapeOptions: Optional[ScrapeOptions] = None
+
+class SearchResponse(pydantic.BaseModel):
+    """Response from search operations."""
+    success: bool = True
+    data: List[FirecrawlDocument]
+    warning: Optional[str] = None
+    error: Optional[str] = None
+
+class GenerateLLMsTextParams(pydantic.BaseModel):
+    """
+    Parameters for the LLMs.txt generation operation.
+    """
+    maxUrls: Optional[int] = 10
+    showFullText: Optional[bool] = False
+    cache: Optional[bool] = True
+    __experimental_stream: Optional[bool] = None
+
+class DeepResearchParams(pydantic.BaseModel):
+    """
+    Parameters for the deep research operation.
+    """
+    maxDepth: Optional[int] = 7
+    timeLimit: Optional[int] = 270
+    maxUrls: Optional[int] = 20
+    analysisPrompt: Optional[str] = None
+    systemPrompt: Optional[str] = None
+    __experimental_streamSteps: Optional[bool] = None
+
+class DeepResearchResponse(pydantic.BaseModel):
+    """
+    Response from the deep research operation.
+    """
+    success: bool
+    id: str
+    error: Optional[str] = None
+
+class DeepResearchStatusResponse(pydantic.BaseModel):
+    """
+    Status response from the deep research operation.
+    """
+    success: bool
+    data: Optional[Dict[str, Any]] = None
+    status: str
+    error: Optional[str] = None
+    expiresAt: str
+    currentDepth: int
+    maxDepth: int
+    activities: List[Dict[str, Any]]
+    sources: List[Dict[str, Any]]
+    summaries: List[str]
+
+class GenerateLLMsTextResponse(pydantic.BaseModel):
+    """Response from LLMs.txt generation operations."""
+    success: bool = True
+    id: str
+    error: Optional[str] = None
+
+class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
+    llmstxt: str
+    llmsfulltxt: Optional[str] = None
+
+class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
+    """Status response from LLMs.txt generation operations."""
+    success: bool = True
+    data: Optional[GenerateLLMsTextStatusResponseData] = None
+    status: Literal["processing", "completed", "failed"]
+    error: Optional[str] = None
+    expiresAt: str
+
+class SearchResponse(pydantic.BaseModel):
+    """
+    Response from the search operation.
+    """
+    success: bool
+    data: List[Dict[str, Any]]
+    warning: Optional[str] = None
+    error: Optional[str] = None
+
+class ExtractParams(pydantic.BaseModel):
+    """
+    Parameters for the extract operation.
+    """
+    prompt: Optional[str] = None
+    schema_field: Optional[Any] = pydantic.Field(None, alias='schema')
+    system_prompt: Optional[str] = None
+    allow_external_links: Optional[bool] = False
+    enable_web_search: Optional[bool] = False
+    # Just for backwards compatibility
+    enableWebSearch: Optional[bool] = False
+    show_sources: Optional[bool] = False
+    agent: Optional[Dict[str, Any]] = None
+
+class FirecrawlApp:
+    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
+        """
+        Initialize the FirecrawlApp instance with API key, API URL.
+
+        Args:
+            api_key (Optional[str]): API key for authenticating with the Firecrawl API.
+            api_url (Optional[str]): Base URL for the Firecrawl API.
+        """
+        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
+        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
+
+        # Only require API key when using cloud service
+        if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
+            logger.warning("No API key provided for cloud service")
+            raise ValueError('No API key provided')
+
+        logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
+
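A minimal instantiation sketch; the key shown is a placeholder, and the FIRECRAWL_API_KEY / FIRECRAWL_API_URL environment variables are honored as described above:

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
# Self-hosted deployments can point at their own instance and skip the cloud key check:
# app = FirecrawlApp(api_url="http://localhost:3002")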
+    def scrape_url(
+        self,
+        url: str,
+        *,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
+        headers: Optional[Dict[str, str]] = None,
+        include_tags: Optional[List[str]] = None,
+        exclude_tags: Optional[List[str]] = None,
+        only_main_content: Optional[bool] = None,
+        wait_for: Optional[int] = None,
+        timeout: Optional[int] = 30000,
+        location: Optional[LocationConfig] = None,
+        mobile: Optional[bool] = None,
+        skip_tls_verification: Optional[bool] = None,
+        remove_base64_images: Optional[bool] = None,
+        block_ads: Optional[bool] = None,
+        proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
+        parse_pdf: Optional[bool] = None,
+        extract: Optional[JsonConfig] = None,
+        json_options: Optional[JsonConfig] = None,
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
+        change_tracking_options: Optional[ChangeTrackingOptions] = None,
+        max_age: Optional[int] = None,
+        store_in_cache: Optional[bool] = None,
+        zero_data_retention: Optional[bool] = None,
+        agent: Optional[AgentOptions] = None,
+        **kwargs) -> ScrapeResponse[Any]:
+        """
+        Scrape and extract content from a URL.
+
+        Args:
+            url (str): Target URL to scrape
+            formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
+            headers (Optional[Dict[str, str]]): Custom HTTP headers
+            include_tags (Optional[List[str]]): HTML tags to include
+            exclude_tags (Optional[List[str]]): HTML tags to exclude
+            only_main_content (Optional[bool]): Extract main content only
+            wait_for (Optional[int]): Wait for a specific element to appear
+            timeout (Optional[int]): Request timeout (ms)
+            location (Optional[LocationConfig]): Location configuration
+            mobile (Optional[bool]): Use mobile user agent
+            skip_tls_verification (Optional[bool]): Skip TLS verification
+            remove_base64_images (Optional[bool]): Remove base64 images
+            block_ads (Optional[bool]): Block ads
+            proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
+            extract (Optional[JsonConfig]): Content extraction settings
+            json_options (Optional[JsonConfig]): JSON extraction settings
+            actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
+            change_tracking_options (Optional[ChangeTrackingOptions]): Change tracking settings
+            zero_data_retention (Optional[bool]): Whether to delete data after scrape is done
+            agent (Optional[AgentOptions]): Agent configuration for FIRE-1 model
+
+
+        Returns:
+            ScrapeResponse with:
+            * Requested content formats
+            * Page metadata
+            * Extraction results
+            * Success/error status
+
+        Raises:
+            Exception: If scraping fails
+        """
+        # Validate any additional kwargs
+        self._validate_kwargs(kwargs, "scrape_url")
+
+        _headers = self._prepare_headers()
+
+        # Build scrape parameters
+        scrape_params = {
+            'url': url,
+            'origin': f"python-sdk@{version}"
+        }
+
+        # Add optional parameters if provided
+        if formats:
+            scrape_params['formats'] = formats
+        if headers:
+            scrape_params['headers'] = headers
+        if include_tags:
+            scrape_params['includeTags'] = include_tags
+        if exclude_tags:
+            scrape_params['excludeTags'] = exclude_tags
+        if only_main_content is not None:
+            scrape_params['onlyMainContent'] = only_main_content
+        if wait_for:
+            scrape_params['waitFor'] = wait_for
+        if timeout:
+            scrape_params['timeout'] = timeout
+        if location:
+            scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
+        if mobile is not None:
+            scrape_params['mobile'] = mobile
+        if skip_tls_verification is not None:
+            scrape_params['skipTlsVerification'] = skip_tls_verification
+        if remove_base64_images is not None:
+            scrape_params['removeBase64Images'] = remove_base64_images
+        if block_ads is not None:
+            scrape_params['blockAds'] = block_ads
+        if proxy:
+            scrape_params['proxy'] = proxy
+        if parse_pdf is not None:
+            scrape_params['parsePDF'] = parse_pdf
+        if extract is not None:
+            extract = self._ensure_schema_dict(extract)
+            if isinstance(extract, dict) and "schema" in extract:
+                extract["schema"] = self._ensure_schema_dict(extract["schema"])
+            scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
+        if json_options is not None:
+            json_options = self._ensure_schema_dict(json_options)
+            if isinstance(json_options, dict) and "schema" in json_options:
+                json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
+            scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
+        if actions:
+            scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(by_alias=True, exclude_none=True) for action in actions]
+        if change_tracking_options:
+            scrape_params['changeTrackingOptions'] = change_tracking_options if isinstance(change_tracking_options, dict) else change_tracking_options.dict(by_alias=True, exclude_none=True)
+        if max_age is not None:
+            scrape_params['maxAge'] = max_age
+        if store_in_cache is not None:
+            scrape_params['storeInCache'] = store_in_cache
+        if zero_data_retention is not None:
+            scrape_params['zeroDataRetention'] = zero_data_retention
+        if agent is not None:
+            scrape_params['agent'] = agent.dict(by_alias=True, exclude_none=True)
+
+        scrape_params.update(kwargs)
+
+        if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
+            scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
+        if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
+            scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
+
+        # Make request
+        response = requests.post(
+            f'{self.api_url}/v1/scrape',
+            headers=_headers,
+            json=scrape_params,
+            timeout=(timeout / 1000.0 + 5 if timeout is not None else None)
+        )
+
+        if response.status_code == 200:
+            try:
+                response_json = response.json()
+                if response_json.get('success') and 'data' in response_json:
+                    return ScrapeResponse(**response_json['data'])
+                elif "error" in response_json:
+                    raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
+                else:
+                    raise Exception(f'Failed to scrape URL. Error: {response_json}')
+            except ValueError:
+                raise Exception('Failed to parse Firecrawl response as JSON.')
+        else:
+            self._handle_error(response, 'scrape URL')
+
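An illustrative call against the method above, reusing the hypothetical `app` instance from the __init__ sketch; the URL and options are arbitrary:

result = app.scrape_url(
    "https://firecrawl.dev",
    formats=["markdown", "links"],
    only_main_content=True,
)
print(result.markdown[:200] if result.markdown else result.error)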
+    def search(
+        self,
+        query: str,
+        *,
+        limit: Optional[int] = None,
+        tbs: Optional[str] = None,
+        filter: Optional[str] = None,
+        lang: Optional[str] = None,
+        country: Optional[str] = None,
+        location: Optional[str] = None,
+        timeout: Optional[int] = 30000,
+        scrape_options: Optional[ScrapeOptions] = None,
+        **kwargs) -> SearchResponse:
+        """
+        Search for content using Firecrawl.
+
+        Args:
+            query (str): Search query string
+            limit (Optional[int]): Max results (default: 5)
+            tbs (Optional[str]): Time filter (e.g. "qdr:d")
+            filter (Optional[str]): Custom result filter
+            lang (Optional[str]): Language code (default: "en")
+            country (Optional[str]): Country code (default: "us")
+            location (Optional[str]): Geo-targeting
+            timeout (Optional[int]): Request timeout in milliseconds
+            scrape_options (Optional[ScrapeOptions]): Result scraping configuration
+            **kwargs: Additional keyword arguments for future compatibility
+
+        Returns:
+            SearchResponse: Response containing:
+            * success (bool): Whether request succeeded
+            * data (List[FirecrawlDocument]): Search results
+            * warning (Optional[str]): Warning message if any
+            * error (Optional[str]): Error message if any
+
+        Raises:
+            Exception: If search fails or response cannot be parsed
+        """
+        # Validate any additional kwargs
+        self._validate_kwargs(kwargs, "search")
+
+        # Build search parameters
+        search_params = {}
+
+        # Add individual parameters
+        if limit is not None:
+            search_params['limit'] = limit
+        if tbs is not None:
+            search_params['tbs'] = tbs
+        if filter is not None:
+            search_params['filter'] = filter
+        if lang is not None:
+            search_params['lang'] = lang
+        if country is not None:
+            search_params['country'] = country
+        if location is not None:
+            search_params['location'] = location
+        if timeout is not None:
+            search_params['timeout'] = timeout
+        if scrape_options is not None:
+            search_params['scrapeOptions'] = scrape_options.dict(by_alias=True, exclude_none=True)
+
+        # Add any additional kwargs
+        search_params.update(kwargs)
+        _integration = search_params.get('integration')
+
+        # Create final params object
+        final_params = SearchParams(query=query, **search_params)
+        params_dict = final_params.dict(by_alias=True, exclude_none=True)
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        if _integration:
+            params_dict['integration'] = _integration
+
+        # Make request
+        response = requests.post(
+            f"{self.api_url}/v1/search",
+            headers={"Authorization": f"Bearer {self.api_key}"},
+            json=params_dict
+        )
+
+        if response.status_code == 200:
+            try:
+                response_json = response.json()
+                if response_json.get('success') and 'data' in response_json:
+                    return SearchResponse(**response_json)
+                elif "error" in response_json:
+                    raise Exception(f'Search failed. Error: {response_json["error"]}')
+                else:
+                    raise Exception(f'Search failed. Error: {response_json}')
+            except ValueError:
+                raise Exception('Failed to parse Firecrawl response as JSON.')
+        else:
+            self._handle_error(response, 'search')
+
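An illustrative search call with the same hypothetical `app` instance; note that the later SearchResponse definition in this file types `data` as plain dicts, so results are read with dict access here:

results = app.search("web scraping best practices", limit=3)
for item in results.data:
    print(item.get("url"), item.get("title"))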
+    def crawl_url(
+        self,
+        url: str,
+        *,
+        include_paths: Optional[List[str]] = None,
+        exclude_paths: Optional[List[str]] = None,
+        max_depth: Optional[int] = None,
+        max_discovery_depth: Optional[int] = None,
+        limit: Optional[int] = None,
+        allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
+        allow_external_links: Optional[bool] = None,
+        ignore_sitemap: Optional[bool] = None,
+        scrape_options: Optional[ScrapeOptions] = None,
+        webhook: Optional[Union[str, WebhookConfig]] = None,
+        deduplicate_similar_urls: Optional[bool] = None,
+        ignore_query_parameters: Optional[bool] = None,
+        regex_on_full_url: Optional[bool] = None,
+        delay: Optional[int] = None,
+        allow_subdomains: Optional[bool] = None,
+        max_concurrency: Optional[int] = None,
+        zero_data_retention: Optional[bool] = None,
+        poll_interval: Optional[int] = 2,
+        idempotency_key: Optional[str] = None,
+        **kwargs
+    ) -> CrawlStatusResponse:
+        """
+        Crawl a website starting from a URL.
+
+        Args:
+            url (str): Target URL to start crawling from
+            include_paths (Optional[List[str]]): Patterns of URLs to include
+            exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
+            max_depth (Optional[int]): Maximum crawl depth
+            max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
+            limit (Optional[int]): Maximum pages to crawl
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
+            allow_external_links (Optional[bool]): Follow external domain links
+            ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
+            scrape_options (Optional[ScrapeOptions]): Page scraping configuration
+            webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
+            deduplicate_similar_urls (Optional[bool]): Remove similar URLs
+            ignore_query_parameters (Optional[bool]): Ignore URL parameters
+            regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            delay (Optional[int]): Delay in seconds between scrapes
+            allow_subdomains (Optional[bool]): Follow subdomains
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
+            zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
+            poll_interval (Optional[int]): Seconds between status checks (default: 2)
+            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+            **kwargs: Additional parameters to pass to the API
+
+        Returns:
+            CrawlStatusResponse with:
+            * Crawling status and progress
+            * Crawled page contents
+            * Success/error information
+
+        Raises:
+            Exception: If crawl fails
+        """
+        # Validate any additional kwargs
+        self._validate_kwargs(kwargs, "crawl_url")
+
+        crawl_params = {}
+
+        # Add individual parameters
+        if include_paths is not None:
+            crawl_params['includePaths'] = include_paths
+        if exclude_paths is not None:
+            crawl_params['excludePaths'] = exclude_paths
+        if max_depth is not None:
+            crawl_params['maxDepth'] = max_depth
+        if max_discovery_depth is not None:
+            crawl_params['maxDiscoveryDepth'] = max_discovery_depth
+        if limit is not None:
+            crawl_params['limit'] = limit
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
+            crawl_params['allowBackwardLinks'] = allow_backward_links
+        if allow_external_links is not None:
+            crawl_params['allowExternalLinks'] = allow_external_links
+        if ignore_sitemap is not None:
+            crawl_params['ignoreSitemap'] = ignore_sitemap
+        if scrape_options is not None:
+            crawl_params['scrapeOptions'] = scrape_options.dict(by_alias=True, exclude_none=True)
+        if webhook is not None:
+            crawl_params['webhook'] = webhook
+        if deduplicate_similar_urls is not None:
+            crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
+        if ignore_query_parameters is not None:
+            crawl_params['ignoreQueryParameters'] = ignore_query_parameters
+        if regex_on_full_url is not None:
+            crawl_params['regexOnFullURL'] = regex_on_full_url
+        if delay is not None:
+            crawl_params['delay'] = delay
+        if allow_subdomains is not None:
+            crawl_params['allowSubdomains'] = allow_subdomains
+        if max_concurrency is not None:
+            crawl_params['maxConcurrency'] = max_concurrency
+        if zero_data_retention is not None:
+            crawl_params['zeroDataRetention'] = zero_data_retention
+        # Add any additional kwargs
+        crawl_params.update(kwargs)
+        _integration = crawl_params.get('integration')
+
+        # Create final params object
+        final_params = CrawlParams(**crawl_params)
+        params_dict = final_params.dict(by_alias=True, exclude_none=True)
+        params_dict['url'] = url
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        if _integration:
+            params_dict['integration'] = _integration
+
+        # Make request
+        headers = self._prepare_headers(idempotency_key)
+        response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
+
+        if response.status_code == 200:
+            try:
+                id = response.json().get('id')
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+            return self._monitor_job_status(id, headers, poll_interval)
+        else:
+            self._handle_error(response, 'start crawl job')
+
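A hedged usage sketch of the blocking crawl above; it polls until the job finishes and returns a CrawlStatusResponse (URL and limits are illustrative):

status = app.crawl_url(
    "https://docs.firecrawl.dev",
    limit=10,
    scrape_options=ScrapeOptions(formats=["markdown"]),
    poll_interval=5,
)
print(status.status, f"{status.completed}/{status.total} pages, {status.creditsUsed} credits")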
+    def async_crawl_url(
+        self,
+        url: str,
+        *,
+        include_paths: Optional[List[str]] = None,
+        exclude_paths: Optional[List[str]] = None,
+        max_depth: Optional[int] = None,
+        max_discovery_depth: Optional[int] = None,
+        limit: Optional[int] = None,
+        allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
+        allow_external_links: Optional[bool] = None,
+        ignore_sitemap: Optional[bool] = None,
+        scrape_options: Optional[ScrapeOptions] = None,
+        webhook: Optional[Union[str, WebhookConfig]] = None,
+        deduplicate_similar_urls: Optional[bool] = None,
+        ignore_query_parameters: Optional[bool] = None,
+        regex_on_full_url: Optional[bool] = None,
+        delay: Optional[int] = None,
+        allow_subdomains: Optional[bool] = None,
+        max_concurrency: Optional[int] = None,
+        zero_data_retention: Optional[bool] = None,
+        idempotency_key: Optional[str] = None,
+        **kwargs
+    ) -> CrawlResponse:
+        """
+        Start an asynchronous crawl job.
+
+        Args:
+            url (str): Target URL to start crawling from
+            include_paths (Optional[List[str]]): Patterns of URLs to include
+            exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
+            max_depth (Optional[int]): Maximum crawl depth
+            max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
+            limit (Optional[int]): Maximum pages to crawl
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
+            allow_external_links (Optional[bool]): Follow external domain links
+            ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
+            scrape_options (Optional[ScrapeOptions]): Page scraping configuration
+            webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
+            deduplicate_similar_urls (Optional[bool]): Remove similar URLs
+            ignore_query_parameters (Optional[bool]): Ignore URL parameters
+            regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            delay (Optional[int]): Delay in seconds between scrapes
+            allow_subdomains (Optional[bool]): Follow subdomains
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
+            zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
+            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+            **kwargs: Additional parameters to pass to the API
+
+        Returns:
+            CrawlResponse with:
+            * success - Whether crawl started successfully
+            * id - Unique identifier for the crawl job
+            * url - Status check URL for the crawl
+            * error - Error message if start failed
+
+        Raises:
+            Exception: If crawl initiation fails
+        """
+        # Validate any additional kwargs
+        self._validate_kwargs(kwargs, "async_crawl_url")
+
+        crawl_params = {}
+
+        # Add individual parameters
+        if include_paths is not None:
+            crawl_params['includePaths'] = include_paths
+        if exclude_paths is not None:
+            crawl_params['excludePaths'] = exclude_paths
+        if max_depth is not None:
+            crawl_params['maxDepth'] = max_depth
+        if max_discovery_depth is not None:
+            crawl_params['maxDiscoveryDepth'] = max_discovery_depth
+        if limit is not None:
+            crawl_params['limit'] = limit
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
+            crawl_params['allowBackwardLinks'] = allow_backward_links
+        if allow_external_links is not None:
+            crawl_params['allowExternalLinks'] = allow_external_links
+        if ignore_sitemap is not None:
+            crawl_params['ignoreSitemap'] = ignore_sitemap
+        if scrape_options is not None:
+            crawl_params['scrapeOptions'] = scrape_options.dict(by_alias=True, exclude_none=True)
+        if webhook is not None:
+            crawl_params['webhook'] = webhook
+        if deduplicate_similar_urls is not None:
+            crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
+        if ignore_query_parameters is not None:
+            crawl_params['ignoreQueryParameters'] = ignore_query_parameters
+        if regex_on_full_url is not None:
+            crawl_params['regexOnFullURL'] = regex_on_full_url
+        if delay is not None:
+            crawl_params['delay'] = delay
+        if allow_subdomains is not None:
+            crawl_params['allowSubdomains'] = allow_subdomains
+        if max_concurrency is not None:
+            crawl_params['maxConcurrency'] = max_concurrency
+        if zero_data_retention is not None:
+            crawl_params['zeroDataRetention'] = zero_data_retention
+        # Add any additional kwargs
+        crawl_params.update(kwargs)
+
+        # Create final params object
+        final_params = CrawlParams(**crawl_params)
+        params_dict = final_params.dict(by_alias=True, exclude_none=True)
+        params_dict['url'] = url
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        # Make request
+        headers = self._prepare_headers(idempotency_key)
+        response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
+
+        if response.status_code == 200:
+            try:
+                return CrawlResponse(**response.json())
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+        else:
+            self._handle_error(response, 'start crawl job')
+
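By contrast, the asynchronous variant only enqueues the job; a sketch that starts a crawl and checks it once via check_crawl_status (defined just below):

job = app.async_crawl_url("https://docs.firecrawl.dev", limit=10)
if job.success and job.id:
    snapshot = app.check_crawl_status(job.id)
    print(snapshot.status, f"{snapshot.completed}/{snapshot.total}")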
+    def check_crawl_status(self, id: str) -> CrawlStatusResponse:
+        """
+        Check the status and results of a crawl job.
+
+        Args:
+            id: Unique identifier for the crawl job
+
+        Returns:
+            CrawlStatusResponse containing:
+
+            Status Information:
+            * status - Current state (scraping/completed/failed/cancelled)
+            * completed - Number of pages crawled
+            * total - Total pages to crawl
+            * creditsUsed - API credits consumed
+            * expiresAt - Data expiration timestamp
+
+            Results:
+            * data - List of crawled documents
+            * next - URL for next page of results (if paginated)
+            * success - Whether status check succeeded
+            * error - Error message if failed
+
+        Raises:
+            Exception: If status check fails
+        """
+        endpoint = f'/v1/crawl/{id}'
+
+        headers = self._prepare_headers()
+        response = self._get_request(f'{self.api_url}{endpoint}', headers)
+        if response.status_code == 200:
+            try:
+                status_data = response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+            if status_data['status'] == 'completed':
+                if 'data' in status_data:
+                    data = status_data['data']
+                    while 'next' in status_data:
+                        if len(status_data['data']) == 0:
+                            break
+                        next_url = status_data.get('next')
+                        if not next_url:
+                            logger.warning("Expected 'next' URL is missing.")
+                            break
+                        try:
+                            status_response = self._get_request(next_url, headers)
+                            if status_response.status_code != 200:
+                                logger.error(f"Failed to fetch next page: {status_response.status_code}")
+                                break
+                            try:
+                                next_data = status_response.json()
+                            except:
+                                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+                            data.extend(next_data.get('data', []))
+                            status_data = next_data
+                        except Exception as e:
+                            logger.error(f"Error during pagination request: {e}")
+                            break
+                    status_data['data'] = data
+
+            response = {
+                'status': status_data.get('status'),
+                'total': status_data.get('total'),
+                'completed': status_data.get('completed'),
+                'creditsUsed': status_data.get('creditsUsed'),
+                'expiresAt': status_data.get('expiresAt'),
+                'data': status_data.get('data')
+            }
+
+            if 'error' in status_data:
+                response['error'] = status_data['error']
+
+            if 'next' in status_data:
+                response['next'] = status_data['next']
+
+            return CrawlStatusResponse(
+                success=False if 'error' in status_data else True,
+                **response
+            )
+        else:
+            self._handle_error(response, 'check crawl status')
+
+    def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
+        """
+        Returns information about crawl errors.
+
+        Args:
+            id (str): The ID of the crawl job
+
+        Returns:
+            CrawlErrorsResponse containing:
+            * errors (List[Dict[str, str]]): List of errors with fields:
+                - id (str): Error ID
+                - timestamp (str): When the error occurred
+                - url (str): URL that caused the error
+                - error (str): Error message
+            * robotsBlocked (List[str]): List of URLs blocked by robots.txt
+
+        Raises:
+            Exception: If error check fails
+        """
+        headers = self._prepare_headers()
+        response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
+        if response.status_code == 200:
+            try:
+                return CrawlErrorsResponse(**response.json())
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+        else:
+            self._handle_error(response, "check crawl errors")
+
+    def cancel_crawl(self, id: str) -> Dict[str, Any]:
+        """
+        Cancel an asynchronous crawl job.
+
+        Args:
+            id (str): The ID of the crawl job to cancel
+
+        Returns:
+            Dict[str, Any] containing:
+            * success (bool): Whether cancellation was successful
+            * error (str, optional): Error message if cancellation failed
+
+        Raises:
+            Exception: If cancellation fails
+        """
+        headers = self._prepare_headers()
+        response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
+        if response.status_code == 200:
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+        else:
+            self._handle_error(response, "cancel crawl job")
+
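Continuing the same hypothetical job, error inspection and cancellation could look like this; the exact shape of the cancellation payload is whatever the API returns:

report = app.check_crawl_errors(job.id)
print(len(report.errors), "errors,", len(report.robotsBlocked), "URLs blocked by robots.txt")
print(app.cancel_crawl(job.id))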
+    def crawl_url_and_watch(
+        self,
+        url: str,
+        *,
+        include_paths: Optional[List[str]] = None,
+        exclude_paths: Optional[List[str]] = None,
+        max_depth: Optional[int] = None,
+        max_discovery_depth: Optional[int] = None,
+        limit: Optional[int] = None,
+        allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
+        allow_external_links: Optional[bool] = None,
+        ignore_sitemap: Optional[bool] = None,
+        scrape_options: Optional[ScrapeOptions] = None,
+        webhook: Optional[Union[str, WebhookConfig]] = None,
+        deduplicate_similar_urls: Optional[bool] = None,
+        ignore_query_parameters: Optional[bool] = None,
+        regex_on_full_url: Optional[bool] = None,
+        delay: Optional[int] = None,
+        allow_subdomains: Optional[bool] = None,
+        max_concurrency: Optional[int] = None,
+        zero_data_retention: Optional[bool] = None,
+        idempotency_key: Optional[str] = None,
+        **kwargs
+    ) -> 'CrawlWatcher':
+        """
+        Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
+
+        Args:
+            url (str): Target URL to start crawling from
+            include_paths (Optional[List[str]]): Patterns of URLs to include
+            exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
+            max_depth (Optional[int]): Maximum crawl depth
+            max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
+            limit (Optional[int]): Maximum pages to crawl
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
+            allow_external_links (Optional[bool]): Follow external domain links
+            ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
+            scrape_options (Optional[ScrapeOptions]): Page scraping configuration
+            webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
+            deduplicate_similar_urls (Optional[bool]): Remove similar URLs
+            ignore_query_parameters (Optional[bool]): Ignore URL parameters
+            regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            delay (Optional[int]): Delay in seconds between scrapes
+            allow_subdomains (Optional[bool]): Follow subdomains
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
+            zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
+            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+            **kwargs: Additional parameters to pass to the API
+
+        Returns:
+            CrawlWatcher: An instance to monitor the crawl job via WebSocket
+
+        Raises:
+            Exception: If crawl job fails to start
+        """
+        crawl_response = self.async_crawl_url(
+            url,
+            include_paths=include_paths,
+            exclude_paths=exclude_paths,
+            max_depth=max_depth,
+            max_discovery_depth=max_discovery_depth,
+            limit=limit,
+            allow_backward_links=allow_backward_links,
+            crawl_entire_domain=crawl_entire_domain,
+            allow_external_links=allow_external_links,
+            ignore_sitemap=ignore_sitemap,
+            scrape_options=scrape_options,
+            webhook=webhook,
+            deduplicate_similar_urls=deduplicate_similar_urls,
+            ignore_query_parameters=ignore_query_parameters,
+            regex_on_full_url=regex_on_full_url,
+            delay=delay,
+            allow_subdomains=allow_subdomains,
+            max_concurrency=max_concurrency,
+            zero_data_retention=zero_data_retention,
+            idempotency_key=idempotency_key,
+            **kwargs
+        )
+        if crawl_response.success and crawl_response.id:
+            return CrawlWatcher(crawl_response.id, self)
+        else:
+            raise Exception("Crawl job failed to start")
+
+    def map_url(
+        self,
+        url: str,
+        *,
+        search: Optional[str] = None,
+        ignore_sitemap: Optional[bool] = None,
+        include_subdomains: Optional[bool] = None,
+        sitemap_only: Optional[bool] = None,
+        limit: Optional[int] = None,
+        timeout: Optional[int] = 30000,
+        use_index: Optional[bool] = None,
+        **kwargs) -> MapResponse:
+        """
+        Map and discover links from a URL.
+
+        Args:
+            url (str): Target URL to map
+            search (Optional[str]): Filter pattern for URLs
+            ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
+            include_subdomains (Optional[bool]): Include subdomain links
+            sitemap_only (Optional[bool]): Only use sitemap.xml
+            limit (Optional[int]): Maximum URLs to return
+            timeout (Optional[int]): Request timeout in milliseconds
+            **kwargs: Additional parameters to pass to the API
+
+        Returns:
+            MapResponse: Response containing:
+            * success (bool): Whether request succeeded
+            * links (List[str]): Discovered URLs
+            * error (Optional[str]): Error message if any
+
+        Raises:
+            Exception: If mapping fails or response cannot be parsed
+        """
+        # Validate any additional kwargs
+        self._validate_kwargs(kwargs, "map_url")
+
+        # Build map parameters
+        map_params = {}
+
+        # Add individual parameters
+        if search is not None:
+            map_params['search'] = search
+        if ignore_sitemap is not None:
+            map_params['ignoreSitemap'] = ignore_sitemap
+        if include_subdomains is not None:
+            map_params['includeSubdomains'] = include_subdomains
+        if sitemap_only is not None:
+            map_params['sitemapOnly'] = sitemap_only
+        if limit is not None:
+            map_params['limit'] = limit
+        if timeout is not None:
+            map_params['timeout'] = timeout
+        if use_index is not None:
+            map_params['useIndex'] = use_index
+
+        # Add any additional kwargs
+        map_params.update(kwargs)
+        _integration = map_params.get('integration')
+
+        # Create final params object
+        final_params = MapParams(**map_params)
+        params_dict = final_params.dict(by_alias=True, exclude_none=True)
+        params_dict['url'] = url
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        if _integration:
+            params_dict['integration'] = _integration
+
+        # Make request
+        response = requests.post(
+            f"{self.api_url}/v1/map",
+            headers={"Authorization": f"Bearer {self.api_key}"},
+            json=params_dict
+        )
+
+        if response.status_code == 200:
+            try:
+                response_json = response.json()
+                if response_json.get('success') and 'links' in response_json:
+                    return MapResponse(**response_json)
+                elif "error" in response_json:
+                    raise Exception(f'Map failed. Error: {response_json["error"]}')
+                else:
+                    raise Exception(f'Map failed. Error: {response_json}')
+            except ValueError:
+                raise Exception('Failed to parse Firecrawl response as JSON.')
+        else:
+            self._handle_error(response, 'map')
+
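An illustrative mapping call with the same hypothetical `app` instance; `search` narrows the discovered links:

site_map = app.map_url("https://firecrawl.dev", search="docs", limit=100)
for link in (site_map.links or [])[:10]:
    print(link)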
1273
+ def batch_scrape_urls(
1274
+ self,
1275
+ urls: List[str],
1276
+ *,
1277
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1278
+ headers: Optional[Dict[str, str]] = None,
1279
+ include_tags: Optional[List[str]] = None,
1280
+ exclude_tags: Optional[List[str]] = None,
1281
+ only_main_content: Optional[bool] = None,
1282
+ wait_for: Optional[int] = None,
1283
+ timeout: Optional[int] = 30000,
1284
+ location: Optional[LocationConfig] = None,
1285
+ mobile: Optional[bool] = None,
1286
+ skip_tls_verification: Optional[bool] = None,
1287
+ remove_base64_images: Optional[bool] = None,
1288
+ block_ads: Optional[bool] = None,
1289
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1290
+ extract: Optional[JsonConfig] = None,
1291
+ json_options: Optional[JsonConfig] = None,
1292
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1293
+ agent: Optional[AgentOptions] = None,
1294
+ poll_interval: Optional[int] = 2,
1295
+ max_concurrency: Optional[int] = None,
1296
+ zero_data_retention: Optional[bool] = None,
1297
+ idempotency_key: Optional[str] = None,
1298
+ **kwargs
1299
+ ) -> BatchScrapeStatusResponse:
1300
+ """
1301
+ Batch scrape multiple URLs and monitor until completion.
1302
+
1303
+ Args:
1304
+ urls (List[str]): URLs to scrape
1305
+ formats (Optional[List[Literal]]): Content formats to retrieve
1306
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1307
+ include_tags (Optional[List[str]]): HTML tags to include
1308
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1309
+ only_main_content (Optional[bool]): Extract main content only
1310
+ wait_for (Optional[int]): Wait time in milliseconds
1311
+ timeout (Optional[int]): Request timeout in milliseconds
1312
+ location (Optional[LocationConfig]): Location configuration
1313
+ mobile (Optional[bool]): Use mobile user agent
1314
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1315
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1316
+ block_ads (Optional[bool]): Block advertisements
1317
+ proxy (Optional[Literal]): Proxy type to use
1318
+ extract (Optional[JsonConfig]): Content extraction config
1319
+ json_options (Optional[JsonConfig]): JSON extraction config
1320
+ actions (Optional[List[Union]]): Actions to perform
1321
+ agent (Optional[AgentOptions]): Agent configuration
1322
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
1323
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
1324
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1325
+ **kwargs: Additional parameters to pass to the API
1326
+
1327
+ Returns:
1328
+ BatchScrapeStatusResponse with:
1329
+ * Scraping status and progress
1330
+ * Scraped content for each URL
1331
+ * Success/error information
1332
+
1333
+ Raises:
1334
+ Exception: If batch scrape fails
1335
+ """
1336
+ # Validate any additional kwargs
1337
+ self._validate_kwargs(kwargs, "batch_scrape_urls")
1338
+
1339
+ scrape_params = {}
1340
+
1341
+ # Add individual parameters
1342
+ if formats is not None:
1343
+ scrape_params['formats'] = formats
1344
+ if headers is not None:
1345
+ scrape_params['headers'] = headers
1346
+ if include_tags is not None:
1347
+ scrape_params['includeTags'] = include_tags
1348
+ if exclude_tags is not None:
1349
+ scrape_params['excludeTags'] = exclude_tags
1350
+ if only_main_content is not None:
1351
+ scrape_params['onlyMainContent'] = only_main_content
1352
+ if wait_for is not None:
1353
+ scrape_params['waitFor'] = wait_for
1354
+ if timeout is not None:
1355
+ scrape_params['timeout'] = timeout
1356
+ if location is not None:
1357
+ scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
1358
+ if mobile is not None:
1359
+ scrape_params['mobile'] = mobile
1360
+ if skip_tls_verification is not None:
1361
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1362
+ if remove_base64_images is not None:
1363
+ scrape_params['removeBase64Images'] = remove_base64_images
1364
+ if block_ads is not None:
1365
+ scrape_params['blockAds'] = block_ads
1366
+ if proxy is not None:
1367
+ scrape_params['proxy'] = proxy
1368
+ if extract is not None:
1369
+ extract = self._ensure_schema_dict(extract)
1370
+ if isinstance(extract, dict) and "schema" in extract:
1371
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1372
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
1373
+ if json_options is not None:
1374
+ json_options = self._ensure_schema_dict(json_options)
1375
+ if isinstance(json_options, dict) and "schema" in json_options:
1376
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1377
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
1378
+ if actions:
1379
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(by_alias=True, exclude_none=True) for action in actions]
1380
+ if agent is not None:
1381
+ scrape_params['agent'] = agent.dict(by_alias=True, exclude_none=True)
1382
+ if max_concurrency is not None:
1383
+ scrape_params['maxConcurrency'] = max_concurrency
1384
+ if zero_data_retention is not None:
1385
+ scrape_params['zeroDataRetention'] = zero_data_retention
1386
+
1387
+ # Add any additional kwargs
1388
+ scrape_params.update(kwargs)
1389
+
1390
+ # Create final params object
1391
+ final_params = ScrapeParams(**scrape_params)
1392
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
1393
+ params_dict['urls'] = urls
1394
+ params_dict['origin'] = f"python-sdk@{version}"
1395
+
1396
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1397
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1398
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1399
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1400
+
1401
+ # Make request
1402
+ headers = self._prepare_headers(idempotency_key)
1403
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1404
+
1405
+ if response.status_code == 200:
1406
+ try:
1407
+ id = response.json().get('id')
1408
+ except:
1409
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1410
+ return self._monitor_job_status(id, headers, poll_interval)
1411
+ else:
1412
+ self._handle_error(response, 'start batch scrape job')
1413
+
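A usage sketch for batch_scrape_urls, assuming an existing app instance; the URLs and poll interval are placeholders. The call blocks until the job completes and returns the aggregated status:

    status = app.batch_scrape_urls(
        ["https://example.com/a", "https://example.com/b"],
        formats=["markdown"],
        poll_interval=5,
    )
    print(status.status, f"{status.completed}/{status.total} pages scraped")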
1414
+ def async_batch_scrape_urls(
1415
+ self,
1416
+ urls: List[str],
1417
+ *,
1418
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1419
+ headers: Optional[Dict[str, str]] = None,
1420
+ include_tags: Optional[List[str]] = None,
1421
+ exclude_tags: Optional[List[str]] = None,
1422
+ only_main_content: Optional[bool] = None,
1423
+ wait_for: Optional[int] = None,
1424
+ timeout: Optional[int] = 30000,
1425
+ location: Optional[LocationConfig] = None,
1426
+ mobile: Optional[bool] = None,
1427
+ skip_tls_verification: Optional[bool] = None,
1428
+ remove_base64_images: Optional[bool] = None,
1429
+ block_ads: Optional[bool] = None,
1430
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1431
+ extract: Optional[JsonConfig] = None,
1432
+ json_options: Optional[JsonConfig] = None,
1433
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1434
+ agent: Optional[AgentOptions] = None,
1435
+ max_concurrency: Optional[int] = None,
1436
+ idempotency_key: Optional[str] = None,
1437
+ zero_data_retention: Optional[bool] = None,
1438
+ **kwargs
1439
+ ) -> BatchScrapeResponse:
1440
+ """
1441
+ Initiate a batch scrape job asynchronously.
1442
+
1443
+ Args:
1444
+ urls (List[str]): URLs to scrape
1445
+ formats (Optional[List[Literal]]): Content formats to retrieve
1446
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1447
+ include_tags (Optional[List[str]]): HTML tags to include
1448
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1449
+ only_main_content (Optional[bool]): Extract main content only
1450
+ wait_for (Optional[int]): Wait time in milliseconds
1451
+ timeout (Optional[int]): Request timeout in milliseconds
1452
+ location (Optional[LocationConfig]): Location configuration
1453
+ mobile (Optional[bool]): Use mobile user agent
1454
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1455
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1456
+ block_ads (Optional[bool]): Block advertisements
1457
+ proxy (Optional[Literal]): Proxy type to use
1458
+ extract (Optional[JsonConfig]): Content extraction config
1459
+ json_options (Optional[JsonConfig]): JSON extraction config
1460
+ actions (Optional[List[Union]]): Actions to perform
1461
+ agent (Optional[AgentOptions]): Agent configuration
1462
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1463
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
1464
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1465
+ **kwargs: Additional parameters to pass to the API
1466
+
1467
+ Returns:
1468
+ BatchScrapeResponse with:
1469
+ * success - Whether job started successfully
1470
+ * id - Unique identifier for the job
1471
+ * url - Status check URL
1472
+ * error - Error message if start failed
1473
+
1474
+ Raises:
1475
+ Exception: If job initiation fails
1476
+ """
1477
+ # Validate any additional kwargs
1478
+ self._validate_kwargs(kwargs, "async_batch_scrape_urls")
1479
+
1480
+ scrape_params = {}
1481
+
1482
+ # Add individual parameters
1483
+ if formats is not None:
1484
+ scrape_params['formats'] = formats
1485
+ if headers is not None:
1486
+ scrape_params['headers'] = headers
1487
+ if include_tags is not None:
1488
+ scrape_params['includeTags'] = include_tags
1489
+ if exclude_tags is not None:
1490
+ scrape_params['excludeTags'] = exclude_tags
1491
+ if only_main_content is not None:
1492
+ scrape_params['onlyMainContent'] = only_main_content
1493
+ if wait_for is not None:
1494
+ scrape_params['waitFor'] = wait_for
1495
+ if timeout is not None:
1496
+ scrape_params['timeout'] = timeout
1497
+ if location is not None:
1498
+ scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
1499
+ if mobile is not None:
1500
+ scrape_params['mobile'] = mobile
1501
+ if skip_tls_verification is not None:
1502
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1503
+ if remove_base64_images is not None:
1504
+ scrape_params['removeBase64Images'] = remove_base64_images
1505
+ if block_ads is not None:
1506
+ scrape_params['blockAds'] = block_ads
1507
+ if proxy is not None:
1508
+ scrape_params['proxy'] = proxy
1509
+ if extract is not None:
1510
+ extract = self._ensure_schema_dict(extract)
1511
+ if isinstance(extract, dict) and "schema" in extract:
1512
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1513
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
1514
+ if json_options is not None:
1515
+ json_options = self._ensure_schema_dict(json_options)
1516
+ if isinstance(json_options, dict) and "schema" in json_options:
1517
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1518
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
1519
+ if actions:
1520
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(by_alias=True, exclude_none=True) for action in actions]
1521
+ if agent is not None:
1522
+ scrape_params['agent'] = agent.dict(by_alias=True, exclude_none=True)
1523
+ if max_concurrency is not None:
1524
+ scrape_params['maxConcurrency'] = max_concurrency
1525
+ if zero_data_retention is not None:
1526
+ scrape_params['zeroDataRetention'] = zero_data_retention
1527
+
1528
+ # Add any additional kwargs
1529
+ scrape_params.update(kwargs)
1530
+
1531
+ # Create final params object
1532
+ final_params = ScrapeParams(**scrape_params)
1533
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
1534
+ params_dict['urls'] = urls
1535
+ params_dict['origin'] = f"python-sdk@{version}"
1536
+
1537
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1538
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1539
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1540
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1541
+
1542
+ # Make request
1543
+ headers = self._prepare_headers(idempotency_key)
1544
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1545
+
1546
+ if response.status_code == 200:
1547
+ try:
1548
+ return BatchScrapeResponse(**response.json())
1549
+ except:
1550
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1551
+ else:
1552
+ self._handle_error(response, 'start batch scrape job')
1553
+
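A sketch of the non-blocking variant, assuming an existing app instance; it only starts the job and returns its identifier:

    job = app.async_batch_scrape_urls(["https://example.com"], formats=["markdown"])
    if job.success:
        print("batch job started:", job.id)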
1554
+ def batch_scrape_urls_and_watch(
1555
+ self,
1556
+ urls: List[str],
1557
+ *,
1558
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1559
+ headers: Optional[Dict[str, str]] = None,
1560
+ include_tags: Optional[List[str]] = None,
1561
+ exclude_tags: Optional[List[str]] = None,
1562
+ only_main_content: Optional[bool] = None,
1563
+ wait_for: Optional[int] = None,
1564
+ timeout: Optional[int] = 30000,
1565
+ location: Optional[LocationConfig] = None,
1566
+ mobile: Optional[bool] = None,
1567
+ skip_tls_verification: Optional[bool] = None,
1568
+ remove_base64_images: Optional[bool] = None,
1569
+ block_ads: Optional[bool] = None,
1570
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1571
+ extract: Optional[JsonConfig] = None,
1572
+ json_options: Optional[JsonConfig] = None,
1573
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1574
+ agent: Optional[AgentOptions] = None,
1575
+ max_concurrency: Optional[int] = None,
1576
+ zero_data_retention: Optional[bool] = None,
1577
+ idempotency_key: Optional[str] = None,
1578
+ **kwargs
1579
+ ) -> 'CrawlWatcher':
1580
+ """
1581
+ Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
1582
+
1583
+ Args:
1584
+ urls (List[str]): URLs to scrape
1585
+ formats (Optional[List[Literal]]): Content formats to retrieve
1586
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1587
+ include_tags (Optional[List[str]]): HTML tags to include
1588
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1589
+ only_main_content (Optional[bool]): Extract main content only
1590
+ wait_for (Optional[int]): Wait time in milliseconds
1591
+ timeout (Optional[int]): Request timeout in milliseconds
1592
+ location (Optional[LocationConfig]): Location configuration
1593
+ mobile (Optional[bool]): Use mobile user agent
1594
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1595
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1596
+ block_ads (Optional[bool]): Block advertisements
1597
+ proxy (Optional[Literal]): Proxy type to use
1598
+ extract (Optional[JsonConfig]): Content extraction config
1599
+ json_options (Optional[JsonConfig]): JSON extraction config
1600
+ actions (Optional[List[Union]]): Actions to perform
1601
+ agent (Optional[AgentOptions]): Agent configuration
1602
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1603
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
1604
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1605
+ **kwargs: Additional parameters to pass to the API
1606
+
1607
+ Returns:
1608
+ CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
1609
+
1610
+ Raises:
1611
+ Exception: If batch scrape job fails to start
1612
+ """
1613
+ # Validate any additional kwargs
1614
+ self._validate_kwargs(kwargs, "batch_scrape_urls_and_watch")
1615
+
1616
+ scrape_params = {}
1617
+
1618
+ # Add individual parameters
1619
+ if formats is not None:
1620
+ scrape_params['formats'] = formats
1621
+ if headers is not None:
1622
+ scrape_params['headers'] = headers
1623
+ if include_tags is not None:
1624
+ scrape_params['includeTags'] = include_tags
1625
+ if exclude_tags is not None:
1626
+ scrape_params['excludeTags'] = exclude_tags
1627
+ if only_main_content is not None:
1628
+ scrape_params['onlyMainContent'] = only_main_content
1629
+ if wait_for is not None:
1630
+ scrape_params['waitFor'] = wait_for
1631
+ if timeout is not None:
1632
+ scrape_params['timeout'] = timeout
1633
+ if location is not None:
1634
+ scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
1635
+ if mobile is not None:
1636
+ scrape_params['mobile'] = mobile
1637
+ if skip_tls_verification is not None:
1638
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1639
+ if remove_base64_images is not None:
1640
+ scrape_params['removeBase64Images'] = remove_base64_images
1641
+ if block_ads is not None:
1642
+ scrape_params['blockAds'] = block_ads
1643
+ if proxy is not None:
1644
+ scrape_params['proxy'] = proxy
1645
+ if extract is not None:
1646
+ extract = self._ensure_schema_dict(extract)
1647
+ if isinstance(extract, dict) and "schema" in extract:
1648
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1649
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
1650
+ if json_options is not None:
1651
+ json_options = self._ensure_schema_dict(json_options)
1652
+ if isinstance(json_options, dict) and "schema" in json_options:
1653
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1654
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
1655
+ if actions:
1656
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(by_alias=True, exclude_none=True) for action in actions]
1657
+ if agent is not None:
1658
+ scrape_params['agent'] = agent.dict(by_alias=True, exclude_none=True)
1659
+ if max_concurrency is not None:
1660
+ scrape_params['maxConcurrency'] = max_concurrency
1661
+ if zero_data_retention is not None:
1662
+ scrape_params['zeroDataRetention'] = zero_data_retention
1663
+
1664
+ # Add any additional kwargs
1665
+ scrape_params.update(kwargs)
1666
+
1667
+ # Create final params object
1668
+ final_params = ScrapeParams(**scrape_params)
1669
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
1670
+ params_dict['urls'] = urls
1671
+ params_dict['origin'] = f"python-sdk@{version}"
1672
+
1673
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1674
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1675
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1676
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1677
+
1678
+ # Make request
1679
+ headers = self._prepare_headers(idempotency_key)
1680
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1681
+
1682
+ if response.status_code == 200:
1683
+ try:
1684
+ crawl_response = BatchScrapeResponse(**response.json())
1685
+ except:
1686
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1687
+ if crawl_response.success and crawl_response.id:
1688
+ return CrawlWatcher(crawl_response.id, self)
1689
+ else:
1690
+ raise Exception("Batch scrape job failed to start")
1691
+ else:
1692
+ self._handle_error(response, 'start batch scrape job')
1693
+
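A sketch of watching a batch scrape over WebSocket, assuming an existing app instance and an asyncio event loop; the listeners only print, and the "done" payload fields follow the dispatch shown in CrawlWatcher below:

    import asyncio

    async def watch():
        watcher = app.batch_scrape_urls_and_watch(["https://example.com"], formats=["markdown"])
        watcher.add_event_listener("document", lambda detail: print("document received"))
        watcher.add_event_listener("done", lambda detail: print("finished with status:", detail["status"]))
        await watcher.connect()

    asyncio.run(watch())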
1694
+ def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
1695
+ """
1696
+ Check the status of a batch scrape job using the Firecrawl API.
1697
+
1698
+ Args:
1699
+ id (str): The ID of the batch scrape job.
1700
+
1701
+ Returns:
1702
+ BatchScrapeStatusResponse: The status of the batch scrape job.
1703
+
1704
+ Raises:
1705
+ Exception: If the status check request fails.
1706
+ """
1707
+ endpoint = f'/v1/batch/scrape/{id}'
1708
+
1709
+ headers = self._prepare_headers()
1710
+ response = self._get_request(f'{self.api_url}{endpoint}', headers)
1711
+ if response.status_code == 200:
1712
+ try:
1713
+ status_data = response.json()
1714
+ except:
1715
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1716
+ if status_data['status'] == 'completed':
1717
+ if 'data' in status_data:
1718
+ data = status_data['data']
1719
+ while 'next' in status_data:
1720
+ if len(status_data['data']) == 0:
1721
+ break
1722
+ next_url = status_data.get('next')
1723
+ if not next_url:
1724
+ logger.warning("Expected 'next' URL is missing.")
1725
+ break
1726
+ try:
1727
+ status_response = self._get_request(next_url, headers)
1728
+ if status_response.status_code != 200:
1729
+ logger.error(f"Failed to fetch next page: {status_response.status_code}")
1730
+ break
1731
+ try:
1732
+ next_data = status_response.json()
1733
+ except:
1734
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1735
+ data.extend(next_data.get('data', []))
1736
+ status_data = next_data
1737
+ except Exception as e:
1738
+ logger.error(f"Error during pagination request: {e}")
1739
+ break
1740
+ status_data['data'] = data
1741
+
1742
+ return BatchScrapeStatusResponse(**{
1743
+ 'success': False if 'error' in status_data else True,
1744
+ 'status': status_data.get('status'),
1745
+ 'total': status_data.get('total'),
1746
+ 'completed': status_data.get('completed'),
1747
+ 'creditsUsed': status_data.get('creditsUsed'),
1748
+ 'expiresAt': status_data.get('expiresAt'),
1749
+ 'data': status_data.get('data'),
1750
+ 'next': status_data.get('next'),
1751
+ 'error': status_data.get('error')
1752
+ })
1753
+ else:
1754
+ self._handle_error(response, 'check batch scrape status')
1755
+
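A status-check sketch; the job id is a placeholder for an id returned by async_batch_scrape_urls:

    status = app.check_batch_scrape_status("your-batch-job-id")
    print(status.status)
    if status.status == "completed":
        print("pages returned:", len(status.data or []))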
1756
+ def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
1757
+ """
1758
+ Returns information about batch scrape errors.
1759
+
1760
+ Args:
1761
+ id (str): The ID of the crawl job.
1762
+
1763
+ Returns:
1764
+ CrawlErrorsResponse containing:
1765
+ * errors (List[Dict[str, str]]): List of errors with fields:
1766
+ * id (str): Error ID
1767
+ * timestamp (str): When the error occurred
1768
+ * url (str): URL that caused the error
1769
+ * error (str): Error message
1770
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1771
+
1772
+ Raises:
1773
+ Exception: If the error check request fails
1774
+ """
1775
+ headers = self._prepare_headers()
1776
+ response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
1777
+ if response.status_code == 200:
1778
+ try:
1779
+ return CrawlErrorsResponse(**response.json())
1780
+ except:
1781
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1782
+ else:
1783
+ self._handle_error(response, "check batch scrape errors")
1784
+
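An error-report sketch with a placeholder job id; the errors field follows the docstring above:

    report = app.check_batch_scrape_errors("your-batch-job-id")
    for entry in report.errors:
        print(entry)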
1785
+ def extract(
1786
+ self,
1787
+ urls: Optional[List[str]] = None,
1788
+ *,
1789
+ prompt: Optional[str] = None,
1790
+ schema: Optional[Any] = None,
1791
+ system_prompt: Optional[str] = None,
1792
+ allow_external_links: Optional[bool] = False,
1793
+ enable_web_search: Optional[bool] = False,
1794
+ show_sources: Optional[bool] = False,
1795
+ agent: Optional[Dict[str, Any]] = None,
1796
+ **kwargs) -> ExtractResponse[Any]:
1797
+ """
1798
+ Extract structured information from URLs.
1799
+
1800
+ Args:
1801
+ urls (Optional[List[str]]): URLs to extract from
1802
+ prompt (Optional[str]): Custom extraction prompt
1803
+ schema (Optional[Any]): JSON schema/Pydantic model
1804
+ system_prompt (Optional[str]): System context
1805
+ allow_external_links (Optional[bool]): Follow external links
1806
+ enable_web_search (Optional[bool]): Enable web search
1807
+ show_sources (Optional[bool]): Include source URLs
1808
+ agent (Optional[Dict[str, Any]]): Agent configuration
1809
+ **kwargs: Additional parameters to pass to the API
1810
+
1811
+ Returns:
1812
+ ExtractResponse[Any] with:
1813
+ * success (bool): Whether request succeeded
1814
+ * data (Optional[Any]): Extracted data matching schema
1815
+ * error (Optional[str]): Error message if any
1816
+
1817
+ Raises:
1818
+ ValueError: If prompt/schema missing or extraction fails
1819
+ """
1820
+ # Validate any additional kwargs
1821
+ self._validate_kwargs(kwargs, "extract")
1822
+
1823
+ headers = self._prepare_headers()
1824
+
1825
+ if not prompt and not schema:
1826
+ raise ValueError("Either prompt or schema is required")
1827
+
1828
+ if not urls and not prompt:
1829
+ raise ValueError("Either urls or prompt is required")
1830
+
1831
+ if schema:
1832
+ schema = self._ensure_schema_dict(schema)
1833
+
1834
+ request_data = {
1835
+ 'urls': urls or [],
1836
+ 'allowExternalLinks': allow_external_links,
1837
+ 'enableWebSearch': enable_web_search,
1838
+ 'showSources': show_sources,
1839
+ 'schema': schema,
1840
+ 'origin': f'python-sdk@{get_version()}'
1841
+ }
1842
+
1843
+ # Only add prompt and systemPrompt if they exist
1844
+ if prompt:
1845
+ request_data['prompt'] = prompt
1846
+ if system_prompt:
1847
+ request_data['systemPrompt'] = system_prompt
1848
+
1849
+ if agent:
1850
+ request_data['agent'] = agent
1851
+
1852
+ # Add any additional kwargs
1853
+ request_data.update(kwargs)
1854
+
1855
+ try:
1856
+ # Send the initial extract request
1857
+ response = self._post_request(
1858
+ f'{self.api_url}/v1/extract',
1859
+ request_data,
1860
+ headers
1861
+ )
1862
+ if response.status_code == 200:
1863
+ try:
1864
+ data = response.json()
1865
+ except:
1866
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1867
+ if data['success']:
1868
+ job_id = data.get('id')
1869
+ if not job_id:
1870
+ raise Exception('Job ID not returned from extract request.')
1871
+
1872
+ # Poll for the extract status
1873
+ while True:
1874
+ status_response = self._get_request(
1875
+ f'{self.api_url}/v1/extract/{job_id}',
1876
+ headers
1877
+ )
1878
+ if status_response.status_code == 200:
1879
+ try:
1880
+ status_data = status_response.json()
1881
+ except:
1882
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1883
+ if status_data['status'] == 'completed':
1884
+ return ExtractResponse(**status_data)
1885
+ elif status_data['status'] in ['failed', 'cancelled']:
1886
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
1887
+ else:
1888
+ self._handle_error(status_response, "extract-status")
1889
+
1890
+ time.sleep(2) # Polling interval
1891
+ else:
1892
+ raise Exception(f'Failed to extract. Error: {data["error"]}')
1893
+ else:
1894
+ self._handle_error(response, "extract")
1895
+ except Exception as e:
1896
+ raise ValueError(str(e), 500)
1897
+
1898
+ return ExtractResponse(success=False, error="Internal server error.")
1899
+
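An extract sketch, assuming an existing app instance; the Pydantic model and URL are placeholders, and the schema class is converted to a JSON schema by _ensure_schema_dict as shown later in this module:

    from pydantic import BaseModel

    class ArticleSchema(BaseModel):
        title: str
        author: str

    result = app.extract(
        urls=["https://example.com/blog/post"],
        prompt="Extract the article title and author.",
        schema=ArticleSchema,
    )
    if result.success:
        print(result.data)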
1900
+ def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
1901
+ """
1902
+ Retrieve the status of an extract job.
1903
+
1904
+ Args:
1905
+ job_id (str): The ID of the extract job.
1906
+
1907
+ Returns:
1908
+ ExtractResponse[Any]: The status of the extract job.
1909
+
1910
+ Raises:
1911
+ ValueError: If there is an error retrieving the status.
1912
+ """
1913
+ headers = self._prepare_headers()
1914
+ try:
1915
+ response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
1916
+ if response.status_code == 200:
1917
+ try:
1918
+ return ExtractResponse(**response.json())
1919
+ except:
1920
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1921
+ else:
1922
+ self._handle_error(response, "get extract status")
1923
+ except Exception as e:
1924
+ raise ValueError(str(e), 500)
1925
+
1926
+ def async_extract(
1927
+ self,
1928
+ urls: Optional[List[str]] = None,
1929
+ *,
1930
+ prompt: Optional[str] = None,
1931
+ schema: Optional[Any] = None,
1932
+ system_prompt: Optional[str] = None,
1933
+ allow_external_links: Optional[bool] = False,
1934
+ enable_web_search: Optional[bool] = False,
1935
+ show_sources: Optional[bool] = False,
1936
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1937
+ """
1938
+ Initiate an asynchronous extract job.
1939
+
1940
+ Args:
1941
+ urls (Optional[List[str]]): URLs to extract information from
1942
+ prompt (Optional[str]): Custom extraction prompt
1943
+ schema (Optional[Any]): JSON schema/Pydantic model
1944
+ system_prompt (Optional[str]): System context
1945
+ allow_external_links (Optional[bool]): Follow external links
1946
+ enable_web_search (Optional[bool]): Enable web search
1947
+ show_sources (Optional[bool]): Include source URLs
1948
+ agent (Optional[Dict[str, Any]]): Agent configuration
1949
+
1951
+ Returns:
1952
+ ExtractResponse[Any] with:
1953
+ * success (bool): Whether request succeeded
1954
+ * data (Optional[Any]): Extracted data matching schema
1955
+ * error (Optional[str]): Error message if any
1956
+
1957
+ Raises:
1958
+ ValueError: If job initiation fails
1959
+ """
1960
+ headers = self._prepare_headers()
1961
+
1962
+ if schema:
1964
+ schema = self._ensure_schema_dict(schema)
1965
+
1966
+ request_data = {
1967
+ 'urls': urls,
1968
+ 'allowExternalLinks': allow_external_links,
1969
+ 'enableWebSearch': enable_web_search,
1970
+ 'showSources': show_sources,
1971
+ 'schema': schema,
1972
+ 'origin': f'python-sdk@{version}'
1973
+ }
1974
+
1975
+ if prompt:
1976
+ request_data['prompt'] = prompt
1977
+ if system_prompt:
1978
+ request_data['systemPrompt'] = system_prompt
1979
+ if agent:
1980
+ request_data['agent'] = agent
1981
+
1982
+ try:
1983
+ response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
1984
+ if response.status_code == 200:
1985
+ try:
1986
+ return ExtractResponse(**response.json())
1987
+ except:
1988
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1989
+ else:
1990
+ self._handle_error(response, "async extract")
1991
+ except Exception as e:
1992
+ raise ValueError(str(e), 500)
1993
+
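A polling sketch that combines async_extract and get_extract_status, assuming the returned ExtractResponse exposes id and status fields as the surrounding code implies; the URL and prompt are placeholders:

    import time

    job = app.async_extract(urls=["https://example.com"], prompt="Summarize this page.")
    while True:
        status = app.get_extract_status(job.id)
        if status.status in ("completed", "failed", "cancelled"):
            break
        time.sleep(2)
    print(status.status, status.data)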
1994
+ def generate_llms_text(
1995
+ self,
1996
+ url: str,
1997
+ *,
1998
+ max_urls: Optional[int] = None,
1999
+ show_full_text: Optional[bool] = None,
2000
+ cache: Optional[bool] = None,
2001
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
2002
+ """
2003
+ Generate LLMs.txt for a given URL and poll until completion.
2004
+
2005
+ Args:
2006
+ url (str): Target URL to generate LLMs.txt from
2007
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
2008
+ show_full_text (Optional[bool]): Include full text in output (default: False)
2009
+ cache (Optional[bool]): Whether to use cached content if available (default: True)
2010
+ experimental_stream (Optional[bool]): Enable experimental streaming
2011
+
2012
+ Returns:
2013
+ GenerateLLMsTextStatusResponse with:
2014
+ * Generated LLMs.txt content
2015
+ * Full version if requested
2016
+ * Generation status
2017
+ * Success/error information
2018
+
2019
+ Raises:
2020
+ Exception: If generation fails
2021
+ """
2022
+ params = GenerateLLMsTextParams(
2023
+ maxUrls=max_urls,
2024
+ showFullText=show_full_text,
2025
+ cache=cache,
2026
+ __experimental_stream=experimental_stream
2027
+ )
2028
+
2029
+ response = self.async_generate_llms_text(
2030
+ url,
2031
+ max_urls=max_urls,
2032
+ show_full_text=show_full_text,
2033
+ cache=cache,
2034
+ experimental_stream=experimental_stream
2035
+ )
2036
+
2037
+ if not response.success or not response.id:
2038
+ return GenerateLLMsTextStatusResponse(
2039
+ success=False,
2040
+ error='Failed to start LLMs.txt generation',
2041
+ status='failed',
2042
+ expiresAt=''
2043
+ )
2044
+
2045
+ job_id = response.id
2046
+ while True:
2047
+ status = self.check_generate_llms_text_status(job_id)
2048
+
2049
+ if status.status == 'completed':
2050
+ return status
2051
+ elif status.status == 'failed':
2052
+ return status
2053
+ elif status.status != 'processing':
2054
+ return GenerateLLMsTextStatusResponse(
2055
+ success=False,
2056
+ error='LLMs.txt generation job terminated unexpectedly',
2057
+ status='failed',
2058
+ expiresAt=''
2059
+ )
2060
+
2061
+ time.sleep(2) # Polling interval
2062
+
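A generate_llms_text sketch with placeholder values; the call polls internally and returns the final status:

    result = app.generate_llms_text("https://example.com", max_urls=5, show_full_text=True)
    print(result.status)
    if result.success and result.data:
        print(result.data)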
2063
+ def async_generate_llms_text(
2064
+ self,
2065
+ url: str,
2066
+ *,
2067
+ max_urls: Optional[int] = None,
2068
+ show_full_text: Optional[bool] = None,
2069
+ cache: Optional[bool] = None,
2070
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
2071
+ """
2072
+ Initiate an asynchronous LLMs.txt generation operation.
2073
+
2074
+ Args:
2075
+ url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
2076
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
2077
+ show_full_text (Optional[bool]): Include full text in output (default: False)
2078
+ cache (Optional[bool]): Whether to use cached content if available (default: True)
2079
+ experimental_stream (Optional[bool]): Enable experimental streaming
2080
+
2081
+ Returns:
2082
+ GenerateLLMsTextResponse: A response containing:
2083
+ * success (bool): Whether the generation initiation was successful
2084
+ * id (str): The unique identifier for the generation job
2085
+ * error (str, optional): Error message if initiation failed
2086
+
2087
+ Raises:
2088
+ Exception: If the generation job initiation fails.
2089
+ """
2090
+ params = GenerateLLMsTextParams(
2091
+ maxUrls=max_urls,
2092
+ showFullText=show_full_text,
2093
+ cache=cache,
2094
+ __experimental_stream=experimental_stream
2095
+ )
2096
+
2097
+ headers = self._prepare_headers()
2098
+ json_data = {'url': url, **params.dict(by_alias=True, exclude_none=True)}
2099
+ json_data['origin'] = f"python-sdk@{version}"
2100
+
2101
+ try:
2102
+ req = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
2103
+ response = req.json()
2104
+ logger.debug("llmstxt request payload: %s", json_data)
2105
+ logger.debug("llmstxt response: %s", response)
2106
+ if response.get('success'):
2107
+ try:
2108
+ return GenerateLLMsTextResponse(**response)
2109
+ except:
2110
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2111
+ else:
2112
+ self._handle_error(req, 'start LLMs.txt generation')
2113
+ except Exception as e:
2114
+ raise ValueError(str(e))
2115
+
2116
+ return GenerateLLMsTextResponse(
2117
+ success=False,
2118
+ error='Internal server error'
2119
+ )
2120
+
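A sketch of the asynchronous variant paired with a single status check; the URL is a placeholder:

    job = app.async_generate_llms_text("https://example.com", max_urls=5)
    if job.success:
        status = app.check_generate_llms_text_status(job.id)
        print(status.status)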
2121
+ def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
2122
+ """
2123
+ Check the status of a LLMs.txt generation operation.
2124
+
2125
+ Args:
2126
+ id (str): The unique identifier of the LLMs.txt generation job to check status for.
2127
+
2128
+ Returns:
2129
+ GenerateLLMsTextStatusResponse: A response containing:
2130
+ * success (bool): Whether the generation was successful
2131
+ * status (str): Status of generation ("processing", "completed", "failed")
2132
+ * data (Dict[str, str], optional): Generated text with fields:
2133
+ * llmstxt (str): Generated LLMs.txt content
2134
+ * llmsfulltxt (str, optional): Full version if requested
2135
+ * error (str, optional): Error message if generation failed
2136
+ * expiresAt (str): When the generated data expires
2137
+
2138
+ Raises:
2139
+ Exception: If the status check fails.
2140
+ """
2141
+ headers = self._prepare_headers()
2142
+ try:
2143
+ response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers)
2144
+ if response.status_code == 200:
2145
+ try:
2146
+ json_data = response.json()
2147
+ return GenerateLLMsTextStatusResponse(**json_data)
2148
+ except Exception as e:
2149
+ raise Exception(f'Failed to parse Firecrawl response as GenerateLLMsTextStatusResponse: {str(e)}')
2150
+ elif response.status_code == 404:
2151
+ raise Exception('LLMs.txt generation job not found')
2152
+ else:
2153
+ self._handle_error(response, 'check LLMs.txt generation status')
2154
+ except Exception as e:
2155
+ raise ValueError(str(e))
2156
+
2157
+ return GenerateLLMsTextStatusResponse(success=False, error='Internal server error', status='failed', expiresAt='')
2158
+
2159
+ def _prepare_headers(
2160
+ self,
2161
+ idempotency_key: Optional[str] = None) -> Dict[str, str]:
2162
+ """
2163
+ Prepare the headers for API requests.
2164
+
2165
+ Args:
2166
+ idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
2167
+
2168
+ Returns:
2169
+ Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
2170
+ """
2171
+ if idempotency_key:
2172
+ return {
2173
+ 'Content-Type': 'application/json',
2174
+ 'Authorization': f'Bearer {self.api_key}',
2175
+ 'x-idempotency-key': idempotency_key
2176
+ }
2177
+
2178
+ return {
2179
+ 'Content-Type': 'application/json',
2180
+ 'Authorization': f'Bearer {self.api_key}',
2181
+ }
2182
+
2183
+ def _post_request(
2184
+ self,
2185
+ url: str,
2186
+ data: Dict[str, Any],
2187
+ headers: Dict[str, str],
2188
+ retries: int = 3,
2189
+ backoff_factor: float = 0.5) -> requests.Response:
2190
+ """
2191
+ Make a POST request with retries.
2192
+
2193
+ Args:
2194
+ url (str): The URL to send the POST request to.
2195
+ data (Dict[str, Any]): The JSON data to include in the POST request.
2196
+ headers (Dict[str, str]): The headers to include in the POST request.
2197
+ retries (int): Number of retries for the request.
2198
+ backoff_factor (float): Backoff factor for retries.
2199
+
2200
+ Returns:
2201
+ requests.Response: The response from the POST request.
2202
+
2203
+ Raises:
2204
+ requests.RequestException: If the request fails after the specified retries.
2205
+ """
2206
+ for attempt in range(retries):
2207
+ response = requests.post(url, headers=headers, json=data, timeout=((data["timeout"] / 1000.0 + 5) if "timeout" in data and data["timeout"] is not None else None))
2208
+ if response.status_code == 502:
2209
+ time.sleep(backoff_factor * (2 ** attempt))
2210
+ else:
2211
+ return response
2212
+ return response
2213
+
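With the defaults retries=3 and backoff_factor=0.5 above, consecutive 502 responses are retried after sleeps of 0.5 s, 1.0 s, and 2.0 s; a small sketch of that schedule (values only, not additional SDK behaviour):

    backoff_factor = 0.5
    for attempt in range(3):
        print(f"attempt {attempt}: sleep {backoff_factor * (2 ** attempt):.1f}s after HTTP 502")
    # attempt 0: sleep 0.5s
    # attempt 1: sleep 1.0s
    # attempt 2: sleep 2.0s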
2214
+ def _get_request(
2215
+ self,
2216
+ url: str,
2217
+ headers: Dict[str, str],
2218
+ retries: int = 3,
2219
+ backoff_factor: float = 0.5) -> requests.Response:
2220
+ """
2221
+ Make a GET request with retries.
2222
+
2223
+ Args:
2224
+ url (str): The URL to send the GET request to.
2225
+ headers (Dict[str, str]): The headers to include in the GET request.
2226
+ retries (int): Number of retries for the request.
2227
+ backoff_factor (float): Backoff factor for retries.
2228
+
2229
+ Returns:
2230
+ requests.Response: The response from the GET request.
2231
+
2232
+ Raises:
2233
+ requests.RequestException: If the request fails after the specified retries.
2234
+ """
2235
+ for attempt in range(retries):
2236
+ response = requests.get(url, headers=headers)
2237
+ if response.status_code == 502:
2238
+ time.sleep(backoff_factor * (2 ** attempt))
2239
+ else:
2240
+ return response
2241
+ return response
2242
+
2243
+ def _delete_request(
2244
+ self,
2245
+ url: str,
2246
+ headers: Dict[str, str],
2247
+ retries: int = 3,
2248
+ backoff_factor: float = 0.5) -> requests.Response:
2249
+ """
2250
+ Make a DELETE request with retries.
2251
+
2252
+ Args:
2253
+ url (str): The URL to send the DELETE request to.
2254
+ headers (Dict[str, str]): The headers to include in the DELETE request.
2255
+ retries (int): Number of retries for the request.
2256
+ backoff_factor (float): Backoff factor for retries.
2257
+
2258
+ Returns:
2259
+ requests.Response: The response from the DELETE request.
2260
+
2261
+ Raises:
2262
+ requests.RequestException: If the request fails after the specified retries.
2263
+ """
2264
+ for attempt in range(retries):
2265
+ response = requests.delete(url, headers=headers)
2266
+ if response.status_code == 502:
2267
+ time.sleep(backoff_factor * (2 ** attempt))
2268
+ else:
2269
+ return response
2270
+ return response
2271
+
2272
+ def _monitor_job_status(
2273
+ self,
2274
+ id: str,
2275
+ headers: Dict[str, str],
2276
+ poll_interval: int) -> CrawlStatusResponse:
2277
+ """
2278
+ Monitor the status of a crawl job until completion.
2279
+
2280
+ Args:
2281
+ id (str): The ID of the crawl job.
2282
+ headers (Dict[str, str]): The headers to include in the status check requests.
2283
+ poll_interval (int): Seconds between status checks.
2284
+
2285
+ Returns:
2286
+ CrawlStatusResponse: The crawl results if the job is completed successfully.
2287
+
2288
+ Raises:
2289
+ Exception: If the job fails or an error occurs during status checks.
2290
+ """
2291
+ while True:
2292
+ api_url = f'{self.api_url}/v1/crawl/{id}'
2293
+
2294
+ status_response = self._get_request(api_url, headers)
2295
+ if status_response.status_code == 200:
2296
+ try:
2297
+ status_data = status_response.json()
2298
+ except:
2299
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
2300
+ if status_data['status'] == 'completed':
2301
+ if 'data' in status_data:
2302
+ data = status_data['data']
2303
+ while 'next' in status_data:
2304
+ if len(status_data['data']) == 0:
2305
+ break
2306
+ status_response = self._get_request(status_data['next'], headers)
2307
+ try:
2308
+ status_data = status_response.json()
2309
+ except:
2310
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
2311
+ data.extend(status_data.get('data', []))
2312
+ status_data['data'] = data
2313
+ return CrawlStatusResponse(**status_data)
2314
+ else:
2315
+ raise Exception('Crawl job completed but no data was returned')
2316
+ elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
2317
+ poll_interval = max(poll_interval, 2)
2318
+ time.sleep(poll_interval) # Wait for the specified interval before checking again
2319
+ else:
2320
+ raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
2321
+ else:
2322
+ self._handle_error(status_response, 'check crawl status')
2323
+
2324
+ def _handle_error(
2325
+ self,
2326
+ response: requests.Response,
2327
+ action: str) -> None:
2328
+ """
2329
+ Handle errors from API responses.
2330
+
2331
+ Args:
2332
+ response (requests.Response): The response object from the API request.
2333
+ action (str): Description of the action that was being performed.
2334
+
2335
+ Raises:
2336
+ Exception: An exception with a message containing the status code and error details from the response.
2337
+ """
2338
+ try:
2339
+ response_json = response.json()
2340
+ error_message = response_json.get('error', 'No error message provided.')
2341
+ error_details = response_json.get('details', 'No additional error details provided.')
2342
+ except:
2343
+ # If we can't parse JSON, provide a helpful error message with response content
2344
+ try:
2345
+ response_text = response.text[:500] # Limit to first 500 chars
2346
+ if response_text.strip():
2347
+ error_message = f"Server returned non-JSON response: {response_text}"
2348
+ error_details = f"Full response status: {response.status_code}"
2349
+ else:
2350
+ error_message = f"Server returned empty response with status {response.status_code}"
2351
+ error_details = "No additional details available"
2352
+ except ValueError:
2353
+ error_message = f"Server returned unreadable response with status {response.status_code}"
2354
+ error_details = "No additional details available"
2355
+
2356
+ message = self._get_error_message(response.status_code, action, error_message, error_details)
2357
+
2358
+ # Raise an HTTPError with the custom message and attach the response
2359
+ raise requests.exceptions.HTTPError(message, response=response)
2360
+
2361
+ def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2362
+ """
2363
+ Generate a standardized error message based on HTTP status code.
2364
+
2365
+ Args:
2366
+ status_code (int): The HTTP status code from the response
2367
+ action (str): Description of the action that was being performed
2368
+ error_message (str): The error message from the API response
2369
+ error_details (str): Additional error details from the API response
2370
+
2371
+ Returns:
2372
+ str: A formatted error message
2373
+ """
2374
+ if status_code == 402:
2375
+ return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
2376
+ elif status_code == 403:
2377
+ return f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
2378
+ elif status_code == 408:
2379
+ return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
2380
+ elif status_code == 409:
2381
+ return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
2382
+ elif status_code == 500:
2383
+ return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
2384
+ else:
2385
+ return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
2386
+
2387
+ def deep_research(
2388
+ self,
2389
+ query: str,
2390
+ *,
2391
+ max_depth: Optional[int] = None,
2392
+ time_limit: Optional[int] = None,
2393
+ max_urls: Optional[int] = None,
2394
+ analysis_prompt: Optional[str] = None,
2395
+ system_prompt: Optional[str] = None,
2396
+ __experimental_stream_steps: Optional[bool] = None,
2397
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
2398
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
2399
+ """
2400
+ Initiates a deep research operation on a given query and polls until completion.
2401
+
2402
+ Args:
2403
+ query (str): Research query or topic to investigate
2404
+ max_depth (Optional[int]): Maximum depth of research exploration
2405
+ time_limit (Optional[int]): Time limit in seconds for research
2406
+ max_urls (Optional[int]): Maximum number of URLs to process
2407
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2408
+ system_prompt (Optional[str]): Custom system prompt
2409
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2410
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
2411
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
2412
+
2413
+ Returns:
2414
+ DeepResearchStatusResponse containing:
2415
+ * success (bool): Whether research completed successfully
2416
+ * status (str): Current state (processing/completed/failed)
2417
+ * error (Optional[str]): Error message if failed
2418
+ * id (str): Unique identifier for the research job
2419
+ * data (Any): Research findings and analysis
2420
+ * sources (List[Dict]): List of discovered sources
2421
+ * activities (List[Dict]): Research progress log
2422
+ * summaries (List[str]): Generated research summaries
2423
+
2424
+ Raises:
2425
+ Exception: If research fails
2426
+ """
2427
+ research_params = {}
2428
+ if max_depth is not None:
2429
+ research_params['maxDepth'] = max_depth
2430
+ if time_limit is not None:
2431
+ research_params['timeLimit'] = time_limit
2432
+ if max_urls is not None:
2433
+ research_params['maxUrls'] = max_urls
2434
+ if analysis_prompt is not None:
2435
+ research_params['analysisPrompt'] = analysis_prompt
2436
+ if system_prompt is not None:
2437
+ research_params['systemPrompt'] = system_prompt
2438
+ if __experimental_stream_steps is not None:
2439
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2440
+ research_params = DeepResearchParams(**research_params)
2441
+
2442
+ response = self.async_deep_research(
2443
+ query,
2444
+ max_depth=max_depth,
2445
+ time_limit=time_limit,
2446
+ max_urls=max_urls,
2447
+ analysis_prompt=analysis_prompt,
2448
+ system_prompt=system_prompt
2449
+ )
2450
+ if not response.get('success') or 'id' not in response:
2451
+ return response
2452
+
2453
+ job_id = response['id']
2454
+ last_activity_count = 0
2455
+ last_source_count = 0
2456
+
2457
+ while True:
2458
+ status = self.check_deep_research_status(job_id)
2459
+
2460
+ if on_activity and 'activities' in status:
2461
+ new_activities = status['activities'][last_activity_count:]
2462
+ for activity in new_activities:
2463
+ on_activity(activity)
2464
+ last_activity_count = len(status['activities'])
2465
+
2466
+ if on_source and 'sources' in status:
2467
+ new_sources = status['sources'][last_source_count:]
2468
+ for source in new_sources:
2469
+ on_source(source)
2470
+ last_source_count = len(status['sources'])
2471
+
2472
+ if status['status'] == 'completed':
2473
+ return status
2474
+ elif status['status'] == 'failed':
2475
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
2476
+ elif status['status'] != 'processing':
2477
+ break
2478
+
2479
+ time.sleep(2) # Polling interval
2480
+
2481
+ return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
2482
+
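A deep_research sketch with placeholder query and limits, assuming an existing app instance; the result is accessed as a dict because check_deep_research_status returns the parsed JSON directly:

    def on_activity(activity):
        print("[research]", activity.get("message"))

    result = app.deep_research(
        "How do open-source web crawlers handle robots.txt?",
        max_depth=2,
        time_limit=120,
        on_activity=on_activity,
    )
    if result.get("success"):
        print(result.get("data"))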
2483
+ def async_deep_research(
2484
+ self,
2485
+ query: str,
2486
+ *,
2487
+ max_depth: Optional[int] = None,
2488
+ time_limit: Optional[int] = None,
2489
+ max_urls: Optional[int] = None,
2490
+ analysis_prompt: Optional[str] = None,
2491
+ system_prompt: Optional[str] = None,
2492
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
2493
+ """
2494
+ Initiates an asynchronous deep research operation.
2495
+
2496
+ Args:
2497
+ query (str): Research query or topic to investigate
2498
+ max_depth (Optional[int]): Maximum depth of research exploration
2499
+ time_limit (Optional[int]): Time limit in seconds for research
2500
+ max_urls (Optional[int]): Maximum number of URLs to process
2501
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2502
+ system_prompt (Optional[str]): Custom system prompt
2503
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2504
+
2505
+ Returns:
2506
+ Dict[str, Any]: A response containing:
2507
+ * success (bool): Whether the research initiation was successful
2508
+ * id (str): The unique identifier for the research job
2509
+ * error (str, optional): Error message if initiation failed
2510
+
2511
+ Raises:
2512
+ Exception: If the research initiation fails.
2513
+ """
2514
+ research_params = {}
2515
+ if max_depth is not None:
2516
+ research_params['maxDepth'] = max_depth
2517
+ if time_limit is not None:
2518
+ research_params['timeLimit'] = time_limit
2519
+ if max_urls is not None:
2520
+ research_params['maxUrls'] = max_urls
2521
+ if analysis_prompt is not None:
2522
+ research_params['analysisPrompt'] = analysis_prompt
2523
+ if system_prompt is not None:
2524
+ research_params['systemPrompt'] = system_prompt
2525
+ if __experimental_stream_steps is not None:
2526
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2527
+ research_params = DeepResearchParams(**research_params)
2528
+
2529
+ headers = self._prepare_headers()
2530
+
2531
+ json_data = {'query': query, **research_params.dict(by_alias=True, exclude_none=True)}
2532
+ json_data['origin'] = f"python-sdk@{version}"
2533
+
2534
+ # Handle json options schema if present
2535
+ if 'jsonOptions' in json_data:
2536
+ json_opts = json_data['jsonOptions']
2537
+ if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
2538
+ json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
2539
+
2540
+ try:
2541
+ response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
2542
+ if response.status_code == 200:
2543
+ try:
2544
+ return response.json()
2545
+ except:
2546
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2547
+ else:
2548
+ self._handle_error(response, 'start deep research')
2549
+ except Exception as e:
2550
+ raise ValueError(str(e))
2551
+
2552
+ return {'success': False, 'error': 'Internal server error'}
2553
+
2554
+ def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
2555
+ """
2556
+ Check the status of a deep research operation.
2557
+
2558
+ Args:
2559
+ id (str): The ID of the deep research operation.
2560
+
2561
+ Returns:
2562
+ DeepResearchStatusResponse containing:
2563
+
2564
+ Status:
2565
+ * success - Whether research completed successfully
2566
+ * status - Current state (processing/completed/failed)
2567
+ * error - Error message if failed
2568
+
2569
+ Results:
2570
+ * id - Unique identifier for the research job
2571
+ * data - Research findings and analysis
2572
+ * sources - List of discovered sources
2573
+ * activities - Research progress log
2574
+ * summaries - Generated research summaries
2575
+
2576
+ Raises:
2577
+ Exception: If the status check fails.
2578
+ """
2579
+ headers = self._prepare_headers()
2580
+ try:
2581
+ response = self._get_request(f'{self.api_url}/v1/deep-research/{id}', headers)
2582
+ if response.status_code == 200:
2583
+ try:
2584
+ return response.json()
2585
+ except:
2586
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2587
+ elif response.status_code == 404:
2588
+ raise Exception('Deep research job not found')
2589
+ else:
2590
+ self._handle_error(response, 'check deep research status')
2591
+ except Exception as e:
2592
+ raise ValueError(str(e))
2593
+
2594
+ return {'success': False, 'error': 'Internal server error'}
2595
+
2596
+ def _validate_kwargs(self, kwargs: Dict[str, Any], method_name: str) -> None:
2597
+ """
2598
+ Validate additional keyword arguments before they are passed to the API.
2599
+ This provides early validation before the Pydantic model validation.
2600
+
2601
+ Args:
2602
+ kwargs (Dict[str, Any]): Additional keyword arguments to validate
2603
+ method_name (str): Name of the method these kwargs are for
2604
+
2605
+ Raises:
2606
+ ValueError: If kwargs contain invalid or unsupported parameters
2607
+ """
2608
+ if not kwargs:
2609
+ return
2610
+
2611
+ # Known parameter mappings for each method
2612
+ method_params = {
2613
+ "scrape_url": {"formats", "include_tags", "exclude_tags", "only_main_content", "wait_for",
2614
+ "timeout", "location", "mobile", "skip_tls_verification", "remove_base64_images",
2615
+ "block_ads", "proxy", "extract", "json_options", "actions", "change_tracking_options", "max_age", "agent", "integration"},
2616
+ "search": {"limit", "tbs", "filter", "lang", "country", "location", "timeout", "scrape_options", "integration"},
2617
+ "crawl_url": {"include_paths", "exclude_paths", "max_depth", "max_discovery_depth", "limit",
2618
+ "allow_backward_links", "allow_external_links", "ignore_sitemap", "scrape_options",
2619
+ "webhook", "deduplicate_similar_urls", "ignore_query_parameters", "regex_on_full_url", "integration"},
2620
+ "map_url": {"search", "ignore_sitemap", "include_subdomains", "sitemap_only", "limit", "timeout", "integration"},
2621
+ "extract": {"prompt", "schema", "system_prompt", "allow_external_links", "enable_web_search", "show_sources", "agent", "integration"},
2622
+ "batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2623
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2624
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2625
+ "actions", "agent", "webhook"},
2626
+ "async_batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2627
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2628
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2629
+ "actions", "agent", "webhook"},
2630
+ "batch_scrape_urls_and_watch": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2631
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2632
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2633
+ "actions", "agent", "webhook"}
2634
+ }
2635
+
2636
+ # Get allowed parameters for this method
2637
+ allowed_params = method_params.get(method_name, set())
2638
+
2639
+ # Check for unknown parameters
2640
+ unknown_params = set(kwargs.keys()) - allowed_params
2641
+ if unknown_params:
2642
+ raise ValueError(f"Unsupported parameter(s) for {method_name}: {', '.join(unknown_params)}. Please refer to the API documentation for the correct parameters.")
2643
+
2644
+ # Additional type validation can be added here if needed
2645
+ # For now, we rely on Pydantic models for detailed type validation
2646
+
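# Editorial usage sketch (not part of the diff): _validate_kwargs rejects unknown
# keyword arguments before any request is built. The API key below is a placeholder
# and the bogus option name is invented purely to trigger the error path.
from firecrawl import FirecrawlApp  # import path assumed

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
try:
    app._validate_kwargs({"not_a_real_option": True}, "scrape_url")
except ValueError as err:
    print(err)  # "Unsupported parameter(s) for scrape_url: not_a_real_option. ..."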
2647
+ def _ensure_schema_dict(self, schema):
2648
+ """
2649
+ Utility to ensure a schema is a dict, not a Pydantic model class. Recursively checks dicts and lists.
2650
+ """
2651
+ if schema is None:
2652
+ return schema
2653
+ if isinstance(schema, type):
2654
+ # Pydantic v1/v2 model class
2655
+ if hasattr(schema, 'model_json_schema'):
2656
+ return schema.model_json_schema()
2657
+ elif hasattr(schema, 'schema'):
2658
+ return schema.schema()
2659
+ if isinstance(schema, dict):
2660
+ return {k: self._ensure_schema_dict(v) for k, v in schema.items()}
2661
+ if isinstance(schema, (list, tuple)):
2662
+ return [self._ensure_schema_dict(v) for v in schema]
2663
+ return schema
2664
+
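# Editorial sketch: _ensure_schema_dict normalises Pydantic model classes (v1 or v2)
# into plain JSON-schema dicts, recursing through nested dicts and lists. The Product
# model and API key are assumptions for illustration only.
from pydantic import BaseModel
from firecrawl import FirecrawlApp  # import path assumed

class Product(BaseModel):
    name: str
    price: float

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
normalised = app._ensure_schema_dict({"schema": Product})
assert isinstance(normalised["schema"], dict)  # the model class became its JSON schema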
2665
+ class CrawlWatcher:
2666
+ """
2667
+ A class to watch and handle crawl job events via WebSocket connection.
2668
+
2669
+ Attributes:
2670
+ id (str): The ID of the crawl job to watch
2671
+ app (FirecrawlApp): The FirecrawlApp instance
2672
+ data (List[Dict[str, Any]]): List of crawled documents/data
2673
+ status (str): Current status of the crawl job
2674
+ ws_url (str): WebSocket URL for the crawl job
2675
+ event_handlers (dict): Dictionary of event type to list of handler functions
2676
+ """
2677
+ def __init__(self, id: str, app: FirecrawlApp):
2678
+ self.id = id
2679
+ self.app = app
2680
+ self.data: List[Dict[str, Any]] = []
2681
+ self.status = "scraping"
2682
+ self.ws_url = f"{app.api_url.replace('http', 'ws')}/v1/crawl/{id}"
2683
+ self.event_handlers = {
2684
+ 'done': [],
2685
+ 'error': [],
2686
+ 'document': []
2687
+ }
2688
+
2689
+ async def connect(self) -> None:
2690
+ """
2691
+ Establishes WebSocket connection and starts listening for messages.
2692
+ """
2693
+ async with websockets.connect(
2694
+ self.ws_url,
2695
+ max_size=None,
2696
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
2697
+ ) as websocket:
2698
+ await self._listen(websocket)
2699
+
2700
+ async def _listen(self, websocket) -> None:
2701
+ """
2702
+ Listens for incoming WebSocket messages and handles them.
2703
+
2704
+ Args:
2705
+ websocket: The WebSocket connection object
2706
+ """
2707
+ async for message in websocket:
2708
+ msg = json.loads(message)
2709
+ await self._handle_message(msg)
2710
+
2711
+ def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
2712
+ """
2713
+ Adds an event handler function for a specific event type.
2714
+
2715
+ Args:
2716
+ event_type (str): Type of event to listen for ('done', 'error', or 'document')
2717
+ handler (Callable): Function to handle the event
2718
+ """
2719
+ if event_type in self.event_handlers:
2720
+ self.event_handlers[event_type].append(handler)
2721
+
2722
+ def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
2723
+ """
2724
+ Dispatches an event to all registered handlers for that event type.
2725
+
2726
+ Args:
2727
+ event_type (str): Type of event to dispatch
2728
+ detail (Dict[str, Any]): Event details/data to pass to handlers
2729
+ """
2730
+ if event_type in self.event_handlers:
2731
+ for handler in self.event_handlers[event_type]:
2732
+ handler(detail)
2733
+
2734
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
2735
+ """
2736
+ Handles incoming WebSocket messages based on their type.
2737
+
2738
+ Args:
2739
+ msg (Dict[str, Any]): The message to handle
2740
+ """
2741
+ if msg['type'] == 'done':
2742
+ self.status = 'completed'
2743
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
2744
+ elif msg['type'] == 'error':
2745
+ self.status = 'failed'
2746
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
2747
+ elif msg['type'] == 'catchup':
2748
+ self.status = msg['data']['status']
2749
+ self.data.extend(msg['data'].get('data', []))
2750
+ for doc in self.data:
2751
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
2752
+ elif msg['type'] == 'document':
2753
+ self.data.append(msg['data'])
2754
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
2755
+
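# Editorial sketch: attaching handlers to a CrawlWatcher for an already-started crawl
# job. The job id, API key and import path are assumptions; connect() blocks until the
# WebSocket closes after a 'done' or 'error' message.
import asyncio
from firecrawl import FirecrawlApp, CrawlWatcher  # import path assumed

async def watch(job_id: str) -> None:
    app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
    watcher = CrawlWatcher(job_id, app)
    watcher.add_event_listener("document", lambda detail: print("got page for job", detail["id"]))
    watcher.add_event_listener("done", lambda detail: print("finished:", len(detail["data"]), "documents"))
    await watcher.connect()

# asyncio.run(watch("your-crawl-id"))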
2756
+ class AsyncFirecrawlApp(FirecrawlApp):
2757
+ """
2758
+ Asynchronous version of FirecrawlApp that implements async methods using aiohttp.
2759
+ Provides non-blocking alternatives to all FirecrawlApp operations.
2760
+ """
2761
+
2762
+ async def _async_request(
2763
+ self,
2764
+ method: str,
2765
+ url: str,
2766
+ headers: Dict[str, str],
2767
+ data: Optional[Dict[str, Any]] = None,
2768
+ retries: int = 3,
2769
+ backoff_factor: float = 0.5) -> Dict[str, Any]:
2770
+ """
2771
+ Generic async request method with exponential backoff retry logic.
2772
+
2773
+ Args:
2774
+ method (str): The HTTP method to use (e.g., "GET" or "POST").
2775
+ url (str): The URL to send the request to.
2776
+ headers (Dict[str, str]): Headers to include in the request.
2777
+ data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
2778
+ retries (int): Maximum number of retry attempts (default: 3).
2779
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2780
+ Delay will be backoff_factor * (2 ** retry_count).
2781
+
2782
+ Returns:
2783
+ Dict[str, Any]: The parsed JSON response from the server.
2784
+
2785
+ Raises:
2786
+ aiohttp.ClientError: If the request fails after all retries.
2787
+ Exception: If max retries are exceeded or other errors occur.
2788
+ """
2789
+ async with aiohttp.ClientSession() as session:
2790
+ for attempt in range(retries):
2791
+ try:
2792
+ async with session.request(
2793
+ method=method, url=url, headers=headers, json=data
2794
+ ) as response:
2795
+ if response.status == 502:
2796
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2797
+ continue
2798
+ if response.status >= 300:
2799
+ await self._handle_error(response, f"make {method} request")
2800
+ return await response.json()
2801
+ except aiohttp.ClientError as e:
2802
+ if attempt == retries - 1:
2803
+ raise e
2804
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2805
+ raise Exception("Max retries exceeded")
2806
+
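# Editorial note: with the defaults above (retries=3, backoff_factor=0.5) a 502 reply
# is retried after backoff_factor * (2 ** attempt) seconds, i.e. 0.5s, 1s and 2s,
# before "Max retries exceeded" is raised.
delays = [0.5 * (2 ** attempt) for attempt in range(3)]
assert delays == [0.5, 1.0, 2.0]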
2807
+ async def _async_post_request(
2808
+ self, url: str, data: Dict[str, Any], headers: Dict[str, str],
2809
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2810
+ """
2811
+ Make an async POST request with exponential backoff retry logic.
2812
+
2813
+ Args:
2814
+ url (str): The URL to send the POST request to.
2815
+ data (Dict[str, Any]): The JSON data to include in the request body.
2816
+ headers (Dict[str, str]): Headers to include in the request.
2817
+ retries (int): Maximum number of retry attempts (default: 3).
2818
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2819
+ Delay will be backoff_factor * (2 ** retry_count).
2820
+
2821
+ Returns:
2822
+ Dict[str, Any]: The parsed JSON response from the server.
2823
+
2824
+ Raises:
2825
+ aiohttp.ClientError: If the request fails after all retries.
2826
+ Exception: If max retries are exceeded or other errors occur.
2827
+ """
2828
+ return await self._async_request("POST", url, headers, data, retries, backoff_factor)
2829
+
2830
+ async def _async_get_request(
2831
+ self, url: str, headers: Dict[str, str],
2832
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2833
+ """
2834
+ Make an async GET request with exponential backoff retry logic.
2835
+
2836
+ Args:
2837
+ url (str): The URL to send the GET request to.
2838
+ headers (Dict[str, str]): Headers to include in the request.
2839
+ retries (int): Maximum number of retry attempts (default: 3).
2840
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2841
+ Delay will be backoff_factor * (2 ** retry_count).
2842
+
2843
+ Returns:
2844
+ Dict[str, Any]: The parsed JSON response from the server.
2845
+
2846
+ Raises:
2847
+ aiohttp.ClientError: If the request fails after all retries.
2848
+ Exception: If max retries are exceeded or other errors occur.
2849
+ """
2850
+ return await self._async_request("GET", url, headers, None, retries, backoff_factor)
2851
+
2852
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
2853
+ """
2854
+ Handle errors from async API responses with detailed error messages.
2855
+
2856
+ Args:
2857
+ response (aiohttp.ClientResponse): The response object from the failed request
2858
+ action (str): Description of the action that was being attempted
2859
+
2860
+ Raises:
2861
+ aiohttp.ClientError: With a detailed error message based on the response status:
2862
+ - 402: Payment Required
2863
+ - 408: Request Timeout
2864
+ - 409: Conflict
2865
+ - 500: Internal Server Error
2866
+ - Other: Unexpected error with status code
2867
+ """
2868
+ try:
2869
+ error_data = await response.json()
2870
+ error_message = error_data.get('error', 'No error message provided.')
2871
+ error_details = error_data.get('details', 'No additional error details provided.')
2872
+ except:
2873
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
2874
+
2875
+ message = await self._get_async_error_message(response.status, action, error_message, error_details)
2876
+
2877
+ raise aiohttp.ClientError(message)
2878
+
2879
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2880
+ """
2881
+ Generate a standardized error message based on HTTP status code for async operations.
2882
+
2883
+ Args:
2884
+ status_code (int): The HTTP status code from the response
2885
+ action (str): Description of the action that was being performed
2886
+ error_message (str): The error message from the API response
2887
+ error_details (str): Additional error details from the API response
2888
+
2889
+ Returns:
2890
+ str: A formatted error message
2891
+ """
2892
+ return self._get_error_message(status_code, action, error_message, error_details)
2893
+
2894
+ async def crawl_url_and_watch(
2895
+ self,
2896
+ url: str,
2897
+ params: Optional[CrawlParams] = None,
2898
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2899
+ """
2900
+ Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket.
2901
+
2902
+ Args:
2903
+ url (str): Target URL to start crawling from
2904
+ params (Optional[CrawlParams]): See CrawlParams model for configuration:
2905
+ URL Discovery:
2906
+ * includePaths - Patterns of URLs to include
2907
+ * excludePaths - Patterns of URLs to exclude
2908
+ * maxDepth - Maximum crawl depth
2909
+ * maxDiscoveryDepth - Maximum depth for finding new URLs
2910
+ * limit - Maximum pages to crawl
2911
+
2912
+ Link Following:
2913
+ * allowBackwardLinks - DEPRECATED: Use crawlEntireDomain instead
2914
+ * crawlEntireDomain - Follow parent directory links
2915
+ * allowExternalLinks - Follow external domain links
2916
+ * ignoreSitemap - Skip sitemap.xml processing
2917
+
2918
+ Advanced:
2919
+ * scrapeOptions - Page scraping configuration
2920
+ * webhook - Notification webhook settings
2921
+ * deduplicateSimilarURLs - Remove similar URLs
2922
+ * ignoreQueryParameters - Ignore URL parameters
2923
+ * regexOnFullURL - Apply regex to full URLs
2924
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2925
+
2926
+ Returns:
2927
+ AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket
2928
+
2929
+ Raises:
2930
+ Exception: If crawl job fails to start
2931
+ """
2932
+ crawl_response = await self.async_crawl_url(url, idempotency_key=idempotency_key, **(params.dict(exclude_none=True) if params else {}))
2933
+ if crawl_response.get('success') and 'id' in crawl_response:
2934
+ return AsyncCrawlWatcher(crawl_response['id'], self)
2935
+ else:
2936
+ raise Exception("Crawl job failed to start")
2937
+
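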
2938
+ async def batch_scrape_urls_and_watch(
2939
+ self,
2940
+ urls: List[str],
2941
+ params: Optional[ScrapeParams] = None,
2942
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2943
+ """
2944
+ Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress.
2945
+
2946
+ Args:
2947
+ urls (List[str]): List of URLs to scrape
2948
+ params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
2949
+
2950
+ Content Options:
2951
+ * formats - Content formats to retrieve
2952
+ * includeTags - HTML tags to include
2953
+ * excludeTags - HTML tags to exclude
2954
+ * onlyMainContent - Extract main content only
2955
+
2956
+ Request Options:
2957
+ * headers - Custom HTTP headers
2958
+ * timeout - Request timeout (ms)
2959
+ * mobile - Use mobile user agent
2960
+ * proxy - Proxy type
2961
+
2962
+ Extraction Options:
2963
+ * extract - Content extraction config
2964
+ * jsonOptions - JSON extraction config
2965
+ * actions - Actions to perform
2966
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2967
+
2968
+ Returns:
2969
+ AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket
2970
+
2971
+ Raises:
2972
+ Exception: If batch scrape job fails to start
2973
+ """
2974
+ batch_response = await self.async_batch_scrape_urls(urls, idempotency_key=idempotency_key, **(params.dict(exclude_none=True) if params else {}))
2975
+ if batch_response.get('success') and 'id' in batch_response:
2976
+ return AsyncCrawlWatcher(batch_response['id'], self)
2977
+ else:
2978
+ raise Exception("Batch scrape job failed to start")
2979
+
2980
+ async def scrape_url(
2981
+ self,
2982
+ url: str,
2983
+ *,
2984
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
2985
+ headers: Optional[Dict[str, str]] = None,
2986
+ include_tags: Optional[List[str]] = None,
2987
+ exclude_tags: Optional[List[str]] = None,
2988
+ only_main_content: Optional[bool] = None,
2989
+ wait_for: Optional[int] = None,
2990
+ timeout: Optional[int] = 30000,
2991
+ location: Optional[LocationConfig] = None,
2992
+ mobile: Optional[bool] = None,
2993
+ skip_tls_verification: Optional[bool] = None,
2994
+ remove_base64_images: Optional[bool] = None,
2995
+ block_ads: Optional[bool] = None,
2996
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
2997
+ parse_pdf: Optional[bool] = None,
2998
+ extract: Optional[JsonConfig] = None,
2999
+ json_options: Optional[JsonConfig] = None,
3000
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
3001
+ agent: Optional[AgentOptions] = None,
3002
+ **kwargs) -> ScrapeResponse[Any]:
3003
+ """
3004
+ Scrape a single URL asynchronously.
3005
+
3006
+ Args:
3007
+ url (str): Target URL to scrape
3008
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]]): Content types to retrieve (markdown/html/etc)
3009
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
3010
+ include_tags (Optional[List[str]]): HTML tags to include
3011
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
3012
+ only_main_content (Optional[bool]): Extract main content only
3013
+ wait_for (Optional[int]): Wait time in milliseconds
3014
+ timeout (Optional[int]): Request timeout (ms)
3015
+ location (Optional[LocationConfig]): Location configuration
3016
+ mobile (Optional[bool]): Use mobile user agent
3017
+ skip_tls_verification (Optional[bool]): Skip TLS verification
3018
+ remove_base64_images (Optional[bool]): Remove base64 images
3019
+ block_ads (Optional[bool]): Block ads
3020
+ proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth/auto)
3021
+ extract (Optional[JsonConfig]): Content extraction settings
3022
+ json_options (Optional[JsonConfig]): JSON extraction settings
3023
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
3024
+ agent (Optional[AgentOptions]): Agent configuration for FIRE-1 model
3025
+ **kwargs: Additional parameters to pass to the API
3026
+
3027
+ Returns:
3028
+ ScrapeResponse with:
3029
+ * success - Whether scrape was successful
3030
+ * markdown - Markdown content if requested
3031
+ * html - HTML content if requested
3032
+ * rawHtml - Raw HTML content if requested
3033
+ * links - Extracted links if requested
3034
+ * screenshot - Screenshot if requested
3035
+ * extract - Extracted data if requested
3036
+ * json - JSON data if requested
3037
+ * error - Error message if scrape failed
3038
+
3039
+ Raises:
3040
+ Exception: If scraping fails
3041
+ """
3042
+ # Validate any additional kwargs
3043
+ self._validate_kwargs(kwargs, "scrape_url")
3044
+
3045
+ _headers = self._prepare_headers()
3046
+
3047
+ # Build scrape parameters
3048
+ scrape_params = {
3049
+ 'url': url,
3050
+ 'origin': f"python-sdk@{version}"
3051
+ }
3052
+
3053
+ # Add optional parameters if provided and not None
3054
+ if formats:
3055
+ scrape_params['formats'] = formats
3056
+ if headers:
3057
+ scrape_params['headers'] = headers
3058
+ if include_tags:
3059
+ scrape_params['includeTags'] = include_tags
3060
+ if exclude_tags:
3061
+ scrape_params['excludeTags'] = exclude_tags
3062
+ if only_main_content is not None:
3063
+ scrape_params['onlyMainContent'] = only_main_content
3064
+ if wait_for:
3065
+ scrape_params['waitFor'] = wait_for
3066
+ if timeout:
3067
+ scrape_params['timeout'] = timeout
3068
+ if location:
3069
+ scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
3070
+ if mobile is not None:
3071
+ scrape_params['mobile'] = mobile
3072
+ if skip_tls_verification is not None:
3073
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3074
+ if remove_base64_images is not None:
3075
+ scrape_params['removeBase64Images'] = remove_base64_images
3076
+ if block_ads is not None:
3077
+ scrape_params['blockAds'] = block_ads
3078
+ if proxy:
3079
+ scrape_params['proxy'] = proxy
3080
+ if parse_pdf is not None:
3081
+ scrape_params['parsePDF'] = parse_pdf
3082
+ if extract is not None:
3083
+ extract = self._ensure_schema_dict(extract)
3084
+ if isinstance(extract, dict) and "schema" in extract:
3085
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
3086
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
3087
+ if json_options is not None:
3088
+ json_options = self._ensure_schema_dict(json_options)
3089
+ if isinstance(json_options, dict) and "schema" in json_options:
3090
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3091
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
3092
+ if actions:
3093
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(by_alias=True, exclude_none=True) for action in actions]
3094
+ if agent is not None:
3095
+ scrape_params['agent'] = agent.dict(by_alias=True, exclude_none=True)
3096
+ if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
3097
+ scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
3098
+ if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
3099
+ scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
3100
+
3101
+ # Make async request
3102
+ endpoint = '/v1/scrape'
3103
+ response = await self._async_post_request(
3104
+ f'{self.api_url}{endpoint}',
3105
+ scrape_params,
3106
+ _headers
3107
+ )
3108
+
3109
+ if response.get('success') and 'data' in response:
3110
+ return ScrapeResponse(**response['data'])
3111
+ elif "error" in response:
3112
+ raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
3113
+ else:
3114
+ # Use the response content directly if possible, otherwise a generic message
3115
+ error_content = response.get('error', str(response))
3116
+ raise Exception(f'Failed to scrape URL. Error: {error_content}')
3117
+
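# Editorial usage sketch for the async scrape_url above. URL, API key and import path
# are placeholders; the markdown attribute mirrors the ScrapeResponse fields listed in
# the docstring.
import asyncio
from firecrawl import AsyncFirecrawlApp  # import path assumed

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    page = await app.scrape_url("https://example.com", formats=["markdown", "links"], only_main_content=True)
    print((page.markdown or "")[:200])

# asyncio.run(main())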
3118
+ async def batch_scrape_urls(
3119
+ self,
3120
+ urls: List[str],
3121
+ *,
3122
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3123
+ headers: Optional[Dict[str, str]] = None,
3124
+ include_tags: Optional[List[str]] = None,
3125
+ exclude_tags: Optional[List[str]] = None,
3126
+ only_main_content: Optional[bool] = None,
3127
+ wait_for: Optional[int] = None,
3128
+ timeout: Optional[int] = 30000,
3129
+ location: Optional[LocationConfig] = None,
3130
+ mobile: Optional[bool] = None,
3131
+ skip_tls_verification: Optional[bool] = None,
3132
+ remove_base64_images: Optional[bool] = None,
3133
+ block_ads: Optional[bool] = None,
3134
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
3135
+ extract: Optional[JsonConfig] = None,
3136
+ json_options: Optional[JsonConfig] = None,
3137
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
3138
+ agent: Optional[AgentOptions] = None,
3139
+ poll_interval: Optional[int] = 2,
3140
+ idempotency_key: Optional[str] = None,
3141
+ **kwargs
3142
+ ) -> BatchScrapeStatusResponse:
3143
+ """
3144
+ Asynchronously scrape multiple URLs and monitor until completion.
3145
+
3146
+ Args:
3147
+ urls (List[str]): URLs to scrape
3148
+ formats (Optional[List[Literal]]): Content formats to retrieve
3149
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
3150
+ include_tags (Optional[List[str]]): HTML tags to include
3151
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
3152
+ only_main_content (Optional[bool]): Extract main content only
3153
+ wait_for (Optional[int]): Wait time in milliseconds
3154
+ timeout (Optional[int]): Request timeout in milliseconds
3155
+ location (Optional[LocationConfig]): Location configuration
3156
+ mobile (Optional[bool]): Use mobile user agent
3157
+ skip_tls_verification (Optional[bool]): Skip TLS verification
3158
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
3159
+ block_ads (Optional[bool]): Block advertisements
3160
+ proxy (Optional[Literal]): Proxy type to use
3161
+ extract (Optional[JsonConfig]): Content extraction config
3162
+ json_options (Optional[JsonConfig]): JSON extraction config
3163
+ actions (Optional[List[Union]]): Actions to perform
3164
+ agent (Optional[AgentOptions]): Agent configuration
3165
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
3166
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3167
+ **kwargs: Additional parameters to pass to the API
3168
+
3169
+ Returns:
3170
+ BatchScrapeStatusResponse with:
3171
+ * Scraping status and progress
3172
+ * Scraped content for each URL
3173
+ * Success/error information
3174
+
3175
+ Raises:
3176
+ Exception: If batch scrape fails
3177
+ """
3178
+ # Validate any additional kwargs
3179
+ self._validate_kwargs(kwargs, "batch_scrape_urls")
3180
+
3181
+ scrape_params = {}
3182
+
3183
+ # Add individual parameters
3184
+ if formats is not None:
3185
+ scrape_params['formats'] = formats
3186
+ if headers is not None:
3187
+ scrape_params['headers'] = headers
3188
+ if include_tags is not None:
3189
+ scrape_params['includeTags'] = include_tags
3190
+ if exclude_tags is not None:
3191
+ scrape_params['excludeTags'] = exclude_tags
3192
+ if only_main_content is not None:
3193
+ scrape_params['onlyMainContent'] = only_main_content
3194
+ if wait_for is not None:
3195
+ scrape_params['waitFor'] = wait_for
3196
+ if timeout is not None:
3197
+ scrape_params['timeout'] = timeout
3198
+ if location is not None:
3199
+ scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
3200
+ if mobile is not None:
3201
+ scrape_params['mobile'] = mobile
3202
+ if skip_tls_verification is not None:
3203
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3204
+ if remove_base64_images is not None:
3205
+ scrape_params['removeBase64Images'] = remove_base64_images
3206
+ if block_ads is not None:
3207
+ scrape_params['blockAds'] = block_ads
3208
+ if proxy is not None:
3209
+ scrape_params['proxy'] = proxy
3210
+ if extract is not None:
3211
+ extract = self._ensure_schema_dict(extract)
3212
+ if isinstance(extract, dict) and "schema" in extract:
3213
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
3214
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
3215
+ if json_options is not None:
3216
+ json_options = self._ensure_schema_dict(json_options)
3217
+ if isinstance(json_options, dict) and "schema" in json_options:
3218
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3219
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
3220
+ if actions is not None:
3221
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(by_alias=True, exclude_none=True) for action in actions]
3222
+ if agent is not None:
3223
+ scrape_params['agent'] = agent.dict(by_alias=True, exclude_none=True)
3224
+
3225
+ # Add any additional kwargs
3226
+ scrape_params.update(kwargs)
3227
+
3228
+ # Create final params object
3229
+ final_params = ScrapeParams(**scrape_params)
3230
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
3231
+ params_dict['urls'] = urls
3232
+ params_dict['origin'] = f"python-sdk@{version}"
3233
+
3234
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3235
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3236
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3237
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3238
+
3239
+ # Make request
3240
+ headers = self._prepare_headers(idempotency_key)
3241
+ response = await self._async_post_request(
3242
+ f'{self.api_url}/v1/batch/scrape',
3243
+ params_dict,
3244
+ headers
3245
+ )
3246
+
3247
+ if response.get('success'):
3248
+ try:
3249
+ id = response.get('id')
3250
+ except:
3251
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3252
+ return await self._async_monitor_job_status(id, headers, poll_interval)
3253
+ else:
3254
+ await self._handle_error(response, 'start batch scrape job')
3255
+
3256
+
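# Editorial sketch: batch_scrape_urls starts the job and keeps polling every
# poll_interval seconds until it completes. URLs and API key are placeholders.
import asyncio
from firecrawl import AsyncFirecrawlApp  # import path assumed

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    job = await app.batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
        poll_interval=5,
    )
    print(job.status, f"{job.completed}/{job.total}")

# asyncio.run(main())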
3257
+ async def async_batch_scrape_urls(
3258
+ self,
3259
+ urls: List[str],
3260
+ *,
3261
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3262
+ headers: Optional[Dict[str, str]] = None,
3263
+ include_tags: Optional[List[str]] = None,
3264
+ exclude_tags: Optional[List[str]] = None,
3265
+ only_main_content: Optional[bool] = None,
3266
+ wait_for: Optional[int] = None,
3267
+ timeout: Optional[int] = 30000,
3268
+ location: Optional[LocationConfig] = None,
3269
+ mobile: Optional[bool] = None,
3270
+ skip_tls_verification: Optional[bool] = None,
3271
+ remove_base64_images: Optional[bool] = None,
3272
+ block_ads: Optional[bool] = None,
3273
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
3274
+ extract: Optional[JsonConfig] = None,
3275
+ json_options: Optional[JsonConfig] = None,
3276
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
3277
+ agent: Optional[AgentOptions] = None,
3278
+ zero_data_retention: Optional[bool] = None,
3279
+ idempotency_key: Optional[str] = None,
3280
+ **kwargs
3281
+ ) -> BatchScrapeResponse:
3282
+ """
3283
+ Initiate a batch scrape job asynchronously.
3284
+
3285
+ Args:
3286
+ urls (List[str]): URLs to scrape
3287
+ formats (Optional[List[Literal]]): Content formats to retrieve
3288
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
3289
+ include_tags (Optional[List[str]]): HTML tags to include
3290
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
3291
+ only_main_content (Optional[bool]): Extract main content only
3292
+ wait_for (Optional[int]): Wait time in milliseconds
3293
+ timeout (Optional[int]): Request timeout in milliseconds
3294
+ location (Optional[LocationConfig]): Location configuration
3295
+ mobile (Optional[bool]): Use mobile user agent
3296
+ skip_tls_verification (Optional[bool]): Skip TLS verification
3297
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
3298
+ block_ads (Optional[bool]): Block advertisements
3299
+ proxy (Optional[Literal]): Proxy type to use
3300
+ extract (Optional[JsonConfig]): Content extraction config
3301
+ json_options (Optional[JsonConfig]): JSON extraction config
3302
+ actions (Optional[List[Union]]): Actions to perform
3303
+ agent (Optional[AgentOptions]): Agent configuration
3304
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
3305
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3306
+ **kwargs: Additional parameters to pass to the API
3307
+
3308
+ Returns:
3309
+ BatchScrapeResponse with:
3310
+ * success - Whether job started successfully
3311
+ * id - Unique identifier for the job
3312
+ * url - Status check URL
3313
+ * error - Error message if start failed
3314
+
3315
+ Raises:
3316
+ Exception: If job initiation fails
3317
+ """
3318
+ # Validate any additional kwargs
3319
+ self._validate_kwargs(kwargs, "async_batch_scrape_urls")
3320
+
3321
+ scrape_params = {}
3322
+
3323
+ # Add individual parameters
3324
+ if formats is not None:
3325
+ scrape_params['formats'] = formats
3326
+ if headers is not None:
3327
+ scrape_params['headers'] = headers
3328
+ if include_tags is not None:
3329
+ scrape_params['includeTags'] = include_tags
3330
+ if exclude_tags is not None:
3331
+ scrape_params['excludeTags'] = exclude_tags
3332
+ if only_main_content is not None:
3333
+ scrape_params['onlyMainContent'] = only_main_content
3334
+ if wait_for is not None:
3335
+ scrape_params['waitFor'] = wait_for
3336
+ if timeout is not None:
3337
+ scrape_params['timeout'] = timeout
3338
+ if location is not None:
3339
+ scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
3340
+ if mobile is not None:
3341
+ scrape_params['mobile'] = mobile
3342
+ if skip_tls_verification is not None:
3343
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3344
+ if remove_base64_images is not None:
3345
+ scrape_params['removeBase64Images'] = remove_base64_images
3346
+ if block_ads is not None:
3347
+ scrape_params['blockAds'] = block_ads
3348
+ if proxy is not None:
3349
+ scrape_params['proxy'] = proxy
3350
+ if extract is not None:
3351
+ extract = self._ensure_schema_dict(extract)
3352
+ if isinstance(extract, dict) and "schema" in extract:
3353
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
3354
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
3355
+ if json_options is not None:
3356
+ json_options = self._ensure_schema_dict(json_options)
3357
+ if isinstance(json_options, dict) and "schema" in json_options:
3358
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3359
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
3360
+ if actions:
3361
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(by_alias=True, exclude_none=True) for action in actions]
3362
+ if agent is not None:
3363
+ scrape_params['agent'] = agent.dict(by_alias=True, exclude_none=True)
3364
+ if zero_data_retention is not None:
3365
+ scrape_params['zeroDataRetention'] = zero_data_retention
3366
+
3367
+ # Add any additional kwargs
3368
+ scrape_params.update(kwargs)
3369
+
3370
+ # Create final params object
3371
+ final_params = ScrapeParams(**scrape_params)
3372
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
3373
+ params_dict['urls'] = urls
3374
+ params_dict['origin'] = f"python-sdk@{version}"
3375
+
3376
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3377
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3378
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3379
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3380
+
3381
+ # Make request
3382
+ headers = self._prepare_headers(idempotency_key)
3383
+ response = await self._async_post_request(
3384
+ f'{self.api_url}/v1/batch/scrape',
3385
+ params_dict,
3386
+ headers
3387
+ )
3388
+
3389
+ if response.get('success'):
3390
+ try:
3391
+ return BatchScrapeResponse(**response)
3392
+ except:
3393
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3394
+ else:
3395
+ await self._handle_error(response, 'start batch scrape job')
3396
+
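# Editorial sketch: async_batch_scrape_urls only submits the job; the returned id can
# then be polled with check_batch_scrape_status (defined later in this file). All
# concrete values are placeholders.
import asyncio
from firecrawl import AsyncFirecrawlApp  # import path assumed

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    started = await app.async_batch_scrape_urls(["https://example.com"], formats=["markdown"])
    status = await app.check_batch_scrape_status(started.id)
    print(started.id, status.status)

# asyncio.run(main())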
3397
+ async def crawl_url(
3398
+ self,
3399
+ url: str,
3400
+ *,
3401
+ include_paths: Optional[List[str]] = None,
3402
+ exclude_paths: Optional[List[str]] = None,
3403
+ max_depth: Optional[int] = None,
3404
+ max_discovery_depth: Optional[int] = None,
3405
+ limit: Optional[int] = None,
3406
+ allow_backward_links: Optional[bool] = None,
3407
+ crawl_entire_domain: Optional[bool] = None,
3408
+ allow_external_links: Optional[bool] = None,
3409
+ ignore_sitemap: Optional[bool] = None,
3410
+ scrape_options: Optional[ScrapeOptions] = None,
3411
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3412
+ deduplicate_similar_urls: Optional[bool] = None,
3413
+ ignore_query_parameters: Optional[bool] = None,
3414
+ regex_on_full_url: Optional[bool] = None,
3415
+ delay: Optional[int] = None,
3416
+ allow_subdomains: Optional[bool] = None,
3417
+ poll_interval: Optional[int] = 2,
3418
+ idempotency_key: Optional[str] = None,
3419
+ **kwargs
3420
+ ) -> CrawlStatusResponse:
3421
+ """
3422
+ Crawl a website starting from a URL.
3423
+
3424
+ Args:
3425
+ url (str): Target URL to start crawling from
3426
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3427
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3428
+ max_depth (Optional[int]): Maximum crawl depth
3429
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3430
+ limit (Optional[int]): Maximum pages to crawl
3431
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
3432
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
3433
+ allow_external_links (Optional[bool]): Follow external domain links
3434
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3435
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3436
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3437
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3438
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3439
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3440
+ delay (Optional[int]): Delay in seconds between scrapes
3441
+ allow_subdomains (Optional[bool]): Follow subdomains
3442
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
3443
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3444
+ **kwargs: Additional parameters to pass to the API
3445
+
3446
+ Returns:
3447
+ CrawlStatusResponse with:
3448
+ * Crawling status and progress
3449
+ * Crawled page contents
3450
+ * Success/error information
3451
+
3452
+ Raises:
3453
+ Exception: If crawl fails
3454
+ """
3455
+ # Validate any additional kwargs
3456
+ self._validate_kwargs(kwargs, "crawl_url")
3457
+
3458
+ crawl_params = {}
3459
+
3460
+ # Add individual parameters
3461
+ if include_paths is not None:
3462
+ crawl_params['includePaths'] = include_paths
3463
+ if exclude_paths is not None:
3464
+ crawl_params['excludePaths'] = exclude_paths
3465
+ if max_depth is not None:
3466
+ crawl_params['maxDepth'] = max_depth
3467
+ if max_discovery_depth is not None:
3468
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3469
+ if limit is not None:
3470
+ crawl_params['limit'] = limit
3471
+ if crawl_entire_domain is not None:
3472
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
3473
+ elif allow_backward_links is not None:
3474
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3475
+ if allow_external_links is not None:
3476
+ crawl_params['allowExternalLinks'] = allow_external_links
3477
+ if ignore_sitemap is not None:
3478
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3479
+ if scrape_options is not None:
3480
+ crawl_params['scrapeOptions'] = scrape_options.dict(by_alias=True, exclude_none=True)
3481
+ if webhook is not None:
3482
+ crawl_params['webhook'] = webhook
3483
+ if deduplicate_similar_urls is not None:
3484
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3485
+ if ignore_query_parameters is not None:
3486
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3487
+ if regex_on_full_url is not None:
3488
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3489
+ if delay is not None:
3490
+ crawl_params['delay'] = delay
3491
+ if allow_subdomains is not None:
3492
+ crawl_params['allowSubdomains'] = allow_subdomains
3493
+
3494
+ # Add any additional kwargs
3495
+ crawl_params.update(kwargs)
3496
+
3497
+ # Create final params object
3498
+ final_params = CrawlParams(**crawl_params)
3499
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
3500
+ params_dict['url'] = url
3501
+ params_dict['origin'] = f"python-sdk@{version}"
3502
+ # Make request
3503
+ headers = self._prepare_headers(idempotency_key)
3504
+ response = await self._async_post_request(
3505
+ f'{self.api_url}/v1/crawl', params_dict, headers)
3506
+
3507
+ if response.get('success'):
3508
+ try:
3509
+ id = response.get('id')
3510
+ except:
3511
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3512
+ return await self._async_monitor_job_status(id, headers, poll_interval)
3513
+ else:
3514
+ await self._handle_error(response, 'start crawl job')
3515
+
3516
+
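# Editorial sketch: crawl_url blocks until the crawl finishes and returns the
# aggregated CrawlStatusResponse. The URL, limits and API key are placeholders.
import asyncio
from firecrawl import AsyncFirecrawlApp  # import path assumed

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    crawl = await app.crawl_url("https://example.com", limit=10, exclude_paths=["/blog/.*"], poll_interval=5)
    print(crawl.status, len(crawl.data or []))

# asyncio.run(main())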
3517
+ async def async_crawl_url(
3518
+ self,
3519
+ url: str,
3520
+ *,
3521
+ include_paths: Optional[List[str]] = None,
3522
+ exclude_paths: Optional[List[str]] = None,
3523
+ max_depth: Optional[int] = None,
3524
+ max_discovery_depth: Optional[int] = None,
3525
+ limit: Optional[int] = None,
3526
+ allow_backward_links: Optional[bool] = None,
3527
+ crawl_entire_domain: Optional[bool] = None,
3528
+ allow_external_links: Optional[bool] = None,
3529
+ ignore_sitemap: Optional[bool] = None,
3530
+ scrape_options: Optional[ScrapeOptions] = None,
3531
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3532
+ deduplicate_similar_urls: Optional[bool] = None,
3533
+ ignore_query_parameters: Optional[bool] = None,
3534
+ regex_on_full_url: Optional[bool] = None,
3535
+ delay: Optional[int] = None,
3536
+ allow_subdomains: Optional[bool] = None,
3537
+ poll_interval: Optional[int] = 2,
3538
+ idempotency_key: Optional[str] = None,
3539
+ **kwargs
3540
+ ) -> CrawlResponse:
3541
+ """
3542
+ Start an asynchronous crawl job.
3543
+
3544
+ Args:
3545
+ url (str): Target URL to start crawling from
3546
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3547
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3548
+ max_depth (Optional[int]): Maximum crawl depth
3549
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3550
+ limit (Optional[int]): Maximum pages to crawl
3551
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
3552
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
3553
+ allow_external_links (Optional[bool]): Follow external domain links
3554
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3555
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3556
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3557
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3558
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3559
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3560
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3561
+ **kwargs: Additional parameters to pass to the API
3562
+
3563
+ Returns:
3564
+ CrawlResponse with:
3565
+ * success - Whether crawl started successfully
3566
+ * id - Unique identifier for the crawl job
3567
+ * url - Status check URL for the crawl
3568
+ * error - Error message if start failed
3569
+
3570
+ Raises:
3571
+ Exception: If crawl initiation fails
3572
+ """
3573
+ crawl_params = {}
3574
+
3575
+ # Add individual parameters
3576
+ if include_paths is not None:
3577
+ crawl_params['includePaths'] = include_paths
3578
+ if exclude_paths is not None:
3579
+ crawl_params['excludePaths'] = exclude_paths
3580
+ if max_depth is not None:
3581
+ crawl_params['maxDepth'] = max_depth
3582
+ if max_discovery_depth is not None:
3583
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3584
+ if limit is not None:
3585
+ crawl_params['limit'] = limit
3586
+ if crawl_entire_domain is not None:
3587
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
3588
+ elif allow_backward_links is not None:
3589
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3590
+ if allow_external_links is not None:
3591
+ crawl_params['allowExternalLinks'] = allow_external_links
3592
+ if ignore_sitemap is not None:
3593
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3594
+ if scrape_options is not None:
3595
+ crawl_params['scrapeOptions'] = scrape_options.dict(by_alias=True, exclude_none=True)
3596
+ if webhook is not None:
3597
+ crawl_params['webhook'] = webhook
3598
+ if deduplicate_similar_urls is not None:
3599
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3600
+ if ignore_query_parameters is not None:
3601
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3602
+ if regex_on_full_url is not None:
3603
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3604
+ if delay is not None:
3605
+ crawl_params['delay'] = delay
3606
+ if allow_subdomains is not None:
3607
+ crawl_params['allowSubdomains'] = allow_subdomains
3608
+
3609
+ # Add any additional kwargs
3610
+ crawl_params.update(kwargs)
3611
+
3612
+ # Create final params object
3613
+ final_params = CrawlParams(**crawl_params)
3614
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
3615
+ params_dict['url'] = url
3616
+ params_dict['origin'] = f"python-sdk@{version}"
3617
+
3618
+ # Make request
3619
+ headers = self._prepare_headers(idempotency_key)
3620
+ response = await self._async_post_request(
3621
+ f'{self.api_url}/v1/crawl',
3622
+ params_dict,
3623
+ headers
3624
+ )
3625
+
3626
+ if response.get('success'):
3627
+ try:
3628
+ return CrawlResponse(**response)
3629
+ except:
3630
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3631
+ else:
3632
+ await self._handle_error(response, 'start crawl job')
3633
+
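# Editorial sketch: async_crawl_url returns immediately with the job id, which can be
# checked later with check_crawl_status (next method below). Values are placeholders.
import asyncio
from firecrawl import AsyncFirecrawlApp  # import path assumed

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    started = await app.async_crawl_url("https://example.com", limit=25)
    status = await app.check_crawl_status(started.id)
    print(status.status, f"{status.completed}/{status.total}")

# asyncio.run(main())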
3634
+ async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
3635
+ """
3636
+ Check the status and results of an asynchronous crawl job.
3637
+
3638
+ Args:
3639
+ id (str): Unique identifier for the crawl job
3640
+
3641
+ Returns:
3642
+ CrawlStatusResponse containing:
3643
+ Status Information:
3644
+ * status - Current state (scraping/completed/failed/cancelled)
3645
+ * completed - Number of pages crawled
3646
+ * total - Total pages to crawl
3647
+ * creditsUsed - API credits consumed
3648
+ * expiresAt - Data expiration timestamp
3649
+
3650
+ Results:
3651
+ * data - List of crawled documents
3652
+ * next - URL for next page of results (if paginated)
3653
+ * success - Whether status check succeeded
3654
+ * error - Error message if failed
3655
+
3656
+ Raises:
3657
+ Exception: If status check fails
3658
+ """
3659
+ headers = self._prepare_headers()
3660
+ endpoint = f'/v1/crawl/{id}'
3661
+
3662
+ status_data = await self._async_get_request(
3663
+ f'{self.api_url}{endpoint}',
3664
+ headers
3665
+ )
3666
+
3667
+ if status_data.get('status') == 'completed':
3668
+ if 'data' in status_data:
3669
+ data = status_data['data']
3670
+ while 'next' in status_data:
3671
+ if len(status_data['data']) == 0:
3672
+ break
3673
+ next_url = status_data.get('next')
3674
+ if not next_url:
3675
+ logger.warning("Expected 'next' URL is missing.")
3676
+ break
3677
+ next_data = await self._async_get_request(next_url, headers)
3678
+ data.extend(next_data.get('data', []))
3679
+ status_data = next_data
3680
+ status_data['data'] = data
3681
+ # Create CrawlStatusResponse object from status data
3682
+ response = CrawlStatusResponse(
3683
+ status=status_data.get('status'),
3684
+ total=status_data.get('total'),
3685
+ completed=status_data.get('completed'),
3686
+ creditsUsed=status_data.get('creditsUsed'),
3687
+ expiresAt=status_data.get('expiresAt'),
3688
+ data=status_data.get('data'),
3689
+ success=False if 'error' in status_data else True
3690
+ )
3691
+
3692
+ if 'error' in status_data:
3693
+ response.error = status_data.get('error')
3694
+
3695
+ if 'next' in status_data:
3696
+ response.next = status_data.get('next')
3697
+
3698
+ return response
3699
+
3700
+ async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
3701
+ """
3702
+ Monitor the status of an asynchronous job until completion.
3703
+
3704
+ Args:
3705
+ id (str): The ID of the job to monitor
3706
+ headers (Dict[str, str]): Headers to include in status check requests
3707
+ poll_interval (int): Seconds between status checks (default: 2)
3708
+
3709
+ Returns:
3710
+ CrawlStatusResponse: The job results if completed successfully
3711
+
3712
+ Raises:
3713
+ Exception: If the job fails or an error occurs during status checks
3714
+ """
3715
+ while True:
3716
+ status_data = await self._async_get_request(
3717
+ f'{self.api_url}/v1/crawl/{id}',
3718
+ headers
3719
+ )
3720
+
3721
+ if status_data.get('status') == 'completed':
3722
+ if 'data' in status_data:
3723
+ data = status_data['data']
3724
+ while 'next' in status_data:
3725
+ if len(status_data['data']) == 0:
3726
+ break
3727
+ next_url = status_data.get('next')
3728
+ if not next_url:
3729
+ logger.warning("Expected 'next' URL is missing.")
3730
+ break
3731
+ next_data = await self._async_get_request(next_url, headers)
3732
+ data.extend(next_data.get('data', []))
3733
+ status_data = next_data
3734
+ status_data['data'] = data
3735
+ return CrawlStatusResponse(**status_data)
3736
+ else:
3737
+ raise Exception('Job completed but no data was returned')
3738
+ elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
3739
+ await asyncio.sleep(max(poll_interval, 2))
3740
+ else:
3741
+ raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
3742
+
3743
+ async def map_url(
3744
+ self,
3745
+ url: str,
3746
+ *,
3747
+ search: Optional[str] = None,
3748
+ ignore_sitemap: Optional[bool] = None,
3749
+ include_subdomains: Optional[bool] = None,
3750
+ sitemap_only: Optional[bool] = None,
3751
+ limit: Optional[int] = None,
3752
+ timeout: Optional[int] = 30000,
3753
+ params: Optional[MapParams] = None) -> MapResponse:
3754
+ """
3755
+ Asynchronously map and discover links from a URL.
3756
+
3757
+ Args:
3758
+ url (str): Target URL to map
3759
+ params (Optional[MapParams]): See MapParams model:
3760
+ Discovery Options:
3761
+ * search - Filter pattern for URLs
3762
+ * ignoreSitemap - Skip sitemap.xml
3763
+ * includeSubdomains - Include subdomain links
3764
+ * sitemapOnly - Only use sitemap.xml
3765
+
3766
+ Limits:
3767
+ * limit - Max URLs to return
3768
+ * timeout - Request timeout (ms)
3769
+
3770
+ Returns:
3771
+ MapResponse with:
3772
+ * Discovered URLs
3773
+ * Success/error status
3774
+
3775
+ Raises:
3776
+ Exception: If mapping fails
3777
+ """
3778
+ map_params = {}
3779
+ if params:
3780
+ map_params.update(params.dict(by_alias=True, exclude_none=True))
3781
+
3782
+ # Add individual parameters
3783
+ if search is not None:
3784
+ map_params['search'] = search
3785
+ if ignore_sitemap is not None:
3786
+ map_params['ignoreSitemap'] = ignore_sitemap
3787
+ if include_subdomains is not None:
3788
+ map_params['includeSubdomains'] = include_subdomains
3789
+ if sitemap_only is not None:
3790
+ map_params['sitemapOnly'] = sitemap_only
3791
+ if limit is not None:
3792
+ map_params['limit'] = limit
3793
+ if timeout is not None:
3794
+ map_params['timeout'] = timeout
3795
+
3796
+ # Create final params object
3797
+ final_params = MapParams(**map_params)
3798
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
3799
+ params_dict['url'] = url
3800
+ params_dict['origin'] = f"python-sdk@{version}"
3801
+
3802
+ # Make request
3803
+ endpoint = '/v1/map'
3804
+ response = await self._async_post_request(
3805
+ f'{self.api_url}{endpoint}',
3806
+ params_dict,
3807
+ headers={"Authorization": f"Bearer {self.api_key}"}
3808
+ )
3809
+
3810
+ if response.get('success') and 'links' in response:
3811
+ return MapResponse(**response)
3812
+ elif 'error' in response:
3813
+ raise Exception(f'Failed to map URL. Error: {response["error"]}')
3814
+ else:
3815
+ raise Exception(f'Failed to map URL. Error: {response}')
3816
+
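# Editorial sketch: map_url discovers links for a site; the search filter and limit
# below are illustrative, and the links attribute mirrors the success check above.
import asyncio
from firecrawl import AsyncFirecrawlApp  # import path assumed

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    result = await app.map_url("https://example.com", search="docs", limit=50)
    for link in (result.links or [])[:10]:
        print(link)

# asyncio.run(main())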
3817
+ async def extract(
3818
+ self,
3819
+ urls: Optional[List[str]] = None,
3820
+ *,
3821
+ prompt: Optional[str] = None,
3822
+ schema: Optional[Any] = None,
3823
+ system_prompt: Optional[str] = None,
3824
+ allow_external_links: Optional[bool] = False,
3825
+ enable_web_search: Optional[bool] = False,
3826
+ show_sources: Optional[bool] = False,
3827
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3828
+
3829
+ """
3830
+ Asynchronously extract structured information from URLs.
3831
+
3832
+ Args:
3833
+ urls (Optional[List[str]]): URLs to extract from
3834
+ prompt (Optional[str]): Custom extraction prompt
3835
+ schema (Optional[Any]): JSON schema/Pydantic model
3836
+ system_prompt (Optional[str]): System context
3837
+ allow_external_links (Optional[bool]): Follow external links
3838
+ enable_web_search (Optional[bool]): Enable web search
3839
+ show_sources (Optional[bool]): Include source URLs
3840
+ agent (Optional[Dict[str, Any]]): Agent configuration
3841
+
3842
+ Returns:
3843
+ ExtractResponse with:
3844
+ * Structured data matching schema
3845
+ * Source information if requested
3846
+ * Success/error status
3847
+
3848
+ Raises:
3849
+ ValueError: If prompt/schema missing or extraction fails
3850
+ """
3851
+ headers = self._prepare_headers()
3852
+
3853
+ if not prompt and not schema:
3854
+ raise ValueError("Either prompt or schema is required")
3855
+
3856
+ if not urls and not prompt:
3857
+ raise ValueError("Either urls or prompt is required")
3858
+
3859
+ if schema:
3860
+ schema = self._ensure_schema_dict(schema)
3861
+
3862
+ request_data = {
3863
+ 'urls': urls or [],
3864
+ 'allowExternalLinks': allow_external_links,
3865
+ 'enableWebSearch': enable_web_search,
3866
+ 'showSources': show_sources,
3867
+ 'schema': schema,
3868
+ 'origin': f'python-sdk@{get_version()}'
3869
+ }
3870
+
3871
+ # Only add prompt and systemPrompt if they exist
3872
+ if prompt:
3873
+ request_data['prompt'] = prompt
3874
+ if system_prompt:
3875
+ request_data['systemPrompt'] = system_prompt
3876
+
3877
+ if agent:
3878
+ request_data['agent'] = agent
3879
+
3880
+ response = await self._async_post_request(
3881
+ f'{self.api_url}/v1/extract',
3882
+ request_data,
3883
+ headers
3884
+ )
3885
+
3886
+ if response.get('success'):
3887
+ job_id = response.get('id')
3888
+ if not job_id:
3889
+ raise Exception('Job ID not returned from extract request.')
3890
+
3891
+ while True:
3892
+ status_data = await self._async_get_request(
3893
+ f'{self.api_url}/v1/extract/{job_id}',
3894
+ headers
3895
+ )
3896
+
3897
+ if status_data['status'] == 'completed':
3898
+ return ExtractResponse(**status_data)
3899
+ elif status_data['status'] in ['failed', 'cancelled']:
3900
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data.get("error")}')
3901
+
3902
+ await asyncio.sleep(2)
3903
+ else:
3904
+ raise Exception(f'Failed to extract. Error: {response.get("error")}')
3905
+
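# Editorial sketch: extract with a Pydantic schema; _ensure_schema_dict (defined above)
# converts the model class to a JSON schema before the request is sent. The model, URL
# and API key are placeholders.
import asyncio
from pydantic import BaseModel
from firecrawl import AsyncFirecrawlApp  # import path assumed

class PageSummary(BaseModel):
    title: str
    summary: str

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    result = await app.extract(["https://example.com"], prompt="Summarise the page", schema=PageSummary)
    print(result.data)

# asyncio.run(main())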
3906
+ async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
3907
+ """
3908
+ Check the status of an asynchronous batch scrape job.
3909
+
3910
+ Args:
3911
+ id (str): The ID of the batch scrape job
3912
+
3913
+ Returns:
3914
+ BatchScrapeStatusResponse containing:
3915
+ Status Information:
3916
+ * status - Current state (scraping/completed/failed/cancelled)
3917
+ * completed - Number of URLs scraped
3918
+ * total - Total URLs to scrape
3919
+ * creditsUsed - API credits consumed
3920
+ * expiresAt - Data expiration timestamp
3921
+
3922
+ Results:
3923
+ * data - List of scraped documents
3924
+ * next - URL for next page of results (if paginated)
3925
+ * success - Whether status check succeeded
3926
+ * error - Error message if failed
3927
+
3928
+ Raises:
3929
+ Exception: If status check fails
3930
+ """
3931
+ headers = self._prepare_headers()
3932
+ endpoint = f'/v1/batch/scrape/{id}'
3933
+
3934
+ status_data = await self._async_get_request(
3935
+ f'{self.api_url}{endpoint}',
3936
+ headers
3937
+ )
3938
+
3939
+ if status_data['status'] == 'completed':
3940
+ if 'data' in status_data:
3941
+ data = status_data['data']
3942
+ while 'next' in status_data:
3943
+ if len(status_data['data']) == 0:
3944
+ break
3945
+ next_url = status_data.get('next')
3946
+ if not next_url:
3947
+ logger.warning("Expected 'next' URL is missing.")
3948
+ break
3949
+ next_data = await self._async_get_request(next_url, headers)
3950
+ data.extend(next_data.get('data', []))
3951
+ status_data = next_data
3952
+ status_data['data'] = data
3953
+
3954
+ response = BatchScrapeStatusResponse(
3955
+ status=status_data.get('status'),
3956
+ total=status_data.get('total'),
3957
+ completed=status_data.get('completed'),
3958
+ creditsUsed=status_data.get('creditsUsed'),
3959
+ expiresAt=status_data.get('expiresAt'),
3960
+ data=status_data.get('data'),
+ success=False if 'error' in status_data else True
3961
+ )
3962
+
3963
+ if 'error' in status_data:
3964
+ response.error = status_data.get('error')
3965
+
3966
+ if 'next' in status_data:
3967
+ response.next = status_data.get('next')
3968
+
3969
+ return response
3973
+
3974
+ async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
3975
+ """
3976
+ Get information about errors from an asynchronous batch scrape job.
3977
+
3978
+ Args:
3979
+ id (str): The ID of the batch scrape job
3980
+
3981
+ Returns:
3982
+ CrawlErrorsResponse containing:
3983
+ errors (List[Dict[str, str]]): List of errors with fields:
3984
+ * id (str): Error ID
3985
+ * timestamp (str): When the error occurred
3986
+ * url (str): URL that caused the error
3987
+ * error (str): Error message
3988
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3989
+
3990
+ Raises:
3991
+ Exception: If error check fails
3992
+ """
3993
+ headers = self._prepare_headers()
3994
+ return await self._async_get_request(
3995
+ f'{self.api_url}/v1/batch/scrape/{id}/errors',
3996
+ headers
3997
+ )
3998
+
3999
+ async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
+ """
+ Get information about errors from an asynchronous crawl job.
+
+ Args:
+ id (str): The ID of the crawl job
+
+ Returns:
+ CrawlErrorsResponse containing:
+ * errors (List[Dict[str, str]]): List of errors with fields:
+ - id (str): Error ID
+ - timestamp (str): When the error occurred
+ - url (str): URL that caused the error
+ - error (str): Error message
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
+
+ Raises:
+ Exception: If error check fails
+ """
+ headers = self._prepare_headers()
+ return await self._async_get_request(
+ f'{self.api_url}/v1/crawl/{id}/errors',
+ headers
+ )
+
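Both error endpoints return the same CrawlErrorsResponse shape, so a single hedged helper can surface failures for either job type (same import and placeholder-key assumptions as above):

    import asyncio
    from firecrawl import AsyncFirecrawlApp

    async def report_errors(job_id: str, batch: bool = False) -> None:
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
        report = await (app.check_batch_scrape_errors(job_id) if batch
                        else app.check_crawl_errors(job_id))
        for err in report.get('errors', []):
            print(err.get('url'), '->', err.get('error'))
        for blocked in report.get('robotsBlocked', []):
            print('blocked by robots.txt:', blocked)

    # asyncio.run(report_errors("your-crawl-job-id"))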
+ async def cancel_crawl(self, id: str) -> Dict[str, Any]:
+ """
+ Cancel an asynchronous crawl job.
+
+ Args:
+ id (str): The ID of the crawl job to cancel
+
+ Returns:
+ Dict[str, Any] containing:
+ * success (bool): Whether cancellation was successful
+ * error (str, optional): Error message if cancellation failed
+
+ Raises:
+ Exception: If cancellation fails
+ """
+ headers = self._prepare_headers()
+ async with aiohttp.ClientSession() as session:
+ async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
+ return await response.json()
+
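A short sketch of cancelling a running crawl (same assumptions; the success/error fields follow the docstring above):

    import asyncio
    from firecrawl import AsyncFirecrawlApp

    async def cancel(job_id: str) -> None:
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
        result = await app.cancel_crawl(job_id)
        if result.get('success'):
            print('crawl cancelled')
        else:
            print('cancel failed:', result.get('error'))

    # asyncio.run(cancel("your-crawl-job-id"))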
+ async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
+ """
+ Check the status of an asynchronous extraction job.
+
+ Args:
+ job_id (str): The ID of the extraction job
+
+ Returns:
+ ExtractResponse[Any] with:
+ * success (bool): Whether request succeeded
+ * data (Optional[Any]): Extracted data matching schema
+ * error (Optional[str]): Error message if any
+ * warning (Optional[str]): Warning message if any
+ * sources (Optional[List[str]]): Source URLs if requested
+
+ Raises:
+ ValueError: If status check fails
+ """
+ headers = self._prepare_headers()
+ try:
+ return await self._async_get_request(
+ f'{self.api_url}/v1/extract/{job_id}',
+ headers
+ )
+ except Exception as e:
+ raise ValueError(str(e))
+
+ async def async_extract(
+ self,
+ urls: Optional[List[str]] = None,
+ *,
+ prompt: Optional[str] = None,
+ schema: Optional[Any] = None,
+ system_prompt: Optional[str] = None,
+ allow_external_links: Optional[bool] = False,
+ enable_web_search: Optional[bool] = False,
+ show_sources: Optional[bool] = False,
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
+ """
+ Initiate an asynchronous extraction job without waiting for completion.
+
+ Args:
+ urls (Optional[List[str]]): URLs to extract from
+ prompt (Optional[str]): Custom extraction prompt
+ schema (Optional[Any]): JSON schema/Pydantic model
+ system_prompt (Optional[str]): System context
+ allow_external_links (Optional[bool]): Follow external links
+ enable_web_search (Optional[bool]): Enable web search
+ show_sources (Optional[bool]): Include source URLs
+ agent (Optional[Dict[str, Any]]): Agent configuration
+
+ Returns:
+ ExtractResponse[Any] with:
+ * success (bool): Whether request succeeded
+ * data (Optional[Any]): Extracted data matching schema
+ * error (Optional[str]): Error message if any
+
+ Raises:
+ ValueError: If job initiation fails
+ """
+ headers = self._prepare_headers()
+
+ if not prompt and not schema:
+ raise ValueError("Either prompt or schema is required")
+
+ if not urls and not prompt:
+ raise ValueError("Either urls or prompt is required")
+
+ if schema:
+ schema = self._ensure_schema_dict(schema)
+
+ request_data = ExtractResponse(
+ urls=urls or [],
+ allowExternalLinks=allow_external_links,
+ enableWebSearch=enable_web_search,
+ showSources=show_sources,
+ schema=schema,
+ origin=f'python-sdk@{version}'
+ )
+
+ if prompt:
+ request_data['prompt'] = prompt
+ if system_prompt:
+ request_data['systemPrompt'] = system_prompt
+ if agent:
+ request_data['agent'] = agent
+
+ try:
+ return await self._async_post_request(
+ f'{self.api_url}/v1/extract',
+ request_data,
+ headers
+ )
+ except Exception as e:
+ raise ValueError(str(e))
+
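A hedged start-then-poll sketch for the two extract methods above (assumptions: the start response carries a job id like the other async_* starters in this class, the status payload exposes a status field, and the example schema is illustrative only):

    import asyncio
    from firecrawl import AsyncFirecrawlApp

    async def extract_title(url: str) -> dict:
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
        job = await app.async_extract(
            urls=[url],
            prompt="Extract the page title",
            schema={"type": "object", "properties": {"title": {"type": "string"}}},
        )
        while True:
            status = await app.get_extract_status(job["id"])  # 'id' field in the start response is assumed
            if status.get("status") in ("completed", "failed", "cancelled"):
                return status
            await asyncio.sleep(2)

    # asyncio.run(extract_title("https://example.com"))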
+ async def generate_llms_text(
+ self,
+ url: str,
+ *,
+ max_urls: Optional[int] = None,
+ show_full_text: Optional[bool] = None,
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
+ """
+ Generate LLMs.txt for a given URL and monitor until completion.
+
+ Args:
+ url (str): Target URL to generate LLMs.txt from
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
+ show_full_text (Optional[bool]): Include full text in output (default: False)
+ experimental_stream (Optional[bool]): Enable experimental streaming
+
+ Returns:
+ GenerateLLMsTextStatusResponse containing:
+ * success (bool): Whether generation completed successfully
+ * status (str): Status of generation (processing/completed/failed)
+ * data (Dict[str, str], optional): Generated text with fields:
+ - llmstxt (str): Generated LLMs.txt content
+ - llmsfulltxt (str, optional): Full version if requested
+ * error (str, optional): Error message if generation failed
+ * expiresAt (str): When the generated data expires
+
+ Raises:
+ Exception: If generation fails
+ """
+ response = await self.async_generate_llms_text(
+ url,
+ max_urls=max_urls,
+ show_full_text=show_full_text,
+ experimental_stream=experimental_stream
+ )
+ if not response.get('success') or 'id' not in response:
+ return response
+
+ job_id = response['id']
+ while True:
+ status = await self.check_generate_llms_text_status(job_id)
+
+ if status['status'] == 'completed':
+ return status
+ elif status['status'] == 'failed':
+ raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
+ elif status['status'] != 'processing':
+ break
+
+ await asyncio.sleep(2)
+
+ return GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly')
+
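A usage sketch for the blocking generator above (same import and placeholder-key assumptions; field access mirrors the documented response shape):

    import asyncio
    from firecrawl import AsyncFirecrawlApp

    async def main() -> None:
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
        result = await app.generate_llms_text(
            "https://example.com",
            max_urls=5,
            show_full_text=True,
        )
        if result.get("success"):
            print(result.get("data", {}).get("llmstxt", ""))
        else:
            print("generation failed:", result.get("error"))

    asyncio.run(main())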
+ async def async_generate_llms_text(
+ self,
+ url: str,
+ *,
+ max_urls: Optional[int] = None,
+ show_full_text: Optional[bool] = None,
+ cache: Optional[bool] = None,
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
+ """
+ Initiate an asynchronous LLMs.txt generation job without waiting for completion.
+
+ Args:
+ url (str): Target URL to generate LLMs.txt from
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
+ show_full_text (Optional[bool]): Include full text in output (default: False)
+ cache (Optional[bool]): Whether to use cached content if available (default: True)
+ experimental_stream (Optional[bool]): Enable experimental streaming
+
+ Returns:
+ GenerateLLMsTextResponse containing:
+ * success (bool): Whether job started successfully
+ * id (str): Unique identifier for the job
+ * error (str, optional): Error message if start failed
+
+ Raises:
+ ValueError: If job initiation fails
+ """
+ params = GenerateLLMsTextParams(
+ maxUrls=max_urls,
+ showFullText=show_full_text,
+ cache=cache,
+ __experimental_stream=experimental_stream
+ )
+
+ headers = self._prepare_headers()
+ json_data = {'url': url, **params.dict(by_alias=True, exclude_none=True)}
+ json_data['origin'] = f"python-sdk@{version}"
+
+ try:
+ return await self._async_post_request(
+ f'{self.api_url}/v1/llmstxt',
+ json_data,
+ headers
+ )
+ except Exception as e:
+ raise ValueError(str(e))
+
+ async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
+ """
+ Check the status of an asynchronous LLMs.txt generation job.
+
+ Args:
+ id (str): The ID of the generation job
+
+ Returns:
+ GenerateLLMsTextStatusResponse containing:
+ * success (bool): Whether generation completed successfully
+ * status (str): Status of generation (processing/completed/failed)
+ * data (Dict[str, str], optional): Generated text with fields:
+ - llmstxt (str): Generated LLMs.txt content
+ - llmsfulltxt (str, optional): Full version if requested
+ * error (str, optional): Error message if generation failed
+ * expiresAt (str): When the generated data expires
+
+ Raises:
+ ValueError: If status check fails
+ """
+ headers = self._prepare_headers()
+ try:
+ return await self._async_get_request(
+ f'{self.api_url}/v1/llmstxt/{id}',
+ headers
+ )
+ except Exception as e:
+ raise ValueError(str(e))
+
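When the caller wants to control polling itself, the start/status pair above can be combined like this (same assumptions as the earlier sketches):

    import asyncio
    from firecrawl import AsyncFirecrawlApp

    async def main() -> None:
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
        started = await app.async_generate_llms_text("https://example.com", max_urls=5)
        if not started.get("success"):
            raise RuntimeError(started.get("error", "failed to start LLMs.txt job"))
        while True:
            status = await app.check_generate_llms_text_status(started["id"])
            if status.get("status") != "processing":
                break
            await asyncio.sleep(5)
        print(status.get("status"), status.get("data", {}).get("llmstxt", ""))

    asyncio.run(main())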
+ async def deep_research(
+ self,
+ query: str,
+ *,
+ max_depth: Optional[int] = None,
+ time_limit: Optional[int] = None,
+ max_urls: Optional[int] = None,
+ analysis_prompt: Optional[str] = None,
+ system_prompt: Optional[str] = None,
+ __experimental_stream_steps: Optional[bool] = None,
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
+ """
+ Initiates a deep research operation on a given query and polls until completion.
+
+ Args:
+ query (str): Research query or topic to investigate
+ max_depth (Optional[int]): Maximum depth of research exploration
+ time_limit (Optional[int]): Time limit in seconds for research
+ max_urls (Optional[int]): Maximum number of URLs to process
+ analysis_prompt (Optional[str]): Custom prompt for analysis
+ system_prompt (Optional[str]): Custom system prompt
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
+
+ Returns:
+ DeepResearchStatusResponse containing:
+ * success (bool): Whether research completed successfully
+ * status (str): Current state (processing/completed/failed)
+ * error (Optional[str]): Error message if failed
+ * id (str): Unique identifier for the research job
+ * data (Any): Research findings and analysis
+ * sources (List[Dict]): List of discovered sources
+ * activities (List[Dict]): Research progress log
+ * summaries (List[str]): Generated research summaries
+
+ Raises:
+ Exception: If research fails
+ """
+ response = await self.async_deep_research(
+ query,
+ max_depth=max_depth,
+ time_limit=time_limit,
+ max_urls=max_urls,
+ analysis_prompt=analysis_prompt,
+ system_prompt=system_prompt
+ )
+ if not response.get('success') or 'id' not in response:
+ return response
+
+ job_id = response['id']
+ last_activity_count = 0
+ last_source_count = 0
+
+ while True:
+ status = await self.check_deep_research_status(job_id)
+
+ if on_activity and 'activities' in status:
+ new_activities = status['activities'][last_activity_count:]
+ for activity in new_activities:
+ on_activity(activity)
+ last_activity_count = len(status['activities'])
+
+ if on_source and 'sources' in status:
+ new_sources = status['sources'][last_source_count:]
+ for source in new_sources:
+ on_source(source)
+ last_source_count = len(status['sources'])
+
+ if status['status'] == 'completed':
+ return status
+ elif status['status'] == 'failed':
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
+ elif status['status'] != 'processing':
+ break
+
+ await asyncio.sleep(2)
+
+ return DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')
+
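A hedged example of the blocking research call with both progress callbacks wired up (same import and placeholder-key assumptions; the callback payload keys follow the docstring above):

    import asyncio
    from firecrawl import AsyncFirecrawlApp

    async def main() -> None:
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
        research = await app.deep_research(
            "Current best practices for respecting robots.txt at scale",
            max_depth=3,
            time_limit=120,
            on_activity=lambda activity: print("[activity]", activity.get("message")),
            on_source=lambda source: print("[source]", source.get("url")),
        )
        print(research.get("status"), research.get("data"))

    asyncio.run(main())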
+ async def async_deep_research(
+ self,
+ query: str,
+ *,
+ max_depth: Optional[int] = None,
+ time_limit: Optional[int] = None,
+ max_urls: Optional[int] = None,
+ analysis_prompt: Optional[str] = None,
+ system_prompt: Optional[str] = None,
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
+ """
+ Initiates an asynchronous deep research operation.
+
+ Args:
+ query (str): Research query or topic to investigate
+ max_depth (Optional[int]): Maximum depth of research exploration
+ time_limit (Optional[int]): Time limit in seconds for research
+ max_urls (Optional[int]): Maximum number of URLs to process
+ analysis_prompt (Optional[str]): Custom prompt for analysis
+ system_prompt (Optional[str]): Custom system prompt
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
+
+ Returns:
+ Dict[str, Any]: A response containing:
+ * success (bool): Whether the research initiation was successful
+ * id (str): The unique identifier for the research job
+ * error (str, optional): Error message if initiation failed
+
+ Raises:
+ ValueError: If the research initiation fails.
+ """
+ research_params = {}
+ if max_depth is not None:
+ research_params['maxDepth'] = max_depth
+ if time_limit is not None:
+ research_params['timeLimit'] = time_limit
+ if max_urls is not None:
+ research_params['maxUrls'] = max_urls
+ if analysis_prompt is not None:
+ research_params['analysisPrompt'] = analysis_prompt
+ if system_prompt is not None:
+ research_params['systemPrompt'] = system_prompt
+ if __experimental_stream_steps is not None:
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
+ research_params = DeepResearchParams(**research_params)
+
+ headers = self._prepare_headers()
+
+ json_data = {'query': query, **research_params.dict(by_alias=True, exclude_none=True)}
+ json_data['origin'] = f"python-sdk@{version}"
+
+ try:
+ return await self._async_post_request(
+ f'{self.api_url}/v1/deep-research',
+ json_data,
+ headers
+ )
+ except Exception as e:
+ raise ValueError(str(e))
+
+ async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
+ """
+ Check the status of a deep research operation.
+
+ Args:
+ id (str): The ID of the deep research operation.
+
+ Returns:
+ DeepResearchStatusResponse containing:
+
+ Status:
+ * success - Whether research completed successfully
+ * status - Current state (processing/completed/failed)
+ * error - Error message if failed
+
+ Results:
+ * id - Unique identifier for the research job
+ * data - Research findings and analysis
+ * sources - List of discovered sources
+ * activities - Research progress log
+ * summaries - Generated research summaries
+
+ Raises:
+ ValueError: If the status check fails.
+ """
+ headers = self._prepare_headers()
+ try:
+ return await self._async_get_request(
+ f'{self.api_url}/v1/deep-research/{id}',
+ headers
+ )
+ except Exception as e:
+ raise ValueError(str(e))
+
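And the equivalent manual start/poll loop using the non-blocking pair above (same assumptions):

    import asyncio
    from firecrawl import AsyncFirecrawlApp

    async def main() -> None:
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
        started = await app.async_deep_research("How do crawlers detect duplicate pages?", max_urls=10)
        if not started.get("success"):
            raise RuntimeError(started.get("error", "failed to start research"))
        while True:
            status = await app.check_deep_research_status(started["id"])
            if status.get("status") != "processing":
                break
            await asyncio.sleep(5)
        print(status.get("status"), len(status.get("sources", [])), "sources")

    asyncio.run(main())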
+ async def search(
+ self,
+ query: str,
+ *,
+ limit: Optional[int] = None,
+ tbs: Optional[str] = None,
+ filter: Optional[str] = None,
+ lang: Optional[str] = None,
+ country: Optional[str] = None,
+ location: Optional[str] = None,
+ timeout: Optional[int] = 30000,
+ scrape_options: Optional[ScrapeOptions] = None,
+ params: Optional[Union[Dict[str, Any], SearchParams]] = None,
+ **kwargs) -> SearchResponse:
+ """
+ Asynchronously search for content using Firecrawl.
+
+ Args:
+ query (str): Search query string
+ limit (Optional[int]): Max results (default: 5)
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
+ filter (Optional[str]): Custom result filter
+ lang (Optional[str]): Language code (default: "en")
+ country (Optional[str]): Country code (default: "us")
+ location (Optional[str]): Geo-targeting
+ timeout (Optional[int]): Request timeout in milliseconds
+ scrape_options (Optional[ScrapeOptions]): Result scraping configuration
+ params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
+ **kwargs: Additional keyword arguments for future compatibility
+
+ Returns:
+ SearchResponse: Response containing:
+ * success (bool): Whether request succeeded
+ * data (List[FirecrawlDocument]): Search results
+ * warning (Optional[str]): Warning message if any
+ * error (Optional[str]): Error message if any
+
+ Raises:
+ Exception: If search fails or response cannot be parsed
+ """
+ # Build search parameters
+ search_params = {}
+ if params:
+ if isinstance(params, dict):
+ search_params.update(params)
+ else:
+ search_params.update(params.dict(by_alias=True, exclude_none=True))
+
+ # Add individual parameters
+ if limit is not None:
+ search_params['limit'] = limit
+ if tbs is not None:
+ search_params['tbs'] = tbs
+ if filter is not None:
+ search_params['filter'] = filter
+ if lang is not None:
+ search_params['lang'] = lang
+ if country is not None:
+ search_params['country'] = country
+ if location is not None:
+ search_params['location'] = location
+ if timeout is not None:
+ search_params['timeout'] = timeout
+ if scrape_options is not None:
+ search_params['scrapeOptions'] = scrape_options.dict(by_alias=True, exclude_none=True)
+
+ # Add any additional kwargs
+ search_params.update(kwargs)
+
+ # Create final params object
+ final_params = SearchParams(query=query, **search_params)
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
+ params_dict['origin'] = f"python-sdk@{version}"
+
+ return await self._async_post_request(
+ f"{self.api_url}/v1/search",
+ params_dict,
+ {"Authorization": f"Bearer {self.api_key}"}
+ )
+
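A usage sketch for the async search above (assumptions: ScrapeOptions is exported by the package and accepts a formats field; results come back as the documented list of documents):

    import asyncio
    from firecrawl import AsyncFirecrawlApp, ScrapeOptions  # ScrapeOptions export is an assumption

    async def main() -> None:
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
        results = await app.search(
            "firecrawl python sdk",
            limit=3,
            tbs="qdr:w",  # restrict results to the past week
            scrape_options=ScrapeOptions(formats=["markdown"]),  # 'formats' field assumed
        )
        for doc in results.get("data", []):
            print(doc.get("url"), "-", doc.get("title"))

    asyncio.run(main())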
+ class AsyncCrawlWatcher(CrawlWatcher):
+ """
+ Async version of CrawlWatcher that properly handles async operations.
+ """
+ def __init__(self, id: str, app: AsyncFirecrawlApp):
+ super().__init__(id, app)
+
+ async def connect(self) -> None:
+ """
+ Establishes async WebSocket connection and starts listening for messages.
+ """
+ async with websockets.connect(
+ self.ws_url,
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
+ ) as websocket:
+ await self._listen(websocket)
+
+ async def _listen(self, websocket) -> None:
+ """
+ Listens for incoming WebSocket messages and handles them asynchronously.
+
+ Args:
+ websocket: The WebSocket connection object
+ """
+ async for message in websocket:
+ msg = json.loads(message)
+ await self._handle_message(msg)
+
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
+ """
+ Handles incoming WebSocket messages based on their type asynchronously.
+
+ Args:
+ msg (Dict[str, Any]): The message to handle
+ """
+ if msg['type'] == 'done':
+ self.status = 'completed'
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
+ elif msg['type'] == 'error':
+ self.status = 'failed'
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
+ elif msg['type'] == 'catchup':
+ self.status = msg['data']['status']
+ self.data.extend(msg['data'].get('data', []))
+ for doc in self.data:
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
+ elif msg['type'] == 'document':
+ self.data.append(msg['data'])
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
+
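A hedged sketch of driving the watcher (assumptions: the async crawl starter defined earlier in this module is named async_crawl_url and returns a job id, the base CrawlWatcher exposes add_event_listener, and AsyncCrawlWatcher is importable from the same module):

    import asyncio
    from firecrawl import AsyncFirecrawlApp
    from firecrawl.firecrawl import AsyncCrawlWatcher  # import path is an assumption

    async def main() -> None:
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
        crawl = await app.async_crawl_url("https://example.com")  # assumed starter returning {'id': ...}
        watcher = AsyncCrawlWatcher(crawl["id"], app)
        # Event names mirror the dispatch_event calls in _handle_message above.
        watcher.add_event_listener("document", lambda event: print("document received:", event))
        watcher.add_event_listener("done", lambda event: print("crawl finished"))
        await watcher.connect()

    asyncio.run(main())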
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
+ """
+ Handle errors from async API responses.
+ """
+ try:
+ error_data = await response.json()
+ error_message = error_data.get('error', 'No error message provided.')
+ error_details = error_data.get('details', 'No additional error details provided.')
+ except Exception:
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
+
+ # Use the app's method to get the error message
+ message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
+
+ raise aiohttp.ClientError(message)
+
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
+ """
+ Generate a standardized error message based on HTTP status code for async operations.
+
+ Args:
+ status_code (int): The HTTP status code from the response
+ action (str): Description of the action that was being performed
+ error_message (str): The error message from the API response
+ error_details (str): Additional error details from the API response
+
+ Returns:
+ str: A formatted error message
+ """
+ return self._get_error_message(status_code, action, error_message, error_details)