firecrawl 2.16.5__py3-none-any.whl → 3.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release.


This version of firecrawl might be problematic.

Files changed (82)
  1. firecrawl/__init__.py +27 -19
  2. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +183 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  10. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  11. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
  13. firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
  14. firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
  15. firecrawl/__tests__/e2e/v2/test_map.py +60 -0
  16. firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
  17. firecrawl/__tests__/e2e/v2/test_search.py +265 -0
  18. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  19. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  20. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  21. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  28. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  29. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  30. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  31. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  32. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
  33. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
  34. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
  35. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +206 -0
  36. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  37. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  38. firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
  39. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  40. firecrawl/client.py +241 -0
  41. firecrawl/{firecrawl.py → firecrawl.backup.py} +17 -15
  42. firecrawl/types.py +157 -0
  43. firecrawl/v1/__init__.py +14 -0
  44. firecrawl/v1/client.py +4653 -0
  45. firecrawl/v2/__init__.py +4 -0
  46. firecrawl/v2/client.py +802 -0
  47. firecrawl/v2/client_async.py +250 -0
  48. firecrawl/v2/methods/aio/__init__.py +1 -0
  49. firecrawl/v2/methods/aio/batch.py +85 -0
  50. firecrawl/v2/methods/aio/crawl.py +174 -0
  51. firecrawl/v2/methods/aio/extract.py +126 -0
  52. firecrawl/v2/methods/aio/map.py +59 -0
  53. firecrawl/v2/methods/aio/scrape.py +36 -0
  54. firecrawl/v2/methods/aio/search.py +58 -0
  55. firecrawl/v2/methods/aio/usage.py +42 -0
  56. firecrawl/v2/methods/batch.py +420 -0
  57. firecrawl/v2/methods/crawl.py +468 -0
  58. firecrawl/v2/methods/extract.py +131 -0
  59. firecrawl/v2/methods/map.py +77 -0
  60. firecrawl/v2/methods/scrape.py +68 -0
  61. firecrawl/v2/methods/search.py +173 -0
  62. firecrawl/v2/methods/usage.py +41 -0
  63. firecrawl/v2/types.py +546 -0
  64. firecrawl/v2/utils/__init__.py +9 -0
  65. firecrawl/v2/utils/error_handler.py +107 -0
  66. firecrawl/v2/utils/get_version.py +15 -0
  67. firecrawl/v2/utils/http_client.py +153 -0
  68. firecrawl/v2/utils/http_client_async.py +64 -0
  69. firecrawl/v2/utils/validation.py +324 -0
  70. firecrawl/v2/watcher.py +312 -0
  71. firecrawl/v2/watcher_async.py +245 -0
  72. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/LICENSE +0 -0
  73. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/METADATA +49 -32
  74. firecrawl-3.0.3.dist-info/RECORD +78 -0
  75. tests/test_timeout_conversion.py +117 -0
  76. firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  77. firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  78. firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  79. firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -465
  80. firecrawl-2.16.5.dist-info/RECORD +0 -12
  81. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/WHEEL +0 -0
  82. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/top_level.txt +0 -0
firecrawl/v1/client.py ADDED
@@ -0,0 +1,4653 @@
1
+ """
2
+ Firecrawl v1 API Client - Legacy Implementation
3
+
4
+ This module provides the legacy v1 implementation of the Firecrawl SDK.
5
+ It contains the complete `V1FirecrawlApp` class with all v1 API methods and types
6
+ for backward compatibility. This is used by the unified client to provide
7
+ version-specific access patterns like app.v1.scrape_url().
8
+
9
+ Classes:
10
+ - V1FirecrawlApp: Legacy v1 client for interacting with the Firecrawl API.
11
+ - AsyncV1FirecrawlApp: Async version of the v1 client.
12
+ - CrawlWatcher: WebSocket-based crawl monitoring for v1.
13
+ """
14
+ import logging
15
+ import os
16
+ import time
17
+ from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
18
+ import json
19
+ from datetime import datetime
20
+ import re
21
+ import requests
22
+ import pydantic
23
+ import websockets
24
+ import aiohttp
25
+ import asyncio
26
+
27
+ logger : logging.Logger = logging.getLogger("firecrawl")
28
+
29
+ def get_version():
30
+ try:
31
+ from pathlib import Path
32
+ package_path = os.path.dirname(__file__)
33
+ version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
34
+ version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
35
+ if version_match:
36
+ return version_match.group(1).strip()
37
+ except Exception:
38
+ print("Failed to get version from __init__.py")
39
+ return None
40
+
41
+ version = get_version()
42
+
43
+ T = TypeVar('T')
44
+
45
+ # class V1FirecrawlDocumentMetadata(pydantic.BaseModel):
46
+ # """Metadata for a Firecrawl document."""
47
+ # title: Optional[str] = None
48
+ # description: Optional[str] = None
49
+ # language: Optional[str] = None
50
+ # keywords: Optional[str] = None
51
+ # robots: Optional[str] = None
52
+ # ogTitle: Optional[str] = None
53
+ # ogDescription: Optional[str] = None
54
+ # ogUrl: Optional[str] = None
55
+ # ogImage: Optional[str] = None
56
+ # ogAudio: Optional[str] = None
57
+ # ogDeterminer: Optional[str] = None
58
+ # ogLocale: Optional[str] = None
59
+ # ogLocaleAlternate: Optional[List[str]] = None
60
+ # ogSiteName: Optional[str] = None
61
+ # ogVideo: Optional[str] = None
62
+ # dctermsCreated: Optional[str] = None
63
+ # dcDateCreated: Optional[str] = None
64
+ # dcDate: Optional[str] = None
65
+ # dctermsType: Optional[str] = None
66
+ # dcType: Optional[str] = None
67
+ # dctermsAudience: Optional[str] = None
68
+ # dctermsSubject: Optional[str] = None
69
+ # dcSubject: Optional[str] = None
70
+ # dcDescription: Optional[str] = None
71
+ # dctermsKeywords: Optional[str] = None
72
+ # modifiedTime: Optional[str] = None
73
+ # publishedTime: Optional[str] = None
74
+ # articleTag: Optional[str] = None
75
+ # articleSection: Optional[str] = None
76
+ # sourceURL: Optional[str] = None
77
+ # statusCode: Optional[int] = None
78
+ # error: Optional[str] = None
79
+
80
+ class V1AgentOptions(pydantic.BaseModel):
81
+ """Configuration for the agent."""
82
+ model: Literal["FIRE-1"] = "FIRE-1"
83
+ prompt: Optional[str] = None
84
+
85
+ class V1AgentOptionsExtract(pydantic.BaseModel):
86
+ """Configuration for the agent in extract operations."""
87
+ model: Literal["FIRE-1"] = "FIRE-1"
88
+
89
+ class V1ActionsResult(pydantic.BaseModel):
90
+ """Result of actions performed during scraping."""
91
+ screenshots: List[str]
92
+ pdfs: List[str]
93
+
94
+ class V1ChangeTrackingData(pydantic.BaseModel):
95
+ """
96
+ Data for the change tracking format.
97
+ """
98
+ previousScrapeAt: Optional[str] = None
99
+ changeStatus: str # "new" | "same" | "changed" | "removed"
100
+ visibility: str # "visible" | "hidden"
101
+ diff: Optional[Dict[str, Any]] = None
102
+ json_field: Optional[Any] = pydantic.Field(None, alias='json')
103
+
104
+ class V1FirecrawlDocument(pydantic.BaseModel, Generic[T]):
105
+ """Document retrieved or processed by Firecrawl."""
106
+ url: Optional[str] = None
107
+ markdown: Optional[str] = None
108
+ html: Optional[str] = None
109
+ rawHtml: Optional[str] = None
110
+ links: Optional[List[str]] = None
111
+ extract: Optional[T] = None
112
+ json_field: Optional[T] = pydantic.Field(None, alias='json')
113
+ screenshot: Optional[str] = None
114
+ metadata: Optional[Any] = None
115
+ actions: Optional[V1ActionsResult] = None
116
+ title: Optional[str] = None # v1 search only
117
+ description: Optional[str] = None # v1 search only
118
+ changeTracking: Optional[V1ChangeTrackingData] = None
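
The `alias='json'` fields above exist because the API returns a literal "json" key; below is a small sketch of how such a payload maps onto the model (editor's annotation; the payload is illustrative and assumes pydantic's default populate-by-alias behaviour):

    from firecrawl.v1.client import V1FirecrawlDocument

    payload = {"url": "https://example.com", "markdown": "# Hi", "json": {"title": "Hi"}}
    doc = V1FirecrawlDocument(**payload)
    print(doc.json_field)  # {'title': 'Hi'} (filled via the 'json' alias)
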
119
+
120
+ class V1LocationConfig(pydantic.BaseModel):
121
+ """Location configuration for scraping."""
122
+ country: Optional[str] = None
123
+ languages: Optional[List[str]] = None
124
+
125
+ class V1WebhookConfig(pydantic.BaseModel):
126
+ """Configuration for webhooks."""
127
+ url: str
128
+ headers: Optional[Dict[str, str]] = None
129
+ metadata: Optional[Dict[str, str]] = None
130
+ events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
131
+
132
+ class V1ChangeTrackingOptions(pydantic.BaseModel):
133
+ """Configuration for change tracking."""
134
+ modes: Optional[List[Literal["git-diff", "json"]]] = None
135
+ schema_field: Optional[Any] = pydantic.Field(None, alias='schema')
136
+ prompt: Optional[str] = None
137
+ tag: Optional[str] = None
138
+
139
+ class V1ScrapeOptions(pydantic.BaseModel):
140
+ """Parameters for scraping operations."""
141
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
142
+ headers: Optional[Dict[str, str]] = None
143
+ includeTags: Optional[List[str]] = None
144
+ excludeTags: Optional[List[str]] = None
145
+ onlyMainContent: Optional[bool] = None
146
+ waitFor: Optional[int] = None
147
+ timeout: Optional[int] = 30000
148
+ location: Optional[V1LocationConfig] = None
149
+ mobile: Optional[bool] = None
150
+ skipTlsVerification: Optional[bool] = None
151
+ removeBase64Images: Optional[bool] = None
152
+ blockAds: Optional[bool] = None
153
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None
154
+ changeTrackingOptions: Optional[V1ChangeTrackingOptions] = None
155
+ maxAge: Optional[int] = None
156
+ storeInCache: Optional[bool] = None
157
+ parsePDF: Optional[bool] = None
158
+
159
+ class V1WaitAction(pydantic.BaseModel):
160
+ """Wait action to perform during scraping."""
161
+ type: Literal["wait"]
162
+ milliseconds: Optional[int] = None
163
+ selector: Optional[str] = None
164
+
165
+ class V1ScreenshotAction(pydantic.BaseModel):
166
+ """Screenshot action to perform during scraping."""
167
+ type: Literal["screenshot"]
168
+ fullPage: Optional[bool] = None
169
+ quality: Optional[int] = None
170
+
171
+ class V1ClickAction(pydantic.BaseModel):
172
+ """Click action to perform during scraping."""
173
+ type: Literal["click"]
174
+ selector: str
175
+
176
+ class V1WriteAction(pydantic.BaseModel):
177
+ """Write action to perform during scraping."""
178
+ type: Literal["write"]
179
+ text: str
180
+
181
+ class V1PressAction(pydantic.BaseModel):
182
+ """Press action to perform during scraping."""
183
+ type: Literal["press"]
184
+ key: str
185
+
186
+ class V1ScrollAction(pydantic.BaseModel):
187
+ """Scroll action to perform during scraping."""
188
+ type: Literal["scroll"]
189
+ direction: Literal["up", "down"]
190
+ selector: Optional[str] = None
191
+
192
+ class V1ScrapeAction(pydantic.BaseModel):
193
+ """Scrape action to perform during scraping."""
194
+ type: Literal["scrape"]
195
+
196
+ class V1ExecuteJavascriptAction(pydantic.BaseModel):
197
+ """Execute javascript action to perform during scraping."""
198
+ type: Literal["executeJavascript"]
199
+ script: str
200
+
201
+ class V1PDFAction(pydantic.BaseModel):
202
+ """PDF action to perform during scraping."""
203
+ type: Literal["pdf"]
204
+ format: Optional[Literal["A0", "A1", "A2", "A3", "A4", "A5", "A6", "Letter", "Legal", "Tabloid", "Ledger"]] = None
205
+ landscape: Optional[bool] = None
206
+ scale: Optional[float] = None
207
+
208
+ class V1ExtractAgent(pydantic.BaseModel):
209
+ """Configuration for the agent in extract operations."""
210
+ model: Literal["FIRE-1"] = "FIRE-1"
211
+
212
+ class V1JsonConfig(pydantic.BaseModel):
213
+ """Configuration for extraction."""
214
+ prompt: Optional[str] = None
215
+ schema_field: Optional[Any] = pydantic.Field(None, alias='schema')
216
+ systemPrompt: Optional[str] = None
217
+ agent: Optional[V1ExtractAgent] = None
218
+
219
+ class V1ScrapeParams(V1ScrapeOptions):
220
+ """Parameters for scraping operations."""
221
+ extract: Optional[V1JsonConfig] = None
222
+ jsonOptions: Optional[V1JsonConfig] = None
223
+ actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None
224
+ agent: Optional[V1AgentOptions] = None
225
+ webhook: Optional[V1WebhookConfig] = None
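
A short sketch of building these option models and serializing them the way the client methods below do, via .dict(by_alias=True, exclude_none=True) (editor's annotation; field values are illustrative):

    from firecrawl.v1.client import V1ScrapeOptions, V1WaitAction, V1ClickAction

    opts = V1ScrapeOptions(formats=["markdown", "links"], onlyMainContent=True, timeout=30000)
    actions = [V1WaitAction(type="wait", milliseconds=500),
               V1ClickAction(type="click", selector="#load-more")]
    payload = {
        "url": "https://example.com",
        **opts.dict(by_alias=True, exclude_none=True),
        "actions": [a.dict(by_alias=True, exclude_none=True) for a in actions],
    }
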
226
+
227
+ class V1ScrapeResponse(V1FirecrawlDocument[T], Generic[T]):
228
+ """Response from scraping operations."""
229
+ success: bool = True
230
+ warning: Optional[str] = None
231
+ error: Optional[str] = None
232
+
233
+ class V1BatchScrapeResponse(pydantic.BaseModel):
234
+ """Response from batch scrape operations."""
235
+ id: Optional[str] = None
236
+ url: Optional[str] = None
237
+ success: bool = True
238
+ error: Optional[str] = None
239
+ invalidURLs: Optional[List[str]] = None
240
+
241
+ class V1BatchScrapeStatusResponse(pydantic.BaseModel):
242
+ """Response from batch scrape status checks."""
243
+ success: bool = True
244
+ status: Literal["scraping", "completed", "failed", "cancelled"]
245
+ completed: int
246
+ total: int
247
+ creditsUsed: int
248
+ expiresAt: datetime
249
+ next: Optional[str] = None
250
+ data: List[V1FirecrawlDocument]
251
+
252
+ class V1CrawlParams(pydantic.BaseModel):
253
+ """Parameters for crawling operations."""
254
+ includePaths: Optional[List[str]] = None
255
+ excludePaths: Optional[List[str]] = None
256
+ maxDepth: Optional[int] = None
257
+ maxDiscoveryDepth: Optional[int] = None
258
+ limit: Optional[int] = None
259
+ allowBackwardLinks: Optional[bool] = None
260
+ crawlEntireDomain: Optional[bool] = None
261
+ allowExternalLinks: Optional[bool] = None
262
+ ignoreSitemap: Optional[bool] = None
263
+ scrapeOptions: Optional[V1ScrapeOptions] = None
264
+ webhook: Optional[Union[str, V1WebhookConfig]] = None
265
+ deduplicateSimilarURLs: Optional[bool] = None
266
+ ignoreQueryParameters: Optional[bool] = None
267
+ regexOnFullURL: Optional[bool] = None
268
+ delay: Optional[int] = None # Delay in seconds between scrapes
269
+ maxConcurrency: Optional[int] = None
270
+ allowSubdomains: Optional[bool] = None
271
+
272
+ class V1CrawlResponse(pydantic.BaseModel):
273
+ """Response from crawling operations."""
274
+ id: Optional[str] = None
275
+ url: Optional[str] = None
276
+ success: bool = True
277
+ error: Optional[str] = None
278
+
279
+ class V1CrawlStatusResponse(pydantic.BaseModel):
280
+ """Response from crawl status checks."""
281
+ success: bool = True
282
+ status: Literal["scraping", "completed", "failed", "cancelled"]
283
+ completed: int
284
+ total: int
285
+ creditsUsed: int
286
+ expiresAt: datetime
287
+ next: Optional[str] = None
288
+ data: List[V1FirecrawlDocument]
289
+
290
+ class V1CrawlError(pydantic.BaseModel):
291
+ """A crawl error."""
292
+ id: str
293
+ timestamp: Optional[datetime] = None
294
+ url: str
295
+ code: Optional[str] = None
296
+ error: str
297
+
298
+ class V1CrawlErrorsResponse(pydantic.BaseModel):
299
+ """Response from crawl/batch scrape error monitoring."""
300
+ errors: List[V1CrawlError]
301
+ robotsBlocked: List[str]
302
+
303
+ class V1MapParams(pydantic.BaseModel):
304
+ """Parameters for mapping operations."""
305
+ search: Optional[str] = None
306
+ ignoreSitemap: Optional[bool] = None
307
+ includeSubdomains: Optional[bool] = None
308
+ sitemapOnly: Optional[bool] = None
309
+ limit: Optional[int] = None
310
+ timeout: Optional[int] = 30000
311
+ useIndex: Optional[bool] = None
312
+
313
+ class V1MapResponse(pydantic.BaseModel):
314
+ """Response from mapping operations."""
315
+ success: bool = True
316
+ links: Optional[List[str]] = None
317
+ error: Optional[str] = None
318
+
319
+ class V1ExtractParams(pydantic.BaseModel):
320
+ """Parameters for extracting information from URLs."""
321
+ prompt: Optional[str] = None
322
+ schema_field: Optional[Any] = pydantic.Field(None, alias='schema')
323
+ systemPrompt: Optional[str] = None
324
+ allowExternalLinks: Optional[bool] = None
325
+ enableWebSearch: Optional[bool] = None
326
+ includeSubdomains: Optional[bool] = None
327
+ origin: Optional[str] = None
328
+ showSources: Optional[bool] = None
329
+ scrapeOptions: Optional[V1ScrapeOptions] = None
330
+
331
+ class V1ExtractResponse(pydantic.BaseModel, Generic[T]):
332
+ """Response from extract operations."""
333
+ id: Optional[str] = None
334
+ status: Optional[Literal["processing", "completed", "failed"]] = None
335
+ expiresAt: Optional[datetime] = None
336
+ success: bool = True
337
+ data: Optional[T] = None
338
+ error: Optional[str] = None
339
+ warning: Optional[str] = None
340
+ sources: Optional[Dict[Any, Any]] = None
341
+
342
+ class V1SearchParams(pydantic.BaseModel):
343
+ query: str
344
+ limit: Optional[int] = 5
345
+ tbs: Optional[str] = None
346
+ filter: Optional[str] = None
347
+ lang: Optional[str] = "en"
348
+ country: Optional[str] = "us"
349
+ location: Optional[str] = None
350
+ origin: Optional[str] = "api"
351
+ timeout: Optional[int] = 60000
352
+ scrapeOptions: Optional[V1ScrapeOptions] = None
353
+
354
+ class V1SearchResponse(pydantic.BaseModel):
355
+ """Response from search operations."""
356
+ success: bool = True
357
+ data: List[V1FirecrawlDocument]
358
+ warning: Optional[str] = None
359
+ error: Optional[str] = None
360
+
361
+ class V1GenerateLLMsTextParams(pydantic.BaseModel):
362
+ """
363
+ Parameters for the LLMs.txt generation operation.
364
+ """
365
+ maxUrls: Optional[int] = 10
366
+ showFullText: Optional[bool] = False
367
+ cache: Optional[bool] = True
368
+ __experimental_stream: Optional[bool] = None
369
+
370
+ class V1DeepResearchParams(pydantic.BaseModel):
371
+ """
372
+ Parameters for the deep research operation.
373
+ """
374
+ maxDepth: Optional[int] = 7
375
+ timeLimit: Optional[int] = 270
376
+ maxUrls: Optional[int] = 20
377
+ analysisPrompt: Optional[str] = None
378
+ systemPrompt: Optional[str] = None
379
+ __experimental_streamSteps: Optional[bool] = None
380
+
381
+ class V1DeepResearchResponse(pydantic.BaseModel):
382
+ """
383
+ Response from the deep research operation.
384
+ """
385
+ success: bool
386
+ id: str
387
+ error: Optional[str] = None
388
+
389
+ class V1DeepResearchStatusResponse(pydantic.BaseModel):
390
+ """
391
+ Status response from the deep research operation.
392
+ """
393
+ success: bool
394
+ data: Optional[Dict[str, Any]] = None
395
+ status: str
396
+ error: Optional[str] = None
397
+ expiresAt: str
398
+ currentDepth: int
399
+ maxDepth: int
400
+ activities: List[Dict[str, Any]]
401
+ sources: List[Dict[str, Any]]
402
+ summaries: List[str]
403
+
404
+ class V1GenerateLLMsTextResponse(pydantic.BaseModel):
405
+ """Response from LLMs.txt generation operations."""
406
+ success: bool = True
407
+ id: str
408
+ error: Optional[str] = None
409
+
410
+ class V1GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
411
+ llmstxt: str
412
+ llmsfulltxt: Optional[str] = None
413
+
414
+ class V1GenerateLLMsTextStatusResponse(pydantic.BaseModel):
415
+ """Status response from LLMs.txt generation operations."""
416
+ success: bool = True
417
+ data: Optional[V1GenerateLLMsTextStatusResponseData] = None
418
+ status: Literal["processing", "completed", "failed"]
419
+ error: Optional[str] = None
420
+ expiresAt: str
421
+
422
+ class V1SearchResponse(pydantic.BaseModel):
423
+ """
424
+ Response from the search operation.
425
+ """
426
+ success: bool
427
+ data: List[Dict[str, Any]]
428
+ warning: Optional[str] = None
429
+ error: Optional[str] = None
430
+
431
+ class V1ExtractParams(pydantic.BaseModel):
432
+ """
433
+ Parameters for the extract operation.
434
+ """
435
+ prompt: Optional[str] = None
436
+ schema_field: Optional[Any] = pydantic.Field(None, alias='schema')
437
+ system_prompt: Optional[str] = None
438
+ allow_external_links: Optional[bool] = False
439
+ enable_web_search: Optional[bool] = False
440
+ # Just for backwards compatibility
441
+ enableWebSearch: Optional[bool] = False
442
+ show_sources: Optional[bool] = False
443
+ agent: Optional[Dict[str, Any]] = None
444
+
445
+ class V1FirecrawlApp:
446
+ """
447
+ Legacy v1 Firecrawl client for backward compatibility.
448
+
449
+ This class provides the complete v1 API implementation including:
450
+ - URL scraping with various formats and options
451
+ - Website crawling with monitoring capabilities
452
+ - Batch scraping operations
453
+ - Search functionality
454
+ - Data extraction with LLM integration
455
+ - Deep research capabilities
456
+ - LLMs.txt generation
457
+
458
+ This is used by the unified client to provide version-specific access
459
+ through app.v1.method_name() patterns.
460
+ """
461
+ def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
462
+ """
463
+ Initialize the V1FirecrawlApp instance with API key, API URL.
464
+
465
+ Args:
466
+ api_key (Optional[str]): API key for authenticating with the Firecrawl API.
467
+ api_url (Optional[str]): Base URL for the Firecrawl API.
468
+ """
469
+ self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
470
+ self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
471
+
472
+ # Only require API key when using cloud service
473
+ if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
474
+ logger.warning("No API key provided for cloud service")
475
+ raise ValueError('No API key provided')
476
+
477
+ logger.debug(f"Initialized V1FirecrawlApp with API URL: {self.api_url}")
478
+
479
+ def scrape_url(
480
+ self,
481
+ url: str,
482
+ *,
483
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
484
+ headers: Optional[Dict[str, str]] = None,
485
+ include_tags: Optional[List[str]] = None,
486
+ exclude_tags: Optional[List[str]] = None,
487
+ only_main_content: Optional[bool] = None,
488
+ wait_for: Optional[int] = None,
489
+ timeout: Optional[int] = 30000,
490
+ location: Optional[V1LocationConfig] = None,
491
+ mobile: Optional[bool] = None,
492
+ skip_tls_verification: Optional[bool] = None,
493
+ remove_base64_images: Optional[bool] = None,
494
+ block_ads: Optional[bool] = None,
495
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
496
+ parse_pdf: Optional[bool] = None,
497
+ extract: Optional[V1JsonConfig] = None,
498
+ json_options: Optional[V1JsonConfig] = None,
499
+ actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
500
+ change_tracking_options: Optional[V1ChangeTrackingOptions] = None,
501
+ max_age: Optional[int] = None,
502
+ store_in_cache: Optional[bool] = None,
503
+ zero_data_retention: Optional[bool] = None,
504
+ **kwargs) -> V1ScrapeResponse[Any]:
505
+ """
506
+ Scrape and extract content from a URL.
507
+
508
+ Args:
509
+ url (str): Target URL to scrape
510
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
511
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
512
+ include_tags (Optional[List[str]]): HTML tags to include
513
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
514
+ only_main_content (Optional[bool]): Extract main content only
515
+ wait_for (Optional[int]): Wait for a specific element to appear
516
+ timeout (Optional[int]): Request timeout (ms)
517
+ location (Optional[LocationConfig]): Location configuration
518
+ mobile (Optional[bool]): Use mobile user agent
519
+ skip_tls_verification (Optional[bool]): Skip TLS verification
520
+ remove_base64_images (Optional[bool]): Remove base64 images
521
+ block_ads (Optional[bool]): Block ads
522
+ proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
523
+ extract (Optional[JsonConfig]): Content extraction settings
524
+ json_options (Optional[JsonConfig]): JSON extraction settings
525
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
526
+ change_tracking_options (Optional[ChangeTrackingOptions]): Change tracking settings
527
+ zero_data_retention (Optional[bool]): Whether to delete data after scrape is done
528
+
529
+
530
+ Returns:
531
+ ScrapeResponse with:
532
+ * Requested content formats
533
+ * Page metadata
534
+ * Extraction results
535
+ * Success/error status
536
+
537
+ Raises:
538
+ Exception: If scraping fails
539
+ """
540
+ # Validate any additional kwargs
541
+ self._validate_kwargs(kwargs, "scrape_url")
542
+
543
+ _headers = self._prepare_headers()
544
+
545
+ # Build scrape parameters
546
+ scrape_params = {
547
+ 'url': url,
548
+ 'origin': f"python-sdk@{version}"
549
+ }
550
+
551
+ # Add optional parameters if provided
552
+ if formats:
553
+ scrape_params['formats'] = formats
554
+ if headers:
555
+ scrape_params['headers'] = headers
556
+ if include_tags:
557
+ scrape_params['includeTags'] = include_tags
558
+ if exclude_tags:
559
+ scrape_params['excludeTags'] = exclude_tags
560
+ if only_main_content is not None:
561
+ scrape_params['onlyMainContent'] = only_main_content
562
+ if wait_for:
563
+ scrape_params['waitFor'] = wait_for
564
+ if timeout:
565
+ scrape_params['timeout'] = timeout
566
+ if location:
567
+ scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
568
+ if mobile is not None:
569
+ scrape_params['mobile'] = mobile
570
+ if skip_tls_verification is not None:
571
+ scrape_params['skipTlsVerification'] = skip_tls_verification
572
+ if remove_base64_images is not None:
573
+ scrape_params['removeBase64Images'] = remove_base64_images
574
+ if block_ads is not None:
575
+ scrape_params['blockAds'] = block_ads
576
+ if proxy:
577
+ scrape_params['proxy'] = proxy
578
+ if parse_pdf is not None:
579
+ scrape_params['parsePDF'] = parse_pdf
580
+ if extract is not None:
581
+ extract = self._ensure_schema_dict(extract)
582
+ if isinstance(extract, dict) and "schema" in extract:
583
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
584
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
585
+ if json_options is not None:
586
+ json_options = self._ensure_schema_dict(json_options)
587
+ if isinstance(json_options, dict) and "schema" in json_options:
588
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
589
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
590
+ if actions:
591
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(by_alias=True, exclude_none=True) for action in actions]
592
+ if change_tracking_options:
593
+ scrape_params['changeTrackingOptions'] = change_tracking_options if isinstance(change_tracking_options, dict) else change_tracking_options.dict(by_alias=True, exclude_none=True)
594
+ if max_age is not None:
595
+ scrape_params['maxAge'] = max_age
596
+ if store_in_cache is not None:
597
+ scrape_params['storeInCache'] = store_in_cache
598
+ if zero_data_retention is not None:
599
+ scrape_params['zeroDataRetention'] = zero_data_retention
600
+
601
+ scrape_params.update(kwargs)
602
+
603
+ if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
604
+ scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
605
+ if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
606
+ scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
607
+
608
+ # Make request
609
+ response = requests.post(
610
+ f'{self.api_url}/v1/scrape',
611
+ headers=_headers,
612
+ json=scrape_params,
613
+ timeout=(timeout / 1000.0 + 5 if timeout is not None else None)
614
+ )
615
+
616
+ if response.status_code == 200:
617
+ try:
618
+ response_json = response.json()
619
+ if response_json.get('success') and 'data' in response_json:
620
+ return V1ScrapeResponse(**response_json['data'])
621
+ elif "error" in response_json:
622
+ raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
623
+ else:
624
+ raise Exception(f'Failed to scrape URL. Error: {response_json}')
625
+ except ValueError:
626
+ raise Exception('Failed to parse Firecrawl response as JSON.')
627
+ else:
628
+ self._handle_error(response, 'scrape URL')
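
A minimal call sketch for scrape_url (editor's annotation, not part of the diff). It assumes a valid API key (placeholder below) and a reachable Firecrawl API:

    from firecrawl.v1.client import V1FirecrawlApp, V1JsonConfig

    app = V1FirecrawlApp(api_key="fc-...")  # placeholder key
    result = app.scrape_url(
        "https://firecrawl.dev",
        formats=["markdown", "json"],
        only_main_content=True,
        json_options=V1JsonConfig(prompt="Summarize the page in one sentence."),
    )
    if result.success:
        print(result.markdown)
        print(result.json_field)  # populated when the "json" format is requested
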
629
+
630
+ def search(
631
+ self,
632
+ query: str,
633
+ *,
634
+ limit: Optional[int] = None,
635
+ tbs: Optional[str] = None,
636
+ filter: Optional[str] = None,
637
+ lang: Optional[str] = None,
638
+ country: Optional[str] = None,
639
+ location: Optional[str] = None,
640
+ timeout: Optional[int] = 30000,
641
+ scrape_options: Optional[V1ScrapeOptions] = None,
642
+ **kwargs) -> V1SearchResponse:
643
+ """
644
+ Search for content using Firecrawl.
645
+
646
+ Args:
647
+ query (str): Search query string
648
+ limit (Optional[int]): Max results (default: 5)
649
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
650
+ filter (Optional[str]): Custom result filter
651
+ lang (Optional[str]): Language code (default: "en")
652
+ country (Optional[str]): Country code (default: "us")
653
+ location (Optional[str]): Geo-targeting
654
+ timeout (Optional[int]): Request timeout in milliseconds
655
+ scrape_options (Optional[ScrapeOptions]): Result scraping configuration
656
+ **kwargs: Additional keyword arguments for future compatibility
657
+
658
+ Returns:
659
+ SearchResponse: Response containing:
660
+ * success (bool): Whether request succeeded
661
+ * data (List[FirecrawlDocument]): Search results
662
+ * warning (Optional[str]): Warning message if any
663
+ * error (Optional[str]): Error message if any
664
+
665
+ Raises:
666
+ Exception: If search fails or response cannot be parsed
667
+ """
668
+ # Validate any additional kwargs
669
+ self._validate_kwargs(kwargs, "search")
670
+
671
+ # Build search parameters
672
+ search_params = {}
673
+
674
+ # Add individual parameters
675
+ if limit is not None:
676
+ search_params['limit'] = limit
677
+ if tbs is not None:
678
+ search_params['tbs'] = tbs
679
+ if filter is not None:
680
+ search_params['filter'] = filter
681
+ if lang is not None:
682
+ search_params['lang'] = lang
683
+ if country is not None:
684
+ search_params['country'] = country
685
+ if location is not None:
686
+ search_params['location'] = location
687
+ if timeout is not None:
688
+ search_params['timeout'] = timeout
689
+ if scrape_options is not None:
690
+ search_params['scrapeOptions'] = scrape_options.dict(by_alias=True, exclude_none=True)
691
+
692
+ # Add any additional kwargs
693
+ search_params.update(kwargs)
694
+ _integration = search_params.get('integration')
695
+
696
+ # Create final params object
697
+ final_params = V1SearchParams(query=query, **search_params)
698
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
699
+ params_dict['origin'] = f"python-sdk@{version}"
700
+
701
+ if _integration:
702
+ params_dict['integration'] = _integration
703
+
704
+ # Make request
705
+ response = requests.post(
706
+ f"{self.api_url}/v1/search",
707
+ headers={"Authorization": f"Bearer {self.api_key}"},
708
+ json=params_dict
709
+ )
710
+
711
+ if response.status_code == 200:
712
+ try:
713
+ response_json = response.json()
714
+ if response_json.get('success') and 'data' in response_json:
715
+ return V1SearchResponse(**response_json)
716
+ elif "error" in response_json:
717
+ raise Exception(f'Search failed. Error: {response_json["error"]}')
718
+ else:
719
+ raise Exception(f'Search failed. Error: {response_json}')
720
+ except ValueError:
721
+ raise Exception('Failed to parse Firecrawl response as JSON.')
722
+ else:
723
+ self._handle_error(response, 'search')
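
A minimal call sketch for search (editor's annotation). It assumes a valid API key; the scrape_options value asks Firecrawl to also scrape each result as markdown:

    from firecrawl.v1.client import V1FirecrawlApp, V1ScrapeOptions

    app = V1FirecrawlApp(api_key="fc-...")
    results = app.search(
        "firecrawl python sdk",
        limit=3,
        scrape_options=V1ScrapeOptions(formats=["markdown"]),
    )
    for item in results.data:
        print(item)
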
724
+
725
+ def crawl_url(
726
+ self,
727
+ url: str,
728
+ *,
729
+ include_paths: Optional[List[str]] = None,
730
+ exclude_paths: Optional[List[str]] = None,
731
+ max_depth: Optional[int] = None,
732
+ max_discovery_depth: Optional[int] = None,
733
+ limit: Optional[int] = None,
734
+ allow_backward_links: Optional[bool] = None,
735
+ crawl_entire_domain: Optional[bool] = None,
736
+ allow_external_links: Optional[bool] = None,
737
+ ignore_sitemap: Optional[bool] = None,
738
+ scrape_options: Optional[V1ScrapeOptions] = None,
739
+ webhook: Optional[Union[str, V1WebhookConfig]] = None,
740
+ deduplicate_similar_urls: Optional[bool] = None,
741
+ ignore_query_parameters: Optional[bool] = None,
742
+ regex_on_full_url: Optional[bool] = None,
743
+ delay: Optional[int] = None,
744
+ allow_subdomains: Optional[bool] = None,
745
+ max_concurrency: Optional[int] = None,
746
+ zero_data_retention: Optional[bool] = None,
747
+ poll_interval: Optional[int] = 2,
748
+ idempotency_key: Optional[str] = None,
749
+ **kwargs
750
+ ) -> V1CrawlStatusResponse:
751
+ """
752
+ Crawl a website starting from a URL.
753
+
754
+ Args:
755
+ url (str): Target URL to start crawling from
756
+ include_paths (Optional[List[str]]): Patterns of URLs to include
757
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
758
+ max_depth (Optional[int]): Maximum crawl depth
759
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
760
+ limit (Optional[int]): Maximum pages to crawl
761
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
762
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
763
+ allow_external_links (Optional[bool]): Follow external domain links
764
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
765
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
766
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
767
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
768
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
769
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
770
+ delay (Optional[int]): Delay in seconds between scrapes
771
+ allow_subdomains (Optional[bool]): Follow subdomains
772
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
773
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
774
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
775
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
776
+ **kwargs: Additional parameters to pass to the API
777
+
778
+ Returns:
779
+ CrawlStatusResponse with:
780
+ * Crawling status and progress
781
+ * Crawled page contents
782
+ * Success/error information
783
+
784
+ Raises:
785
+ Exception: If crawl fails
786
+ """
787
+ # Validate any additional kwargs
788
+ self._validate_kwargs(kwargs, "crawl_url")
789
+
790
+ crawl_params = {}
791
+
792
+ # Add individual parameters
793
+ if include_paths is not None:
794
+ crawl_params['includePaths'] = include_paths
795
+ if exclude_paths is not None:
796
+ crawl_params['excludePaths'] = exclude_paths
797
+ if max_depth is not None:
798
+ crawl_params['maxDepth'] = max_depth
799
+ if max_discovery_depth is not None:
800
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
801
+ if limit is not None:
802
+ crawl_params['limit'] = limit
803
+ if crawl_entire_domain is not None:
804
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
805
+ elif allow_backward_links is not None:
806
+ crawl_params['allowBackwardLinks'] = allow_backward_links
807
+ if allow_external_links is not None:
808
+ crawl_params['allowExternalLinks'] = allow_external_links
809
+ if ignore_sitemap is not None:
810
+ crawl_params['ignoreSitemap'] = ignore_sitemap
811
+ if scrape_options is not None:
812
+ crawl_params['scrapeOptions'] = scrape_options.dict(by_alias=True, exclude_none=True)
813
+ if webhook is not None:
814
+ crawl_params['webhook'] = webhook
815
+ if deduplicate_similar_urls is not None:
816
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
817
+ if ignore_query_parameters is not None:
818
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
819
+ if regex_on_full_url is not None:
820
+ crawl_params['regexOnFullURL'] = regex_on_full_url
821
+ if delay is not None:
822
+ crawl_params['delay'] = delay
823
+ if allow_subdomains is not None:
824
+ crawl_params['allowSubdomains'] = allow_subdomains
825
+ if max_concurrency is not None:
826
+ crawl_params['maxConcurrency'] = max_concurrency
827
+ if zero_data_retention is not None:
828
+ crawl_params['zeroDataRetention'] = zero_data_retention
829
+ # Add any additional kwargs
830
+ crawl_params.update(kwargs)
831
+ _integration = crawl_params.get('integration')
832
+
833
+ # Create final params object
834
+ final_params = V1CrawlParams(**crawl_params)
835
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
836
+ params_dict['url'] = url
837
+ params_dict['origin'] = f"python-sdk@{version}"
838
+
839
+ if _integration:
840
+ params_dict['integration'] = _integration
841
+
842
+ # Make request
843
+ headers = self._prepare_headers(idempotency_key)
844
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
845
+
846
+ if response.status_code == 200:
847
+ try:
848
+ id = response.json().get('id')
849
+ except:
850
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
851
+ return self._monitor_job_status(id, headers, poll_interval)
852
+ else:
853
+ self._handle_error(response, 'start crawl job')
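
A minimal call sketch for crawl_url, which starts the job and polls every poll_interval seconds until it finishes (editor's annotation; assumes a valid API key):

    from firecrawl.v1.client import V1FirecrawlApp

    app = V1FirecrawlApp(api_key="fc-...")
    status = app.crawl_url("https://docs.firecrawl.dev", limit=5, poll_interval=2)
    print(status.status, f"{status.completed}/{status.total} pages crawled")
    for doc in status.data:
        print(doc.metadata)
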
854
+
855
+ def async_crawl_url(
856
+ self,
857
+ url: str,
858
+ *,
859
+ include_paths: Optional[List[str]] = None,
860
+ exclude_paths: Optional[List[str]] = None,
861
+ max_depth: Optional[int] = None,
862
+ max_discovery_depth: Optional[int] = None,
863
+ limit: Optional[int] = None,
864
+ allow_backward_links: Optional[bool] = None,
865
+ crawl_entire_domain: Optional[bool] = None,
866
+ allow_external_links: Optional[bool] = None,
867
+ ignore_sitemap: Optional[bool] = None,
868
+ scrape_options: Optional[V1ScrapeOptions] = None,
869
+ webhook: Optional[Union[str, V1WebhookConfig]] = None,
870
+ deduplicate_similar_urls: Optional[bool] = None,
871
+ ignore_query_parameters: Optional[bool] = None,
872
+ regex_on_full_url: Optional[bool] = None,
873
+ delay: Optional[int] = None,
874
+ allow_subdomains: Optional[bool] = None,
875
+ max_concurrency: Optional[int] = None,
876
+ zero_data_retention: Optional[bool] = None,
877
+ idempotency_key: Optional[str] = None,
878
+ **kwargs
879
+ ) -> V1CrawlResponse:
880
+ """
881
+ Start an asynchronous crawl job.
882
+
883
+ Args:
884
+ url (str): Target URL to start crawling from
885
+ include_paths (Optional[List[str]]): Patterns of URLs to include
886
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
887
+ max_depth (Optional[int]): Maximum crawl depth
888
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
889
+ limit (Optional[int]): Maximum pages to crawl
890
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
891
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
892
+ allow_external_links (Optional[bool]): Follow external domain links
893
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
894
+ scrape_options (Optional[V1ScrapeOptions]): Page scraping configuration
895
+ webhook (Optional[Union[str, V1WebhookConfig]]): Notification webhook settings
896
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
897
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
898
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
899
+ delay (Optional[int]): Delay in seconds between scrapes
900
+ allow_subdomains (Optional[bool]): Follow subdomains
901
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
902
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
903
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
904
+ **kwargs: Additional parameters to pass to the API
905
+
906
+ Returns:
907
+ V1CrawlResponse with:
908
+ * success - Whether crawl started successfully
909
+ * id - Unique identifier for the crawl job
910
+ * url - Status check URL for the crawl
911
+ * error - Error message if start failed
912
+
913
+ Raises:
914
+ Exception: If crawl initiation fails
915
+ """
916
+ # Validate any additional kwargs
917
+ self._validate_kwargs(kwargs, "async_crawl_url")
918
+
919
+ crawl_params = {}
920
+
921
+ # Add individual parameters
922
+ if include_paths is not None:
923
+ crawl_params['includePaths'] = include_paths
924
+ if exclude_paths is not None:
925
+ crawl_params['excludePaths'] = exclude_paths
926
+ if max_depth is not None:
927
+ crawl_params['maxDepth'] = max_depth
928
+ if max_discovery_depth is not None:
929
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
930
+ if limit is not None:
931
+ crawl_params['limit'] = limit
932
+ if crawl_entire_domain is not None:
933
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
934
+ elif allow_backward_links is not None:
935
+ crawl_params['allowBackwardLinks'] = allow_backward_links
936
+ if allow_external_links is not None:
937
+ crawl_params['allowExternalLinks'] = allow_external_links
938
+ if ignore_sitemap is not None:
939
+ crawl_params['ignoreSitemap'] = ignore_sitemap
940
+ if scrape_options is not None:
941
+ crawl_params['scrapeOptions'] = scrape_options.dict(by_alias=True, exclude_none=True)
942
+ if webhook is not None:
943
+ crawl_params['webhook'] = webhook
944
+ if deduplicate_similar_urls is not None:
945
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
946
+ if ignore_query_parameters is not None:
947
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
948
+ if regex_on_full_url is not None:
949
+ crawl_params['regexOnFullURL'] = regex_on_full_url
950
+ if delay is not None:
951
+ crawl_params['delay'] = delay
952
+ if allow_subdomains is not None:
953
+ crawl_params['allowSubdomains'] = allow_subdomains
954
+ if max_concurrency is not None:
955
+ crawl_params['maxConcurrency'] = max_concurrency
956
+ if zero_data_retention is not None:
957
+ crawl_params['zeroDataRetention'] = zero_data_retention
958
+ # Add any additional kwargs
959
+ crawl_params.update(kwargs)
960
+
961
+ # Create final params object
962
+ final_params = V1CrawlParams(**crawl_params)
963
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
964
+ params_dict['url'] = url
965
+ params_dict['origin'] = f"python-sdk@{version}"
966
+
967
+ # Make request
968
+ headers = self._prepare_headers(idempotency_key)
969
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
970
+
971
+ if response.status_code == 200:
972
+ try:
973
+ return V1CrawlResponse(**response.json())
974
+ except:
975
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
976
+ else:
977
+ self._handle_error(response, 'start crawl job')
978
+
979
+ def check_crawl_status(self, id: str) -> V1CrawlStatusResponse:
980
+ """
981
+ Check the status and results of a crawl job.
982
+
983
+ Args:
984
+ id: Unique identifier for the crawl job
985
+
986
+ Returns:
987
+ V1CrawlStatusResponse containing:
988
+
989
+ Status Information:
990
+ * status - Current state (scraping/completed/failed/cancelled)
991
+ * completed - Number of pages crawled
992
+ * total - Total pages to crawl
993
+ * creditsUsed - API credits consumed
994
+ * expiresAt - Data expiration timestamp
995
+
996
+ Results:
997
+ * data - List of crawled documents
998
+ * next - URL for next page of results (if paginated)
999
+ * success - Whether status check succeeded
1000
+ * error - Error message if failed
1001
+
1002
+ Raises:
1003
+ Exception: If status check fails
1004
+ """
1005
+ endpoint = f'/v1/crawl/{id}'
1006
+
1007
+ headers = self._prepare_headers()
1008
+ response = self._get_request(f'{self.api_url}{endpoint}', headers)
1009
+ if response.status_code == 200:
1010
+ try:
1011
+ status_data = response.json()
1012
+ except:
1013
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1014
+ if status_data['status'] == 'completed':
1015
+ if 'data' in status_data:
1016
+ data = status_data['data']
1017
+ while 'next' in status_data:
1018
+ if len(status_data['data']) == 0:
1019
+ break
1020
+ next_url = status_data.get('next')
1021
+ if not next_url:
1022
+ logger.warning("Expected 'next' URL is missing.")
1023
+ break
1024
+ try:
1025
+ status_response = self._get_request(next_url, headers)
1026
+ if status_response.status_code != 200:
1027
+ logger.error(f"Failed to fetch next page: {status_response.status_code}")
1028
+ break
1029
+ try:
1030
+ next_data = status_response.json()
1031
+ except:
1032
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1033
+ data.extend(next_data.get('data', []))
1034
+ status_data = next_data
1035
+ except Exception as e:
1036
+ logger.error(f"Error during pagination request: {e}")
1037
+ break
1038
+ status_data['data'] = data
1039
+
1040
+ response = {
1041
+ 'status': status_data.get('status'),
1042
+ 'total': status_data.get('total'),
1043
+ 'completed': status_data.get('completed'),
1044
+ 'creditsUsed': status_data.get('creditsUsed'),
1045
+ 'expiresAt': status_data.get('expiresAt'),
1046
+ 'data': status_data.get('data')
1047
+ }
1048
+
1049
+ if 'error' in status_data:
1050
+ response['error'] = status_data['error']
1051
+
1052
+ if 'next' in status_data:
1053
+ response['next'] = status_data['next']
1054
+
1055
+ return V1CrawlStatusResponse(
1056
+ success=False if 'error' in status_data else True,
1057
+ **response
1058
+ )
1059
+ else:
1060
+ self._handle_error(response, 'check crawl status')
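
A sketch of manual polling with async_crawl_url plus check_crawl_status, as an alternative to the blocking crawl_url above (editor's annotation; assumes a valid API key):

    import time
    from firecrawl.v1.client import V1FirecrawlApp

    app = V1FirecrawlApp(api_key="fc-...")
    job = app.async_crawl_url("https://docs.firecrawl.dev", limit=10)
    while True:
        status = app.check_crawl_status(job.id)
        if status.status in ("completed", "failed", "cancelled"):
            break
        time.sleep(2)
    print(status.status, len(status.data), "documents")
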
1061
+
1062
+ def check_crawl_errors(self, id: str) -> V1CrawlErrorsResponse:
1063
+ """
1064
+ Returns information about crawl errors.
1065
+
1066
+ Args:
1067
+ id (str): The ID of the crawl job
1068
+
1069
+ Returns:
1070
+ V1CrawlErrorsResponse containing:
1071
+ * errors (List[Dict[str, str]]): List of errors with fields:
1072
+ - id (str): Error ID
1073
+ - timestamp (str): When the error occurred
1074
+ - url (str): URL that caused the error
1075
+ - error (str): Error message
1076
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1077
+
1078
+ Raises:
1079
+ Exception: If error check fails
1080
+ """
1081
+ headers = self._prepare_headers()
1082
+ response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
1083
+ if response.status_code == 200:
1084
+ try:
1085
+ return V1CrawlErrorsResponse(**response.json())
1086
+ except:
1087
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1088
+ else:
1089
+ self._handle_error(response, "check crawl errors")
1090
+
1091
+ def cancel_crawl(self, id: str) -> Dict[str, Any]:
1092
+ """
1093
+ Cancel an asynchronous crawl job.
1094
+
1095
+ Args:
1096
+ id (str): The ID of the crawl job to cancel
1097
+
1098
+ Returns:
1099
+ Dict[str, Any] containing:
1100
+ * success (bool): Whether cancellation was successful
1101
+ * error (str, optional): Error message if cancellation failed
1102
+
1103
+ Raises:
1104
+ Exception: If cancellation fails
1105
+ """
1106
+ headers = self._prepare_headers()
1107
+ response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
1108
+ if response.status_code == 200:
1109
+ try:
1110
+ return response.json()
1111
+ except:
1112
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1113
+ else:
1114
+ self._handle_error(response, "cancel crawl job")
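
A sketch combining check_crawl_errors and cancel_crawl for a running job (editor's annotation; assumes a valid API key, and per the docstrings above cancel_crawl returns a plain dict):

    from firecrawl.v1.client import V1FirecrawlApp

    app = V1FirecrawlApp(api_key="fc-...")
    job = app.async_crawl_url("https://docs.firecrawl.dev", limit=100)

    errors = app.check_crawl_errors(job.id)
    print(len(errors.errors), "errors,", len(errors.robotsBlocked), "robots-blocked URLs")

    print(app.cancel_crawl(job.id))  # e.g. {"success": True}
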
1115
+
1116
+ def crawl_url_and_watch(
1117
+ self,
1118
+ url: str,
1119
+ *,
1120
+ include_paths: Optional[List[str]] = None,
1121
+ exclude_paths: Optional[List[str]] = None,
1122
+ max_depth: Optional[int] = None,
1123
+ max_discovery_depth: Optional[int] = None,
1124
+ limit: Optional[int] = None,
1125
+ allow_backward_links: Optional[bool] = None,
1126
+ crawl_entire_domain: Optional[bool] = None,
1127
+ allow_external_links: Optional[bool] = None,
1128
+ ignore_sitemap: Optional[bool] = None,
1129
+ scrape_options: Optional[V1ScrapeOptions] = None,
1130
+ webhook: Optional[Union[str, V1WebhookConfig]] = None,
1131
+ deduplicate_similar_urls: Optional[bool] = None,
1132
+ ignore_query_parameters: Optional[bool] = None,
1133
+ regex_on_full_url: Optional[bool] = None,
1134
+ delay: Optional[int] = None,
1135
+ allow_subdomains: Optional[bool] = None,
1136
+ max_concurrency: Optional[int] = None,
1137
+ zero_data_retention: Optional[bool] = None,
1138
+ idempotency_key: Optional[str] = None,
1139
+ **kwargs
1140
+ ) -> 'V1CrawlWatcher':
1141
+ """
1142
+ Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
1143
+
1144
+ Args:
1145
+ url (str): Target URL to start crawling from
1146
+ include_paths (Optional[List[str]]): Patterns of URLs to include
1147
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
1148
+ max_depth (Optional[int]): Maximum crawl depth
1149
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
1150
+ limit (Optional[int]): Maximum pages to crawl
1151
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
1152
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
1153
+ allow_external_links (Optional[bool]): Follow external domain links
1154
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1155
+ scrape_options (Optional[V1ScrapeOptions]): Page scraping configuration
1156
+ webhook (Optional[Union[str, V1WebhookConfig]]): Notification webhook settings
1157
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
1158
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
1159
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
1160
+ delay (Optional[int]): Delay in seconds between scrapes
1161
+ allow_subdomains (Optional[bool]): Follow subdomains
1162
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1163
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
1164
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1165
+ **kwargs: Additional parameters to pass to the API
1166
+
1167
+ Returns:
1168
+ V1CrawlWatcher: An instance to monitor the crawl job via WebSocket
1169
+
1170
+ Raises:
1171
+ Exception: If crawl job fails to start
1172
+ """
1173
+ crawl_response = self.async_crawl_url(
1174
+ url,
1175
+ include_paths=include_paths,
1176
+ exclude_paths=exclude_paths,
1177
+ max_depth=max_depth,
1178
+ max_discovery_depth=max_discovery_depth,
1179
+ limit=limit,
1180
+ allow_backward_links=allow_backward_links,
1181
+ crawl_entire_domain=crawl_entire_domain,
1182
+ allow_external_links=allow_external_links,
1183
+ ignore_sitemap=ignore_sitemap,
1184
+ scrape_options=scrape_options,
1185
+ webhook=webhook,
1186
+ deduplicate_similar_urls=deduplicate_similar_urls,
1187
+ ignore_query_parameters=ignore_query_parameters,
1188
+ regex_on_full_url=regex_on_full_url,
1189
+ delay=delay,
1190
+ allow_subdomains=allow_subdomains,
1191
+ max_concurrency=max_concurrency,
1192
+ zero_data_retention=zero_data_retention,
1193
+ idempotency_key=idempotency_key,
1194
+ **kwargs
1195
+ )
1196
+ if crawl_response.success and crawl_response.id:
1197
+ return V1CrawlWatcher(crawl_response.id, self)
1198
+ else:
1199
+ raise Exception("Crawl job failed to start")
1200
+
1201
+ def map_url(
1202
+ self,
1203
+ url: str,
1204
+ *,
1205
+ search: Optional[str] = None,
1206
+ ignore_sitemap: Optional[bool] = None,
1207
+ include_subdomains: Optional[bool] = None,
1208
+ sitemap_only: Optional[bool] = None,
1209
+ limit: Optional[int] = None,
1210
+ timeout: Optional[int] = 30000,
1211
+ use_index: Optional[bool] = None,
1212
+ **kwargs) -> V1MapResponse:
1213
+ """
1214
+ Map and discover links from a URL.
1215
+
1216
+ Args:
1217
+ url (str): Target URL to map
1218
+ search (Optional[str]): Filter pattern for URLs
1219
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1220
+ include_subdomains (Optional[bool]): Include subdomain links
1221
+ sitemap_only (Optional[bool]): Only use sitemap.xml
1222
+ limit (Optional[int]): Maximum URLs to return
1223
+ timeout (Optional[int]): Request timeout in milliseconds
1224
+ **kwargs: Additional parameters to pass to the API
1225
+
1226
+ Returns:
1227
+ V1MapResponse: Response containing:
1228
+ * success (bool): Whether request succeeded
1229
+ * links (List[str]): Discovered URLs
1230
+ * error (Optional[str]): Error message if any
1231
+
1232
+ Raises:
1233
+ Exception: If mapping fails or response cannot be parsed
1234
+ """
1235
+ # Validate any additional kwargs
1236
+ self._validate_kwargs(kwargs, "map_url")
1237
+
1238
+ # Build map parameters
1239
+ map_params = {}
1240
+
1241
+ # Add individual parameters
1242
+ if search is not None:
1243
+ map_params['search'] = search
1244
+ if ignore_sitemap is not None:
1245
+ map_params['ignoreSitemap'] = ignore_sitemap
1246
+ if include_subdomains is not None:
1247
+ map_params['includeSubdomains'] = include_subdomains
1248
+ if sitemap_only is not None:
1249
+ map_params['sitemapOnly'] = sitemap_only
1250
+ if limit is not None:
1251
+ map_params['limit'] = limit
1252
+ if timeout is not None:
1253
+ map_params['timeout'] = timeout
1254
+ if use_index is not None:
1255
+ map_params['useIndex'] = use_index
1256
+
1257
+ # Add any additional kwargs
1258
+ map_params.update(kwargs)
1259
+ _integration = map_params.get('integration')
1260
+
1261
+ # Create final params object
1262
+ final_params = V1MapParams(**map_params)
1263
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
1264
+ params_dict['url'] = url
1265
+ params_dict['origin'] = f"python-sdk@{version}"
1266
+
1267
+ if _integration:
1268
+ params_dict['integration'] = _integration
1269
+
1270
+ # Make request
1271
+ response = requests.post(
1272
+ f"{self.api_url}/v1/map",
1273
+ headers={"Authorization": f"Bearer {self.api_key}"},
1274
+ json=params_dict
1275
+ )
1276
+
1277
+ if response.status_code == 200:
1278
+ try:
1279
+ response_json = response.json()
1280
+ if response_json.get('success') and 'links' in response_json:
1281
+ return V1MapResponse(**response_json)
1282
+ elif "error" in response_json:
1283
+ raise Exception(f'Map failed. Error: {response_json["error"]}')
1284
+ else:
1285
+ raise Exception(f'Map failed. Error: {response_json}')
1286
+ except ValueError:
1287
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1288
+ else:
1289
+ self._handle_error(response, 'map')
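
A minimal call sketch for map_url (editor's annotation; assumes a valid API key, and the search/limit values are illustrative):

    from firecrawl.v1.client import V1FirecrawlApp

    app = V1FirecrawlApp(api_key="fc-...")
    mapped = app.map_url("https://firecrawl.dev", search="docs", limit=100)
    for link in (mapped.links or []):
        print(link)
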
1290
+
1291
+ def batch_scrape_urls(
1292
+ self,
1293
+ urls: List[str],
1294
+ *,
1295
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1296
+ headers: Optional[Dict[str, str]] = None,
1297
+ include_tags: Optional[List[str]] = None,
1298
+ exclude_tags: Optional[List[str]] = None,
1299
+ only_main_content: Optional[bool] = None,
1300
+ wait_for: Optional[int] = None,
1301
+ timeout: Optional[int] = 30000,
1302
+ location: Optional[V1LocationConfig] = None,
1303
+ mobile: Optional[bool] = None,
1304
+ skip_tls_verification: Optional[bool] = None,
1305
+ remove_base64_images: Optional[bool] = None,
1306
+ block_ads: Optional[bool] = None,
1307
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1308
+ extract: Optional[V1JsonConfig] = None,
1309
+ json_options: Optional[V1JsonConfig] = None,
1310
+ actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
1311
+ agent: Optional[V1AgentOptions] = None,
1312
+ poll_interval: Optional[int] = 2,
1313
+ max_concurrency: Optional[int] = None,
1314
+ zero_data_retention: Optional[bool] = None,
1315
+ idempotency_key: Optional[str] = None,
1316
+ **kwargs
1317
+ ) -> V1BatchScrapeStatusResponse:
1318
+ """
1319
+ Batch scrape multiple URLs and monitor until completion.
1320
+
1321
+ Args:
1322
+ urls (List[str]): URLs to scrape
1323
+ formats (Optional[List[Literal]]): Content formats to retrieve
1324
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1325
+ include_tags (Optional[List[str]]): HTML tags to include
1326
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1327
+ only_main_content (Optional[bool]): Extract main content only
1328
+ wait_for (Optional[int]): Wait time in milliseconds
1329
+ timeout (Optional[int]): Request timeout in milliseconds
1330
+ location (Optional[LocationConfig]): Location configuration
1331
+ mobile (Optional[bool]): Use mobile user agent
1332
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1333
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1334
+ block_ads (Optional[bool]): Block advertisements
1335
+ proxy (Optional[Literal]): Proxy type to use
1336
+ extract (Optional[JsonConfig]): Content extraction config
1337
+ json_options (Optional[JsonConfig]): JSON extraction config
1338
+ actions (Optional[List[Union]]): Actions to perform
1339
+ agent (Optional[AgentOptions]): Agent configuration
1340
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1341
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
1342
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1343
+ **kwargs: Additional parameters to pass to the API
1344
+
1345
+ Returns:
1346
+ V1BatchScrapeStatusResponse with:
1347
+ * Scraping status and progress
1348
+ * Scraped content for each URL
1349
+ * Success/error information
1350
+
1351
+ Raises:
1352
+ Exception: If batch scrape fails
1353
+ """
1354
+ # Validate any additional kwargs
1355
+ self._validate_kwargs(kwargs, "batch_scrape_urls")
1356
+
1357
+ scrape_params = {}
1358
+
1359
+ # Add individual parameters
1360
+ if formats is not None:
1361
+ scrape_params['formats'] = formats
1362
+ if headers is not None:
1363
+ scrape_params['headers'] = headers
1364
+ if include_tags is not None:
1365
+ scrape_params['includeTags'] = include_tags
1366
+ if exclude_tags is not None:
1367
+ scrape_params['excludeTags'] = exclude_tags
1368
+ if only_main_content is not None:
1369
+ scrape_params['onlyMainContent'] = only_main_content
1370
+ if wait_for is not None:
1371
+ scrape_params['waitFor'] = wait_for
1372
+ if timeout is not None:
1373
+ scrape_params['timeout'] = timeout
1374
+ if location is not None:
1375
+ scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
1376
+ if mobile is not None:
1377
+ scrape_params['mobile'] = mobile
1378
+ if skip_tls_verification is not None:
1379
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1380
+ if remove_base64_images is not None:
1381
+ scrape_params['removeBase64Images'] = remove_base64_images
1382
+ if block_ads is not None:
1383
+ scrape_params['blockAds'] = block_ads
1384
+ if proxy is not None:
1385
+ scrape_params['proxy'] = proxy
1386
+ if extract is not None:
1387
+ extract = self._ensure_schema_dict(extract)
1388
+ if isinstance(extract, dict) and "schema" in extract:
1389
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1390
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
1391
+ if json_options is not None:
1392
+ json_options = self._ensure_schema_dict(json_options)
1393
+ if isinstance(json_options, dict) and "schema" in json_options:
1394
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1395
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
1396
+ if actions:
1397
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(by_alias=True, exclude_none=True) for action in actions]
1398
+ if agent is not None:
1399
+ scrape_params['agent'] = agent.dict(by_alias=True, exclude_none=True)
1400
+ if max_concurrency is not None:
1401
+ scrape_params['maxConcurrency'] = max_concurrency
1402
+ if zero_data_retention is not None:
1403
+ scrape_params['zeroDataRetention'] = zero_data_retention
1404
+
1405
+ # Add any additional kwargs
1406
+ scrape_params.update(kwargs)
1407
+
1408
+ # Create final params object
1409
+ final_params = V1ScrapeParams(**scrape_params)
1410
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
1411
+ params_dict['urls'] = urls
1412
+ params_dict['origin'] = f"python-sdk@{version}"
1413
+
1414
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1415
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1416
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1417
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1418
+
1419
+ # Make request
1420
+ headers = self._prepare_headers(idempotency_key)
1421
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1422
+
1423
+ if response.status_code == 200:
1424
+ try:
1425
+ id = response.json().get('id')
1426
+ except Exception:
1427
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1428
+ return self._monitor_job_status(id, headers, poll_interval)
1429
+ else:
1430
+ self._handle_error(response, 'start batch scrape job')
1431
+
1432
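A usage sketch for the blocking batch_scrape_urls method above, which submits the job and then polls until it finishes (illustrative only; assumes `app` is the V1FirecrawlApp instance from the map_url sketch).

status = app.batch_scrape_urls(
    ["https://example.com", "https://example.org"],
    formats=["markdown"],
    only_main_content=True,
    max_concurrency=2,
    poll_interval=5,          # check the job every 5 seconds instead of the default 2
)
print(status.status, f"{status.completed}/{status.total} pages")
for doc in status.data or []:
    print(doc)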
+ def async_batch_scrape_urls(
1433
+ self,
1434
+ urls: List[str],
1435
+ *,
1436
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1437
+ headers: Optional[Dict[str, str]] = None,
1438
+ include_tags: Optional[List[str]] = None,
1439
+ exclude_tags: Optional[List[str]] = None,
1440
+ only_main_content: Optional[bool] = None,
1441
+ wait_for: Optional[int] = None,
1442
+ timeout: Optional[int] = 30000,
1443
+ location: Optional[V1LocationConfig] = None,
1444
+ mobile: Optional[bool] = None,
1445
+ skip_tls_verification: Optional[bool] = None,
1446
+ remove_base64_images: Optional[bool] = None,
1447
+ block_ads: Optional[bool] = None,
1448
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1449
+ extract: Optional[V1JsonConfig] = None,
1450
+ json_options: Optional[V1JsonConfig] = None,
1451
+ actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
1452
+ agent: Optional[V1AgentOptions] = None,
1453
+ max_concurrency: Optional[int] = None,
1454
+ idempotency_key: Optional[str] = None,
1455
+ zero_data_retention: Optional[bool] = None,
1456
+ **kwargs
1457
+ ) -> V1BatchScrapeResponse:
1458
+ """
1459
+ Initiate a batch scrape job asynchronously.
1460
+
1461
+ Args:
1462
+ urls (List[str]): URLs to scrape
1463
+ formats (Optional[List[Literal]]): Content formats to retrieve
1464
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1465
+ include_tags (Optional[List[str]]): HTML tags to include
1466
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1467
+ only_main_content (Optional[bool]): Extract main content only
1468
+ wait_for (Optional[int]): Wait time in milliseconds
1469
+ timeout (Optional[int]): Request timeout in milliseconds
1470
+ location (Optional[LocationConfig]): Location configuration
1471
+ mobile (Optional[bool]): Use mobile user agent
1472
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1473
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1474
+ block_ads (Optional[bool]): Block advertisements
1475
+ proxy (Optional[Literal]): Proxy type to use
1476
+ extract (Optional[JsonConfig]): Content extraction config
1477
+ json_options (Optional[JsonConfig]): JSON extraction config
1478
+ actions (Optional[List[Union]]): Actions to perform
1479
+ agent (Optional[AgentOptions]): Agent configuration
1480
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1481
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
1482
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1483
+ **kwargs: Additional parameters to pass to the API
1484
+
1485
+ Returns:
1486
+ V1BatchScrapeResponse with:
1487
+ * success - Whether job started successfully
1488
+ * id - Unique identifier for the job
1489
+ * url - Status check URL
1490
+ * error - Error message if start failed
1491
+
1492
+ Raises:
1493
+ Exception: If job initiation fails
1494
+ """
1495
+ # Validate any additional kwargs
1496
+ self._validate_kwargs(kwargs, "async_batch_scrape_urls")
1497
+
1498
+ scrape_params = {}
1499
+
1500
+ # Add individual parameters
1501
+ if formats is not None:
1502
+ scrape_params['formats'] = formats
1503
+ if headers is not None:
1504
+ scrape_params['headers'] = headers
1505
+ if include_tags is not None:
1506
+ scrape_params['includeTags'] = include_tags
1507
+ if exclude_tags is not None:
1508
+ scrape_params['excludeTags'] = exclude_tags
1509
+ if only_main_content is not None:
1510
+ scrape_params['onlyMainContent'] = only_main_content
1511
+ if wait_for is not None:
1512
+ scrape_params['waitFor'] = wait_for
1513
+ if timeout is not None:
1514
+ scrape_params['timeout'] = timeout
1515
+ if location is not None:
1516
+ scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
1517
+ if mobile is not None:
1518
+ scrape_params['mobile'] = mobile
1519
+ if skip_tls_verification is not None:
1520
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1521
+ if remove_base64_images is not None:
1522
+ scrape_params['removeBase64Images'] = remove_base64_images
1523
+ if block_ads is not None:
1524
+ scrape_params['blockAds'] = block_ads
1525
+ if proxy is not None:
1526
+ scrape_params['proxy'] = proxy
1527
+ if extract is not None:
1528
+ extract = self._ensure_schema_dict(extract)
1529
+ if isinstance(extract, dict) and "schema" in extract:
1530
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1531
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
1532
+ if json_options is not None:
1533
+ json_options = self._ensure_schema_dict(json_options)
1534
+ if isinstance(json_options, dict) and "schema" in json_options:
1535
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1536
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
1537
+ if actions:
1538
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(by_alias=True, exclude_none=True) for action in actions]
1539
+ if agent is not None:
1540
+ scrape_params['agent'] = agent.dict(by_alias=True, exclude_none=True)
1541
+ if max_concurrency is not None:
1542
+ scrape_params['maxConcurrency'] = max_concurrency
1543
+ if zero_data_retention is not None:
1544
+ scrape_params['zeroDataRetention'] = zero_data_retention
1545
+
1546
+ # Add any additional kwargs
1547
+ scrape_params.update(kwargs)
1548
+
1549
+ # Create final params object
1550
+ final_params = V1ScrapeParams(**scrape_params)
1551
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
1552
+ params_dict['urls'] = urls
1553
+ params_dict['origin'] = f"python-sdk@{version}"
1554
+
1555
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1556
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1557
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1558
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1559
+
1560
+ # Make request
1561
+ headers = self._prepare_headers(idempotency_key)
1562
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1563
+
1564
+ if response.status_code == 200:
1565
+ try:
1566
+ return V1BatchScrapeResponse(**response.json())
1567
+ except Exception:
1568
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1569
+ else:
1570
+ self._handle_error(response, 'start batch scrape job')
1571
+
1572
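For the non-blocking variant, a sketch that starts the job with async_batch_scrape_urls and polls it with check_batch_scrape_status (defined further below). Illustrative only; assumes the same `app` instance.

import time

job = app.async_batch_scrape_urls(
    ["https://example.com", "https://example.org"],
    formats=["markdown", "links"],
)
if job.success and job.id:
    while True:
        status = app.check_batch_scrape_status(job.id)
        if status.status == "completed":
            break
        time.sleep(5)
    print(len(status.data or []), "documents scraped")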
+ def batch_scrape_urls_and_watch(
1573
+ self,
1574
+ urls: List[str],
1575
+ *,
1576
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1577
+ headers: Optional[Dict[str, str]] = None,
1578
+ include_tags: Optional[List[str]] = None,
1579
+ exclude_tags: Optional[List[str]] = None,
1580
+ only_main_content: Optional[bool] = None,
1581
+ wait_for: Optional[int] = None,
1582
+ timeout: Optional[int] = 30000,
1583
+ location: Optional[V1LocationConfig] = None,
1584
+ mobile: Optional[bool] = None,
1585
+ skip_tls_verification: Optional[bool] = None,
1586
+ remove_base64_images: Optional[bool] = None,
1587
+ block_ads: Optional[bool] = None,
1588
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1589
+ extract: Optional[V1JsonConfig] = None,
1590
+ json_options: Optional[V1JsonConfig] = None,
1591
+ actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
1592
+ agent: Optional[V1AgentOptions] = None,
1593
+ max_concurrency: Optional[int] = None,
1594
+ zero_data_retention: Optional[bool] = None,
1595
+ idempotency_key: Optional[str] = None,
1596
+ **kwargs
1597
+ ) -> 'V1CrawlWatcher':
1598
+ """
1599
+ Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
1600
+
1601
+ Args:
1602
+ urls (List[str]): URLs to scrape
1603
+ formats (Optional[List[Literal]]): Content formats to retrieve
1604
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1605
+ include_tags (Optional[List[str]]): HTML tags to include
1606
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1607
+ only_main_content (Optional[bool]): Extract main content only
1608
+ wait_for (Optional[int]): Wait time in milliseconds
1609
+ timeout (Optional[int]): Request timeout in milliseconds
1610
+ location (Optional[LocationConfig]): Location configuration
1611
+ mobile (Optional[bool]): Use mobile user agent
1612
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1613
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1614
+ block_ads (Optional[bool]): Block advertisements
1615
+ proxy (Optional[Literal]): Proxy type to use
1616
+ extract (Optional[JsonConfig]): Content extraction config
1617
+ json_options (Optional[JsonConfig]): JSON extraction config
1618
+ actions (Optional[List[Union]]): Actions to perform
1619
+ agent (Optional[AgentOptions]): Agent configuration
1620
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1621
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
1622
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1623
+ **kwargs: Additional parameters to pass to the API
1624
+
1625
+ Returns:
1626
+ V1CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
1627
+
1628
+ Raises:
1629
+ Exception: If batch scrape job fails to start
1630
+ """
1631
+ # Validate any additional kwargs
1632
+ self._validate_kwargs(kwargs, "batch_scrape_urls_and_watch")
1633
+
1634
+ scrape_params = {}
1635
+
1636
+ # Add individual parameters
1637
+ if formats is not None:
1638
+ scrape_params['formats'] = formats
1639
+ if headers is not None:
1640
+ scrape_params['headers'] = headers
1641
+ if include_tags is not None:
1642
+ scrape_params['includeTags'] = include_tags
1643
+ if exclude_tags is not None:
1644
+ scrape_params['excludeTags'] = exclude_tags
1645
+ if only_main_content is not None:
1646
+ scrape_params['onlyMainContent'] = only_main_content
1647
+ if wait_for is not None:
1648
+ scrape_params['waitFor'] = wait_for
1649
+ if timeout is not None:
1650
+ scrape_params['timeout'] = timeout
1651
+ if location is not None:
1652
+ scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
1653
+ if mobile is not None:
1654
+ scrape_params['mobile'] = mobile
1655
+ if skip_tls_verification is not None:
1656
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1657
+ if remove_base64_images is not None:
1658
+ scrape_params['removeBase64Images'] = remove_base64_images
1659
+ if block_ads is not None:
1660
+ scrape_params['blockAds'] = block_ads
1661
+ if proxy is not None:
1662
+ scrape_params['proxy'] = proxy
1663
+ if extract is not None:
1664
+ extract = self._ensure_schema_dict(extract)
1665
+ if isinstance(extract, dict) and "schema" in extract:
1666
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1667
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
1668
+ if json_options is not None:
1669
+ json_options = self._ensure_schema_dict(json_options)
1670
+ if isinstance(json_options, dict) and "schema" in json_options:
1671
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1672
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
1673
+ if actions:
1674
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(by_alias=True, exclude_none=True) for action in actions]
1675
+ if agent is not None:
1676
+ scrape_params['agent'] = agent.dict(by_alias=True, exclude_none=True)
1677
+ if max_concurrency is not None:
1678
+ scrape_params['maxConcurrency'] = max_concurrency
1679
+ if zero_data_retention is not None:
1680
+ scrape_params['zeroDataRetention'] = zero_data_retention
1681
+
1682
+ # Add any additional kwargs
1683
+ scrape_params.update(kwargs)
1684
+
1685
+ # Create final params object
1686
+ final_params = V1ScrapeParams(**scrape_params)
1687
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
1688
+ params_dict['urls'] = urls
1689
+ params_dict['origin'] = f"python-sdk@{version}"
1690
+
1691
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1692
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1693
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1694
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1695
+
1696
+ # Make request
1697
+ headers = self._prepare_headers(idempotency_key)
1698
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1699
+
1700
+ if response.status_code == 200:
1701
+ try:
1702
+ crawl_response = V1BatchScrapeResponse(**response.json())
1703
+ if crawl_response.success and crawl_response.id:
1704
+ return V1CrawlWatcher(crawl_response.id, self)
1705
+ else:
1706
+ raise Exception("Batch scrape job failed to start")
1707
+ except Exception as e:
1708
+ raise Exception(f'Failed to start batch scrape job: {e}')
1709
+ else:
1710
+ self._handle_error(response, 'start batch scrape job')
1711
+
1712
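batch_scrape_urls_and_watch returns the V1CrawlWatcher defined at the end of this file; below is a sketch of wiring up its event handlers (illustrative only; `connect()` is a coroutine, so it is driven with asyncio, and the same `app` instance is assumed).

import asyncio

watcher = app.batch_scrape_urls_and_watch(
    ["https://example.com"],
    formats=["markdown"],
)
watcher.add_event_listener("document", lambda doc: print("got document:", doc))
watcher.add_event_listener("done", lambda detail: print("batch finished"))
asyncio.run(watcher.connect())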
+ def check_batch_scrape_status(self, id: str) -> V1BatchScrapeStatusResponse:
1713
+ """
1714
+ Check the status of a batch scrape job using the Firecrawl API.
1715
+
1716
+ Args:
1717
+ id (str): The ID of the batch scrape job.
1718
+
1719
+ Returns:
1720
+ V1BatchScrapeStatusResponse: The status of the batch scrape job.
1721
+
1722
+ Raises:
1723
+ Exception: If the status check request fails.
1724
+ """
1725
+ endpoint = f'/v1/batch/scrape/{id}'
1726
+
1727
+ headers = self._prepare_headers()
1728
+ response = self._get_request(f'{self.api_url}{endpoint}', headers)
1729
+ if response.status_code == 200:
1730
+ try:
1731
+ status_data = response.json()
1732
+ except Exception:
1733
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1734
+ if status_data['status'] == 'completed':
1735
+ if 'data' in status_data:
1736
+ data = status_data['data']
1737
+ while 'next' in status_data:
1738
+ if len(status_data['data']) == 0:
1739
+ break
1740
+ next_url = status_data.get('next')
1741
+ if not next_url:
1742
+ logger.warning("Expected 'next' URL is missing.")
1743
+ break
1744
+ try:
1745
+ status_response = self._get_request(next_url, headers)
1746
+ if status_response.status_code != 200:
1747
+ logger.error(f"Failed to fetch next page: {status_response.status_code}")
1748
+ break
1749
+ try:
1750
+ next_data = status_response.json()
1751
+ except Exception:
1752
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1753
+ data.extend(next_data.get('data', []))
1754
+ status_data = next_data
1755
+ except Exception as e:
1756
+ logger.error(f"Error during pagination request: {e}")
1757
+ break
1758
+ status_data['data'] = data
1759
+
1760
+ return V1BatchScrapeStatusResponse(**{
1761
+ 'success': False if 'error' in status_data else True,
1762
+ 'status': status_data.get('status'),
1763
+ 'total': status_data.get('total'),
1764
+ 'completed': status_data.get('completed'),
1765
+ 'creditsUsed': status_data.get('creditsUsed'),
1766
+ 'expiresAt': status_data.get('expiresAt'),
1767
+ 'data': status_data.get('data'),
1768
+ 'next': status_data.get('next'),
1769
+ 'error': status_data.get('error')
1770
+ })
1771
+ else:
1772
+ self._handle_error(response, 'check batch scrape status')
1773
+
1774
+ def check_batch_scrape_errors(self, id: str) -> V1CrawlErrorsResponse:
1775
+ """
1776
+ Returns information about batch scrape errors.
1777
+
1778
+ Args:
1779
+ id (str): The ID of the batch scrape job.
1780
+
1781
+ Returns:
1782
+ V1CrawlErrorsResponse containing:
1783
+ * errors (List[Dict[str, str]]): List of errors with fields:
1784
+ * id (str): Error ID
1785
+ * timestamp (str): When the error occurred
1786
+ * url (str): URL that caused the error
1787
+ * error (str): Error message
1788
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1789
+
1790
+ Raises:
1791
+ Exception: If the error check request fails
1792
+ """
1793
+ headers = self._prepare_headers()
1794
+ response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
1795
+ if response.status_code == 200:
1796
+ try:
1797
+ return V1CrawlErrorsResponse(**response.json())
1798
+ except Exception:
1799
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1800
+ else:
1801
+ self._handle_error(response, "check batch scrape errors")
1802
+
1803
+ def extract(
1804
+ self,
1805
+ urls: Optional[List[str]] = None,
1806
+ *,
1807
+ prompt: Optional[str] = None,
1808
+ schema: Optional[Any] = None,
1809
+ system_prompt: Optional[str] = None,
1810
+ allow_external_links: Optional[bool] = False,
1811
+ enable_web_search: Optional[bool] = False,
1812
+ show_sources: Optional[bool] = False,
1813
+ agent: Optional[Dict[str, Any]] = None,
1814
+ **kwargs) -> V1ExtractResponse[Any]:
1815
+ """
1816
+ Extract structured information from URLs.
1817
+
1818
+ Args:
1819
+ urls (Optional[List[str]]): URLs to extract from
1820
+ prompt (Optional[str]): Custom extraction prompt
1821
+ schema (Optional[Any]): JSON schema/Pydantic model
1822
+ system_prompt (Optional[str]): System context
1823
+ allow_external_links (Optional[bool]): Follow external links
1824
+ enable_web_search (Optional[bool]): Enable web search
1825
+ show_sources (Optional[bool]): Include source URLs
1826
+ agent (Optional[Dict[str, Any]]): Agent configuration
1827
+ **kwargs: Additional parameters to pass to the API
1828
+
1829
+ Returns:
1830
+ V1ExtractResponse[Any] with:
1831
+ * success (bool): Whether request succeeded
1832
+ * data (Optional[Any]): Extracted data matching schema
1833
+ * error (Optional[str]): Error message if any
1834
+
1835
+ Raises:
1836
+ ValueError: If prompt/schema missing or extraction fails
1837
+ """
1838
+ # Validate any additional kwargs
1839
+ self._validate_kwargs(kwargs, "extract")
1840
+
1841
+ headers = self._prepare_headers()
1842
+
1843
+ if not prompt and not schema:
1844
+ raise ValueError("Either prompt or schema is required")
1845
+
1846
+ if not urls and not prompt:
1847
+ raise ValueError("Either urls or prompt is required")
1848
+
1849
+ if schema:
1850
+ schema = self._ensure_schema_dict(schema)
1851
+
1852
+ request_data = {
1853
+ 'urls': urls or [],
1854
+ 'allowExternalLinks': allow_external_links,
1855
+ 'enableWebSearch': enable_web_search,
1856
+ 'showSources': show_sources,
1857
+ 'schema': schema,
1858
+ 'origin': f'python-sdk@{get_version()}'
1859
+ }
1860
+
1861
+ # Only add prompt and systemPrompt if they exist
1862
+ if prompt:
1863
+ request_data['prompt'] = prompt
1864
+ if system_prompt:
1865
+ request_data['systemPrompt'] = system_prompt
1866
+
1867
+ if agent:
1868
+ request_data['agent'] = agent
1869
+
1870
+ # Add any additional kwargs
1871
+ request_data.update(kwargs)
1872
+
1873
+ try:
1874
+ # Send the initial extract request
1875
+ response = self._post_request(
1876
+ f'{self.api_url}/v1/extract',
1877
+ request_data,
1878
+ headers
1879
+ )
1880
+ if response.status_code == 200:
1881
+ try:
1882
+ data = response.json()
1883
+ except Exception:
1884
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1885
+ if data['success']:
1886
+ job_id = data.get('id')
1887
+ if not job_id:
1888
+ raise Exception('Job ID not returned from extract request.')
1889
+
1890
+ # Poll for the extract status
1891
+ while True:
1892
+ status_response = self._get_request(
1893
+ f'{self.api_url}/v1/extract/{job_id}',
1894
+ headers
1895
+ )
1896
+ if status_response.status_code == 200:
1897
+ try:
1898
+ status_data = status_response.json()
1899
+ except Exception:
1900
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1901
+ if status_data['status'] == 'completed':
1902
+ return V1ExtractResponse(**status_data)
1903
+ elif status_data['status'] in ['failed', 'cancelled']:
1904
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
1905
+ else:
1906
+ self._handle_error(status_response, "extract-status")
1907
+
1908
+ time.sleep(2) # Polling interval
1909
+ else:
1910
+ raise Exception(f'Failed to extract. Error: {data["error"]}')
1911
+ else:
1912
+ self._handle_error(response, "extract")
1913
+ except Exception as e:
1914
+ raise ValueError(str(e), 500)
1915
+
1916
+ return V1ExtractResponse(success=False, error="Internal server error.")
1917
+
1918
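A sketch of structured extraction with extract. The schema can be a plain JSON-schema dict or a Pydantic model class, since _ensure_schema_dict converts model classes before the request is sent (illustrative only; assumes the same `app` instance).

from pydantic import BaseModel

class Company(BaseModel):
    name: str
    mission: str

result = app.extract(
    urls=["https://example.com/about"],
    prompt="Extract the company name and mission statement.",
    schema=Company,
)
if result.success:
    print(result.data)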
+ def get_extract_status(self, job_id: str) -> V1ExtractResponse[Any]:
1919
+ """
1920
+ Retrieve the status of an extract job.
1921
+
1922
+ Args:
1923
+ job_id (str): The ID of the extract job.
1924
+
1925
+ Returns:
1926
+ ExtractResponse[Any]: The status of the extract job.
1927
+
1928
+ Raises:
1929
+ ValueError: If there is an error retrieving the status.
1930
+ """
1931
+ headers = self._prepare_headers()
1932
+ try:
1933
+ response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
1934
+ if response.status_code == 200:
1935
+ try:
1936
+ return V1ExtractResponse(**response.json())
1937
+ except Exception:
1938
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1939
+ else:
1940
+ self._handle_error(response, "get extract status")
1941
+ except Exception as e:
1942
+ raise ValueError(str(e), 500)
1943
+
1944
+ def async_extract(
1945
+ self,
1946
+ urls: Optional[List[str]] = None,
1947
+ *,
1948
+ prompt: Optional[str] = None,
1949
+ schema: Optional[Any] = None,
1950
+ system_prompt: Optional[str] = None,
1951
+ allow_external_links: Optional[bool] = False,
1952
+ enable_web_search: Optional[bool] = False,
1953
+ show_sources: Optional[bool] = False,
1954
+ agent: Optional[Dict[str, Any]] = None) -> V1ExtractResponse[Any]:
1955
+ """
1956
+ Initiate an asynchronous extract job.
1957
+
1958
+ Args:
1959
+ urls (Optional[List[str]]): URLs to extract information from
1960
+ prompt (Optional[str]): Custom extraction prompt
1961
+ schema (Optional[Any]): JSON schema/Pydantic model
1962
+ system_prompt (Optional[str]): System context
1963
+ allow_external_links (Optional[bool]): Follow external links
1964
+ enable_web_search (Optional[bool]): Enable web search
1965
+ show_sources (Optional[bool]): Include source URLs
1966
+ agent (Optional[Dict[str, Any]]): Agent configuration
1967
+
1969
+ Returns:
1970
+ ExtractResponse[Any] with:
1971
+ * success (bool): Whether request succeeded
1972
+ * data (Optional[Any]): Extracted data matching schema
1973
+ * error (Optional[str]): Error message if any
1974
+
1975
+ Raises:
1976
+ ValueError: If job initiation fails
1977
+ """
1978
+ headers = self._prepare_headers()
1979
+
1980
+ if schema:
1982
+ schema = self._ensure_schema_dict(schema)
1983
+
1984
+ request_data = {
1985
+ 'urls': urls,
1986
+ 'allowExternalLinks': allow_external_links,
1987
+ 'enableWebSearch': enable_web_search,
1988
+ 'showSources': show_sources,
1989
+ 'schema': schema,
1990
+ 'origin': f'python-sdk@{version}'
1991
+ }
1992
+
1993
+ if prompt:
1994
+ request_data['prompt'] = prompt
1995
+ if system_prompt:
1996
+ request_data['systemPrompt'] = system_prompt
1997
+ if agent:
1998
+ request_data['agent'] = agent
1999
+
2000
+ try:
2001
+ response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
2002
+ if response.status_code == 200:
2003
+ try:
2004
+ return V1ExtractResponse(**response.json())
2005
+ except Exception:
2006
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2007
+ else:
2008
+ self._handle_error(response, "async extract")
2009
+ except Exception as e:
2010
+ raise ValueError(str(e), 500)
2011
+
2012
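The asynchronous counterpart can be paired with get_extract_status, as in the sketch below. The `id` and `status` attributes are assumed to be populated on V1ExtractResponse, as the raw API payload suggests.

import time

job = app.async_extract(
    urls=["https://example.com"],
    prompt="Summarise the page in one sentence.",
)
job_id = getattr(job, "id", None)   # `id` assumed to be set on the start response
if job.success and job_id:
    while True:
        status = app.get_extract_status(job_id)
        if getattr(status, "status", None) in ("completed", "failed", "cancelled"):
            break
        time.sleep(2)
    print(status.data)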
+ def generate_llms_text(
2013
+ self,
2014
+ url: str,
2015
+ *,
2016
+ max_urls: Optional[int] = None,
2017
+ show_full_text: Optional[bool] = None,
2018
+ cache: Optional[bool] = None,
2019
+ experimental_stream: Optional[bool] = None) -> V1GenerateLLMsTextStatusResponse:
2020
+ """
2021
+ Generate LLMs.txt for a given URL and poll until completion.
2022
+
2023
+ Args:
2024
+ url (str): Target URL to generate LLMs.txt from
2025
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
2026
+ show_full_text (Optional[bool]): Include full text in output (default: False)
2027
+ cache (Optional[bool]): Whether to use cached content if available (default: True)
2028
+ experimental_stream (Optional[bool]): Enable experimental streaming
2029
+
2030
+ Returns:
2031
+ GenerateLLMsTextStatusResponse with:
2032
+ * Generated LLMs.txt content
2033
+ * Full version if requested
2034
+ * Generation status
2035
+ * Success/error information
2036
+
2037
+ Raises:
2038
+ Exception: If generation fails
2039
+ """
2040
+ params = V1GenerateLLMsTextParams(
2041
+ maxUrls=max_urls,
2042
+ showFullText=show_full_text,
2043
+ cache=cache,
2044
+ __experimental_stream=experimental_stream
2045
+ )
2046
+
2047
+ response = self.async_generate_llms_text(
2048
+ url,
2049
+ max_urls=max_urls,
2050
+ show_full_text=show_full_text,
2051
+ cache=cache,
2052
+ experimental_stream=experimental_stream
2053
+ )
2054
+
2055
+ if not response.success or not response.id:
2056
+ return V1GenerateLLMsTextStatusResponse(
2057
+ success=False,
2058
+ error='Failed to start LLMs.txt generation',
2059
+ status='failed',
2060
+ expiresAt=''
2061
+ )
2062
+
2063
+ job_id = response.id
2064
+ while True:
2065
+ status = self.check_generate_llms_text_status(job_id)
2066
+
2067
+ if status.status == 'completed':
2068
+ return status
2069
+ elif status.status == 'failed':
2070
+ return status
2071
+ elif status.status != 'processing':
2072
+ return V1GenerateLLMsTextStatusResponse(
2073
+ success=False,
2074
+ error='LLMs.txt generation job terminated unexpectedly',
2075
+ status='failed',
2076
+ expiresAt=''
2077
+ )
2078
+
2079
+ time.sleep(2) # Polling interval
2080
+
2081
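A sketch of the blocking LLMs.txt generator (illustrative only; assumes the same `app` instance).

result = app.generate_llms_text(
    "https://example.com",
    max_urls=25,
    show_full_text=True,
)
if result.success:
    print(result.status)
    print(result.data)   # documented to contain 'llmstxt' (and 'llmsfulltxt' if requested)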
+ def async_generate_llms_text(
2082
+ self,
2083
+ url: str,
2084
+ *,
2085
+ max_urls: Optional[int] = None,
2086
+ show_full_text: Optional[bool] = None,
2087
+ cache: Optional[bool] = None,
2088
+ experimental_stream: Optional[bool] = None) -> V1GenerateLLMsTextResponse:
2089
+ """
2090
+ Initiate an asynchronous LLMs.txt generation operation.
2091
+
2092
+ Args:
2093
+ url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
2094
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
2095
+ show_full_text (Optional[bool]): Include full text in output (default: False)
2096
+ cache (Optional[bool]): Whether to use cached content if available (default: True)
2097
+ experimental_stream (Optional[bool]): Enable experimental streaming
2098
+
2099
+ Returns:
2100
+ GenerateLLMsTextResponse: A response containing:
2101
+ * success (bool): Whether the generation initiation was successful
2102
+ * id (str): The unique identifier for the generation job
2103
+ * error (str, optional): Error message if initiation failed
2104
+
2105
+ Raises:
2106
+ Exception: If the generation job initiation fails.
2107
+ """
2108
+ params = V1GenerateLLMsTextParams(
2109
+ maxUrls=max_urls,
2110
+ showFullText=show_full_text,
2111
+ cache=cache,
2112
+ __experimental_stream=experimental_stream
2113
+ )
2114
+
2115
+ headers = self._prepare_headers()
2116
+ json_data = {'url': url, **params.dict(by_alias=True, exclude_none=True)}
2117
+ json_data['origin'] = f"python-sdk@{version}"
2118
+
2119
+ try:
2120
+ req = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
2121
+ response = req.json()
2122
+ print("json_data", json_data)
2123
+ print("response", response)
2124
+ if response.get('success'):
2125
+ try:
2126
+ return V1GenerateLLMsTextResponse(**response)
2127
+ except Exception:
2128
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2129
+ else:
2130
+ self._handle_error(response, 'start LLMs.txt generation')
2131
+ except Exception as e:
2132
+ raise ValueError(str(e))
2133
+
2134
+ return V1GenerateLLMsTextResponse(
2135
+ success=False,
2136
+ error='Internal server error'
2137
+ )
2138
+
2139
+ def check_generate_llms_text_status(self, id: str) -> V1GenerateLLMsTextStatusResponse:
2140
+ """
2141
+ Check the status of a LLMs.txt generation operation.
2142
+
2143
+ Args:
2144
+ id (str): The unique identifier of the LLMs.txt generation job to check status for.
2145
+
2146
+ Returns:
2147
+ GenerateLLMsTextStatusResponse: A response containing:
2148
+ * success (bool): Whether the generation was successful
2149
+ * status (str): Status of generation ("processing", "completed", "failed")
2150
+ * data (Dict[str, str], optional): Generated text with fields:
2151
+ * llmstxt (str): Generated LLMs.txt content
2152
+ * llmsfulltxt (str, optional): Full version if requested
2153
+ * error (str, optional): Error message if generation failed
2154
+ * expiresAt (str): When the generated data expires
2155
+
2156
+ Raises:
2157
+ Exception: If the status check fails.
2158
+ """
2159
+ headers = self._prepare_headers()
2160
+ try:
2161
+ response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers)
2162
+ if response.status_code == 200:
2163
+ try:
2164
+ json_data = response.json()
2165
+ return V1GenerateLLMsTextStatusResponse(**json_data)
2166
+ except Exception as e:
2167
+ raise Exception(f'Failed to parse Firecrawl response as GenerateLLMsTextStatusResponse: {str(e)}')
2168
+ elif response.status_code == 404:
2169
+ raise Exception('LLMs.txt generation job not found')
2170
+ else:
2171
+ self._handle_error(response, 'check LLMs.txt generation status')
2172
+ except Exception as e:
2173
+ raise ValueError(str(e))
2174
+
2175
+ return V1GenerateLLMsTextStatusResponse(success=False, error='Internal server error', status='failed', expiresAt='')
2176
+
2177
+ def _prepare_headers(
2178
+ self,
2179
+ idempotency_key: Optional[str] = None) -> Dict[str, str]:
2180
+ """
2181
+ Prepare the headers for API requests.
2182
+
2183
+ Args:
2184
+ idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
2185
+
2186
+ Returns:
2187
+ Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
2188
+ """
2189
+ if idempotency_key:
2190
+ return {
2191
+ 'Content-Type': 'application/json',
2192
+ 'Authorization': f'Bearer {self.api_key}',
2193
+ 'x-idempotency-key': idempotency_key
2194
+ }
2195
+
2196
+ return {
2197
+ 'Content-Type': 'application/json',
2198
+ 'Authorization': f'Bearer {self.api_key}',
2199
+ }
2200
+
2201
+ def _post_request(
2202
+ self,
2203
+ url: str,
2204
+ data: Dict[str, Any],
2205
+ headers: Dict[str, str],
2206
+ retries: int = 3,
2207
+ backoff_factor: float = 0.5) -> requests.Response:
2208
+ """
2209
+ Make a POST request with retries.
2210
+
2211
+ Args:
2212
+ url (str): The URL to send the POST request to.
2213
+ data (Dict[str, Any]): The JSON data to include in the POST request.
2214
+ headers (Dict[str, str]): The headers to include in the POST request.
2215
+ retries (int): Number of retries for the request.
2216
+ backoff_factor (float): Backoff factor for retries.
2217
+
2218
+ Returns:
2219
+ requests.Response: The response from the POST request.
2220
+
2221
+ Raises:
2222
+ requests.RequestException: If the request fails after the specified retries.
2223
+ """
2224
+ for attempt in range(retries):
2225
+ response = requests.post(url, headers=headers, json=data, timeout=((data["timeout"] / 1000.0 + 5) if "timeout" in data and data["timeout"] is not None else None))
2226
+ if response.status_code == 502:
2227
+ time.sleep(backoff_factor * (2 ** attempt))
2228
+ else:
2229
+ return response
2230
+ return response
2231
+
2232
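For reference, the retry helpers above only retry on HTTP 502; any other response is returned immediately. With the defaults retries=3 and backoff_factor=0.5 the sleep schedule works out as follows.

backoff_factor, retries = 0.5, 3
for attempt in range(retries):
    print(f"attempt {attempt}: sleep {backoff_factor * (2 ** attempt):.1f}s after a 502")
# -> 0.5s, 1.0s, 2.0s; after the last attempt the 502 response itself is returned.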
+ def _get_request(
2233
+ self,
2234
+ url: str,
2235
+ headers: Dict[str, str],
2236
+ retries: int = 3,
2237
+ backoff_factor: float = 0.5) -> requests.Response:
2238
+ """
2239
+ Make a GET request with retries.
2240
+
2241
+ Args:
2242
+ url (str): The URL to send the GET request to.
2243
+ headers (Dict[str, str]): The headers to include in the GET request.
2244
+ retries (int): Number of retries for the request.
2245
+ backoff_factor (float): Backoff factor for retries.
2246
+
2247
+ Returns:
2248
+ requests.Response: The response from the GET request.
2249
+
2250
+ Raises:
2251
+ requests.RequestException: If the request fails after the specified retries.
2252
+ """
2253
+ for attempt in range(retries):
2254
+ response = requests.get(url, headers=headers)
2255
+ if response.status_code == 502:
2256
+ time.sleep(backoff_factor * (2 ** attempt))
2257
+ else:
2258
+ return response
2259
+ return response
2260
+
2261
+ def _delete_request(
2262
+ self,
2263
+ url: str,
2264
+ headers: Dict[str, str],
2265
+ retries: int = 3,
2266
+ backoff_factor: float = 0.5) -> requests.Response:
2267
+ """
2268
+ Make a DELETE request with retries.
2269
+
2270
+ Args:
2271
+ url (str): The URL to send the DELETE request to.
2272
+ headers (Dict[str, str]): The headers to include in the DELETE request.
2273
+ retries (int): Number of retries for the request.
2274
+ backoff_factor (float): Backoff factor for retries.
2275
+
2276
+ Returns:
2277
+ requests.Response: The response from the DELETE request.
2278
+
2279
+ Raises:
2280
+ requests.RequestException: If the request fails after the specified retries.
2281
+ """
2282
+ for attempt in range(retries):
2283
+ response = requests.delete(url, headers=headers)
2284
+ if response.status_code == 502:
2285
+ time.sleep(backoff_factor * (2 ** attempt))
2286
+ else:
2287
+ return response
2288
+ return response
2289
+
2290
+ def _monitor_job_status(
2291
+ self,
2292
+ id: str,
2293
+ headers: Dict[str, str],
2294
+ poll_interval: int) -> V1CrawlStatusResponse:
2295
+ """
2296
+ Monitor the status of a crawl job until completion.
2297
+
2298
+ Args:
2299
+ id (str): The ID of the crawl job.
2300
+ headers (Dict[str, str]): The headers to include in the status check requests.
2301
+ poll_interval (int): Seconds between status checks.
2302
+
2303
+ Returns:
2304
+ CrawlStatusResponse: The crawl results if the job is completed successfully.
2305
+
2306
+ Raises:
2307
+ Exception: If the job fails or an error occurs during status checks.
2308
+ """
2309
+ while True:
2310
+ api_url = f'{self.api_url}/v1/crawl/{id}'
2311
+
2312
+ status_response = self._get_request(api_url, headers)
2313
+ if status_response.status_code == 200:
2314
+ try:
2315
+ status_data = status_response.json()
2316
+ except Exception:
2317
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2318
+ if status_data['status'] == 'completed':
2319
+ if 'data' in status_data:
2320
+ data = status_data['data']
2321
+ while 'next' in status_data:
2322
+ if len(status_data['data']) == 0:
2323
+ break
2324
+ status_response = self._get_request(status_data['next'], headers)
2325
+ try:
2326
+ status_data = status_response.json()
2327
+ except Exception:
2328
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2329
+ data.extend(status_data.get('data', []))
2330
+ status_data['data'] = data
2331
+ return V1CrawlStatusResponse(**status_data)
2332
+ else:
2333
+ raise Exception('Crawl job completed but no data was returned')
2334
+ elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
2335
+ poll_interval=max(poll_interval,2)
2336
+ time.sleep(poll_interval) # Wait for the specified interval before checking again
2337
+ else:
2338
+ raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
2339
+ else:
2340
+ self._handle_error(status_response, 'check crawl status')
2341
+
2342
+ def _handle_error(
2343
+ self,
2344
+ response: requests.Response,
2345
+ action: str) -> None:
2346
+ """
2347
+ Handle errors from API responses.
2348
+
2349
+ Args:
2350
+ response (requests.Response): The response object from the API request.
2351
+ action (str): Description of the action that was being performed.
2352
+
2353
+ Raises:
2354
+ Exception: An exception with a message containing the status code and error details from the response.
2355
+ """
2356
+ try:
2357
+ response_json = response.json()
2358
+ error_message = response_json.get('error', 'No error message provided.')
2359
+ error_details = response_json.get('details', 'No additional error details provided.')
2360
+ except Exception:
2361
+ # If we can't parse JSON, provide a helpful error message with response content
2362
+ try:
2363
+ response_text = response.text[:500] # Limit to first 500 chars
2364
+ if response_text.strip():
2365
+ error_message = f"Server returned non-JSON response: {response_text}"
2366
+ error_details = f"Full response status: {response.status_code}"
2367
+ else:
2368
+ error_message = f"Server returned empty response with status {response.status_code}"
2369
+ error_details = "No additional details available"
2370
+ except ValueError:
2371
+ error_message = f"Server returned unreadable response with status {response.status_code}"
2372
+ error_details = "No additional details available"
2373
+
2374
+ message = self._get_error_message(response.status_code, action, error_message, error_details)
2375
+
2376
+ # Raise an HTTPError with the custom message and attach the response
2377
+ raise requests.exceptions.HTTPError(message, response=response)
2378
+
2379
+ def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2380
+ """
2381
+ Generate a standardized error message based on HTTP status code.
2382
+
2383
+ Args:
2384
+ status_code (int): The HTTP status code from the response
2385
+ action (str): Description of the action that was being performed
2386
+ error_message (str): The error message from the API response
2387
+ error_details (str): Additional error details from the API response
2388
+
2389
+ Returns:
2390
+ str: A formatted error message
2391
+ """
2392
+ if status_code == 402:
2393
+ return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
2394
+ elif status_code == 403:
2395
+ return f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
2396
+ elif status_code == 408:
2397
+ return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
2398
+ elif status_code == 409:
2399
+ return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
2400
+ elif status_code == 500:
2401
+ return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
2402
+ else:
2403
+ return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
2404
+
2405
+ def deep_research(
2406
+ self,
2407
+ query: str,
2408
+ *,
2409
+ max_depth: Optional[int] = None,
2410
+ time_limit: Optional[int] = None,
2411
+ max_urls: Optional[int] = None,
2412
+ analysis_prompt: Optional[str] = None,
2413
+ system_prompt: Optional[str] = None,
2414
+ __experimental_stream_steps: Optional[bool] = None,
2415
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
2416
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> V1DeepResearchStatusResponse:
2417
+ """
2418
+ Initiates a deep research operation on a given query and polls until completion.
2419
+
2420
+ Args:
2421
+ query (str): Research query or topic to investigate
2422
+ max_depth (Optional[int]): Maximum depth of research exploration
2423
+ time_limit (Optional[int]): Time limit in seconds for research
2424
+ max_urls (Optional[int]): Maximum number of URLs to process
2425
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2426
+ system_prompt (Optional[str]): Custom system prompt
2427
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2428
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
2429
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
2430
+
2431
+ Returns:
2432
+ DeepResearchStatusResponse containing:
2433
+ * success (bool): Whether research completed successfully
2434
+ * status (str): Current state (processing/completed/failed)
2435
+ * error (Optional[str]): Error message if failed
2436
+ * id (str): Unique identifier for the research job
2437
+ * data (Any): Research findings and analysis
2438
+ * sources (List[Dict]): List of discovered sources
2439
+ * activities (List[Dict]): Research progress log
2440
+ * summaries (List[str]): Generated research summaries
2441
+
2442
+ Raises:
2443
+ Exception: If research fails
2444
+ """
2445
+ research_params = {}
2446
+ if max_depth is not None:
2447
+ research_params['maxDepth'] = max_depth
2448
+ if time_limit is not None:
2449
+ research_params['timeLimit'] = time_limit
2450
+ if max_urls is not None:
2451
+ research_params['maxUrls'] = max_urls
2452
+ if analysis_prompt is not None:
2453
+ research_params['analysisPrompt'] = analysis_prompt
2454
+ if system_prompt is not None:
2455
+ research_params['systemPrompt'] = system_prompt
2456
+ if __experimental_stream_steps is not None:
2457
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2458
+ research_params = V1DeepResearchParams(**research_params)
2459
+
2460
+ response = self.async_deep_research(
2461
+ query,
2462
+ max_depth=max_depth,
2463
+ time_limit=time_limit,
2464
+ max_urls=max_urls,
2465
+ analysis_prompt=analysis_prompt,
2466
+ system_prompt=system_prompt
2467
+ )
2468
+ if not response.get('success') or 'id' not in response:
2469
+ return response
2470
+
2471
+ job_id = response['id']
2472
+ last_activity_count = 0
2473
+ last_source_count = 0
2474
+
2475
+ while True:
2476
+ status = self.check_deep_research_status(job_id)
2477
+
2478
+ if on_activity and 'activities' in status:
2479
+ new_activities = status['activities'][last_activity_count:]
2480
+ for activity in new_activities:
2481
+ on_activity(activity)
2482
+ last_activity_count = len(status['activities'])
2483
+
2484
+ if on_source and 'sources' in status:
2485
+ new_sources = status['sources'][last_source_count:]
2486
+ for source in new_sources:
2487
+ on_source(source)
2488
+ last_source_count = len(status['sources'])
2489
+
2490
+ if status['status'] == 'completed':
2491
+ return status
2492
+ elif status['status'] == 'failed':
2493
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
2494
+ elif status['status'] != 'processing':
2495
+ break
2496
+
2497
+ time.sleep(2) # Polling interval
2498
+
2499
+ return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
2500
+
2501
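A sketch of deep_research with progress callbacks. The status objects handled here are plain dicts, matching how the method itself indexes them (illustrative only; assumes the same `app` instance).

def on_activity(activity):
    print(f"[{activity.get('type')}] {activity.get('message')}")

def on_source(source):
    print("source:", source.get("url"))

result = app.deep_research(
    "What are the latest advances in battery recycling?",
    max_depth=3,
    time_limit=180,
    on_activity=on_activity,
    on_source=on_source,
)
print(result.get("status"), "-", len(result.get("sources", [])), "sources")
print(result.get("data"))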
+ def async_deep_research(
2502
+ self,
2503
+ query: str,
2504
+ *,
2505
+ max_depth: Optional[int] = None,
2506
+ time_limit: Optional[int] = None,
2507
+ max_urls: Optional[int] = None,
2508
+ analysis_prompt: Optional[str] = None,
2509
+ system_prompt: Optional[str] = None,
2510
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
2511
+ """
2512
+ Initiates an asynchronous deep research operation.
2513
+
2514
+ Args:
2515
+ query (str): Research query or topic to investigate
2516
+ max_depth (Optional[int]): Maximum depth of research exploration
2517
+ time_limit (Optional[int]): Time limit in seconds for research
2518
+ max_urls (Optional[int]): Maximum number of URLs to process
2519
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2520
+ system_prompt (Optional[str]): Custom system prompt
2521
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2522
+
2523
+ Returns:
2524
+ Dict[str, Any]: A response containing:
2525
+ * success (bool): Whether the research initiation was successful
2526
+ * id (str): The unique identifier for the research job
2527
+ * error (str, optional): Error message if initiation failed
2528
+
2529
+ Raises:
2530
+ Exception: If the research initiation fails.
2531
+ """
2532
+ research_params = {}
2533
+ if max_depth is not None:
2534
+ research_params['maxDepth'] = max_depth
2535
+ if time_limit is not None:
2536
+ research_params['timeLimit'] = time_limit
2537
+ if max_urls is not None:
2538
+ research_params['maxUrls'] = max_urls
2539
+ if analysis_prompt is not None:
2540
+ research_params['analysisPrompt'] = analysis_prompt
2541
+ if system_prompt is not None:
2542
+ research_params['systemPrompt'] = system_prompt
2543
+ if __experimental_stream_steps is not None:
2544
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2545
+ research_params = V1DeepResearchParams(**research_params)
2546
+
2547
+ headers = self._prepare_headers()
2548
+
2549
+ json_data = {'query': query, **research_params.dict(by_alias=True, exclude_none=True)}
2550
+ json_data['origin'] = f"python-sdk@{version}"
2551
+
2552
+ # Handle json options schema if present
2553
+ if 'jsonOptions' in json_data:
2554
+ json_opts = json_data['jsonOptions']
2555
+ if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
2556
+ json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
2557
+
2558
+ try:
2559
+ response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
2560
+ if response.status_code == 200:
2561
+ try:
2562
+ return response.json()
2563
+ except Exception:
2564
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2565
+ else:
2566
+ self._handle_error(response, 'start deep research')
2567
+ except Exception as e:
2568
+ raise ValueError(str(e))
2569
+
2570
+ return {'success': False, 'error': 'Internal server error'}
2571
+
2572
+ def check_deep_research_status(self, id: str) -> V1DeepResearchStatusResponse:
2573
+ """
2574
+ Check the status of a deep research operation.
2575
+
2576
+ Args:
2577
+ id (str): The ID of the deep research operation.
2578
+
2579
+ Returns:
2580
+ DeepResearchResponse containing:
2581
+
2582
+ Status:
2583
+ * success - Whether research completed successfully
2584
+ * status - Current state (processing/completed/failed)
2585
+ * error - Error message if failed
2586
+
2587
+ Results:
2588
+ * id - Unique identifier for the research job
2589
+ * data - Research findings and analysis
2590
+ * sources - List of discovered sources
2591
+ * activities - Research progress log
2592
+ * summaries - Generated research summaries
2593
+
2594
+ Raises:
2595
+ Exception: If the status check fails.
2596
+ """
2597
+ headers = self._prepare_headers()
2598
+ try:
2599
+ response = self._get_request(f'{self.api_url}/v1/deep-research/{id}', headers)
2600
+ if response.status_code == 200:
2601
+ try:
2602
+ return response.json()
2603
+ except Exception:
2604
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2605
+ elif response.status_code == 404:
2606
+ raise Exception('Deep research job not found')
2607
+ else:
2608
+ self._handle_error(response, 'check deep research status')
2609
+ except Exception as e:
2610
+ raise ValueError(str(e))
2611
+
2612
+ return {'success': False, 'error': 'Internal server error'}
2613
+
2614
+ def _validate_kwargs(self, kwargs: Dict[str, Any], method_name: str) -> None:
2615
+ """
2616
+ Validate additional keyword arguments before they are passed to the API.
2617
+ This provides early validation before the Pydantic model validation.
2618
+
2619
+ Args:
2620
+ kwargs (Dict[str, Any]): Additional keyword arguments to validate
2621
+ method_name (str): Name of the method these kwargs are for
2622
+
2623
+ Raises:
2624
+ ValueError: If kwargs contain invalid or unsupported parameters
2625
+ """
2626
+ if not kwargs:
2627
+ return
2628
+
2629
+ # Known parameter mappings for each method
2630
+ method_params = {
2631
+ "scrape_url": {"formats", "include_tags", "exclude_tags", "only_main_content", "wait_for",
2632
+ "timeout", "location", "mobile", "skip_tls_verification", "remove_base64_images",
2633
+ "block_ads", "proxy", "extract", "json_options", "actions", "change_tracking_options", "max_age", "integration"},
2634
+ "search": {"limit", "tbs", "filter", "lang", "country", "location", "timeout", "scrape_options", "integration"},
2635
+ "crawl_url": {"include_paths", "exclude_paths", "max_depth", "max_discovery_depth", "limit",
2636
+ "allow_backward_links", "allow_external_links", "ignore_sitemap", "scrape_options",
2637
+ "webhook", "deduplicate_similar_urls", "ignore_query_parameters", "regex_on_full_url", "integration"},
2638
+ "map_url": {"search", "ignore_sitemap", "include_subdomains", "sitemap_only", "limit", "timeout", "integration"},
2639
+ "extract": {"prompt", "schema", "system_prompt", "allow_external_links", "enable_web_search", "show_sources", "agent", "integration"},
2640
+ "batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2641
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2642
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2643
+ "actions", "agent", "webhook"},
2644
+ "async_batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2645
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2646
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2647
+ "actions", "agent", "webhook"},
2648
+ "batch_scrape_urls_and_watch": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2649
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2650
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2651
+ "actions", "agent", "webhook"}
2652
+ }
2653
+
2654
+ # Get allowed parameters for this method
2655
+ allowed_params = method_params.get(method_name, set())
2656
+
2657
+ # Check for unknown parameters
2658
+ unknown_params = set(kwargs.keys()) - allowed_params
2659
+ if unknown_params:
2660
+ raise ValueError(f"Unsupported parameter(s) for {method_name}: {', '.join(unknown_params)}. Please refer to the API documentation for the correct parameters.")
2661
+
2662
+ # Additional type validation can be added here if needed
2663
+ # For now, we rely on Pydantic models for detailed type validation
2664
+
2665
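The kwargs check above is a plain set-difference against a per-method allow-list, run before any Pydantic validation. A minimal standalone sketch of the same pattern (the allow-list contents here are illustrative, not the SDK's real table):

```python
from typing import Any, Dict, Set

# Illustrative allow-list; the real table lives in _validate_kwargs above.
ALLOWED_PARAMS: Dict[str, Set[str]] = {"scrape_url": {"formats", "timeout", "proxy"}}

def validate_kwargs(kwargs: Dict[str, Any], method_name: str) -> None:
    # Reject anything not explicitly whitelisted for this method.
    unknown = set(kwargs) - ALLOWED_PARAMS.get(method_name, set())
    if unknown:
        raise ValueError(f"Unsupported parameter(s) for {method_name}: {', '.join(sorted(unknown))}")

validate_kwargs({"formats": ["markdown"]}, "scrape_url")   # passes
# validate_kwargs({"fromat": "markdown"}, "scrape_url")    # would raise ValueError
```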
+ def _ensure_schema_dict(self, schema):
2666
+ """
2667
+ Utility to ensure a schema is a dict, not a Pydantic model class. Recursively checks dicts and lists.
2668
+ """
2669
+ if schema is None:
2670
+ return schema
2671
+ if isinstance(schema, type):
2672
+ # Pydantic v1/v2 model class
2673
+ if hasattr(schema, 'model_json_schema'):
2674
+ return schema.model_json_schema()
2675
+ elif hasattr(schema, 'schema'):
2676
+ return schema.schema()
2677
+ if isinstance(schema, dict):
2678
+ return {k: self._ensure_schema_dict(v) for k, v in schema.items()}
2679
+ if isinstance(schema, (list, tuple)):
2680
+ return [self._ensure_schema_dict(v) for v in schema]
2681
+ return schema
2682
+
2683
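_ensure_schema_dict accepts either a plain dict or a Pydantic model class and normalizes it to a JSON-schema dict, recursing into nested dicts and lists. A small sketch of what callers can rely on, assuming Pydantic v2 is installed:

```python
from pydantic import BaseModel

class Article(BaseModel):
    title: str
    author: str

# Pydantic v2 classes expose model_json_schema(); v1 classes expose schema().
# _ensure_schema_dict calls whichever is available, so a model class can be
# passed anywhere a schema dict is expected.
schema_dict = Article.model_json_schema()
print(sorted(schema_dict["properties"]))  # ['author', 'title']
```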
+ class V1CrawlWatcher:
2684
+ """
2685
+ A class to watch and handle crawl job events via WebSocket connection.
2686
+
2687
+ Attributes:
2688
+ id (str): The ID of the crawl job to watch
2689
+ app (V1FirecrawlApp): The V1FirecrawlApp instance
2690
+ data (List[Dict[str, Any]]): List of crawled documents/data
2691
+ status (str): Current status of the crawl job
2692
+ ws_url (str): WebSocket URL for the crawl job
2693
+ event_handlers (dict): Dictionary of event type to list of handler functions
2694
+ """
2695
+ def __init__(self, id: str, app: V1FirecrawlApp):
2696
+ self.id = id
2697
+ self.app = app
2698
+ self.data: List[Dict[str, Any]] = []
2699
+ self.status = "scraping"
2700
+ self.ws_url = f"{app.api_url.replace('http', 'ws')}/v1/crawl/{id}"
2701
+ self.event_handlers = {
2702
+ 'done': [],
2703
+ 'error': [],
2704
+ 'document': []
2705
+ }
2706
+
2707
+ async def connect(self) -> None:
2708
+ """
2709
+ Establishes WebSocket connection and starts listening for messages.
2710
+ """
2711
+ async with websockets.connect(
2712
+ self.ws_url,
2713
+ max_size=None,
2714
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
2715
+ ) as websocket:
2716
+ await self._listen(websocket)
2717
+
2718
+ async def _listen(self, websocket) -> None:
2719
+ """
2720
+ Listens for incoming WebSocket messages and handles them.
2721
+
2722
+ Args:
2723
+ websocket: The WebSocket connection object
2724
+ """
2725
+ async for message in websocket:
2726
+ msg = json.loads(message)
2727
+ await self._handle_message(msg)
2728
+
2729
+ def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
2730
+ """
2731
+ Adds an event handler function for a specific event type.
2732
+
2733
+ Args:
2734
+ event_type (str): Type of event to listen for ('done', 'error', or 'document')
2735
+ handler (Callable): Function to handle the event
2736
+ """
2737
+ if event_type in self.event_handlers:
2738
+ self.event_handlers[event_type].append(handler)
2739
+
2740
+ def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
2741
+ """
2742
+ Dispatches an event to all registered handlers for that event type.
2743
+
2744
+ Args:
2745
+ event_type (str): Type of event to dispatch
2746
+ detail (Dict[str, Any]): Event details/data to pass to handlers
2747
+ """
2748
+ if event_type in self.event_handlers:
2749
+ for handler in self.event_handlers[event_type]:
2750
+ handler(detail)
2751
+
2752
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
2753
+ """
2754
+ Handles incoming WebSocket messages based on their type.
2755
+
2756
+ Args:
2757
+ msg (Dict[str, Any]): The message to handle
2758
+ """
2759
+ if msg['type'] == 'done':
2760
+ self.status = 'completed'
2761
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
2762
+ elif msg['type'] == 'error':
2763
+ self.status = 'failed'
2764
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
2765
+ elif msg['type'] == 'catchup':
2766
+ self.status = msg['data']['status']
2767
+ self.data.extend(msg['data'].get('data', []))
2768
+ for doc in self.data:
2769
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
2770
+ elif msg['type'] == 'document':
2771
+ self.data.append(msg['data'])
2772
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
2773
+
2774
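A minimal sketch of driving V1CrawlWatcher directly, assuming the class and V1FirecrawlApp are importable from the firecrawl package (the exact import path may differ) and that a crawl job with the given id is already running:

```python
import asyncio
from firecrawl import V1FirecrawlApp, V1CrawlWatcher  # import path assumed

app = V1FirecrawlApp(api_key="fc-YOUR-KEY")
watcher = V1CrawlWatcher(id="existing-crawl-job-id", app=app)

# Handlers receive the dicts built by dispatch_event()/_handle_message() above.
watcher.add_event_listener("document", lambda d: print("document keys:", list(d["data"].keys())))
watcher.add_event_listener("done", lambda d: print("finished with", len(d["data"]), "documents"))
watcher.add_event_listener("error", lambda d: print("crawl failed:", d["error"]))

asyncio.run(watcher.connect())  # listens until the WebSocket closes
```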
+ class AsyncV1FirecrawlApp(V1FirecrawlApp):
2775
+ """
2776
+ Asynchronous version of V1FirecrawlApp that implements async methods using aiohttp.
2777
+ Provides non-blocking alternatives to all V1FirecrawlApp operations.
2778
+ """
2779
+
2780
+ def __init__(self, api_key: str, api_url: str = "https://api.firecrawl.dev"):
2781
+ # Reuse V1 helpers (_prepare_headers, _validate_kwargs, _ensure_schema_dict, _get_error_message)
2782
+ super().__init__(api_key=api_key, api_url=api_url)
2783
+
2784
+ async def _async_request(
2785
+ self,
2786
+ method: str,
2787
+ url: str,
2788
+ headers: Dict[str, str],
2789
+ data: Optional[Dict[str, Any]] = None,
2790
+ retries: int = 3,
2791
+ backoff_factor: float = 0.5) -> Dict[str, Any]:
2792
+ """
2793
+ Generic async request method with exponential backoff retry logic.
2794
+
2795
+ Args:
2796
+ method (str): The HTTP method to use (e.g., "GET" or "POST").
2797
+ url (str): The URL to send the request to.
2798
+ headers (Dict[str, str]): Headers to include in the request.
2799
+ data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
2800
+ retries (int): Maximum number of retry attempts (default: 3).
2801
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2802
+ Delay will be backoff_factor * (2 ** retry_count).
2803
+
2804
+ Returns:
2805
+ Dict[str, Any]: The parsed JSON response from the server.
2806
+
2807
+ Raises:
2808
+ aiohttp.ClientError: If the request fails after all retries.
2809
+ Exception: If max retries are exceeded or other errors occur.
2810
+ """
2811
+ async with aiohttp.ClientSession() as session:
2812
+ for attempt in range(retries):
2813
+ try:
2814
+ async with session.request(
2815
+ method=method, url=url, headers=headers, json=data
2816
+ ) as response:
2817
+ if response.status == 502:
2818
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2819
+ continue
2820
+ if response.status >= 300:
2821
+ await self._handle_error(response, f"make {method} request")
2822
+ return await response.json()
2823
+ except aiohttp.ClientError as e:
2824
+ if attempt == retries - 1:
2825
+ raise e
2826
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2827
+ raise Exception("Max retries exceeded")
2828
+
2829
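With the defaults (retries=3, backoff_factor=0.5), the helper above retries only on HTTP 502 responses or aiohttp.ClientError, sleeping backoff_factor * 2**attempt between tries, i.e. roughly 0.5 s, 1 s, then 2 s:

```python
# Delay schedule produced by the default retry settings above.
backoff_factor, retries = 0.5, 3
for attempt in range(retries):
    print(f"after attempt {attempt}: sleep {backoff_factor * (2 ** attempt)}s")
# after attempt 0: sleep 0.5s
# after attempt 1: sleep 1.0s
# after attempt 2: sleep 2.0s
```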
+ async def _async_post_request(
2830
+ self, url: str, data: Dict[str, Any], headers: Dict[str, str],
2831
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2832
+ """
2833
+ Make an async POST request with exponential backoff retry logic.
2834
+
2835
+ Args:
2836
+ url (str): The URL to send the POST request to.
2837
+ data (Dict[str, Any]): The JSON data to include in the request body.
2838
+ headers (Dict[str, str]): Headers to include in the request.
2839
+ retries (int): Maximum number of retry attempts (default: 3).
2840
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2841
+ Delay will be backoff_factor * (2 ** retry_count).
2842
+
2843
+ Returns:
2844
+ Dict[str, Any]: The parsed JSON response from the server.
2845
+
2846
+ Raises:
2847
+ aiohttp.ClientError: If the request fails after all retries.
2848
+ Exception: If max retries are exceeded or other errors occur.
2849
+ """
2850
+ return await self._async_request("POST", url, headers, data, retries, backoff_factor)
2851
+
2852
+ async def _async_get_request(
2853
+ self, url: str, headers: Dict[str, str],
2854
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2855
+ """
2856
+ Make an async GET request with exponential backoff retry logic.
2857
+
2858
+ Args:
2859
+ url (str): The URL to send the GET request to.
2860
+ headers (Dict[str, str]): Headers to include in the request.
2861
+ retries (int): Maximum number of retry attempts (default: 3).
2862
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2863
+ Delay will be backoff_factor * (2 ** retry_count).
2864
+
2865
+ Returns:
2866
+ Dict[str, Any]: The parsed JSON response from the server.
2867
+
2868
+ Raises:
2869
+ aiohttp.ClientError: If the request fails after all retries.
2870
+ Exception: If max retries are exceeded or other errors occur.
2871
+ """
2872
+ return await self._async_request("GET", url, headers, None, retries, backoff_factor)
2873
+
2874
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
2875
+ """
2876
+ Handle errors from async API responses with detailed error messages.
2877
+
2878
+ Args:
2879
+ response (aiohttp.ClientResponse): The response object from the failed request
2880
+ action (str): Description of the action that was being attempted
2881
+
2882
+ Raises:
2883
+ aiohttp.ClientError: With a detailed error message based on the response status:
2884
+ - 402: Payment Required
2885
+ - 408: Request Timeout
2886
+ - 409: Conflict
2887
+ - 500: Internal Server Error
2888
+ - Other: Unexpected error with status code
2889
+ """
2890
+ try:
2891
+ error_data = await response.json()
2892
+ error_message = error_data.get('error', 'No error message provided.')
2893
+ error_details = error_data.get('details', 'No additional error details provided.')
2894
+ except:
2895
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
2896
+
2897
+ message = await self._get_async_error_message(response.status, action, error_message, error_details)
2898
+
2899
+ raise aiohttp.ClientError(message)
2900
+
2901
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2902
+ """
2903
+ Generate a standardized error message based on HTTP status code for async operations.
2904
+
2905
+ Args:
2906
+ status_code (int): The HTTP status code from the response
2907
+ action (str): Description of the action that was being performed
2908
+ error_message (str): The error message from the API response
2909
+ error_details (str): Additional error details from the API response
2910
+
2911
+ Returns:
2912
+ str: A formatted error message
2913
+ """
2914
+ return self._get_error_message(status_code, action, error_message, error_details)
2915
+
2916
+ async def crawl_url_and_watch(
2917
+ self,
2918
+ url: str,
2919
+ params: Optional[V1CrawlParams] = None,
2920
+ idempotency_key: Optional[str] = None) -> 'AsyncV1CrawlWatcher':
2921
+ """
2922
+ Initiate an async crawl job and return an AsyncV1CrawlWatcher to monitor progress via WebSocket.
2923
+
2924
+ Args:
2925
+ url (str): Target URL to start crawling from
2926
+ params (Optional[V1CrawlParams]): See V1CrawlParams model for configuration:
2927
+ URL Discovery:
2928
+ * includePaths - Patterns of URLs to include
2929
+ * excludePaths - Patterns of URLs to exclude
2930
+ * maxDepth - Maximum crawl depth
2931
+ * maxDiscoveryDepth - Maximum depth for finding new URLs
2932
+ * limit - Maximum pages to crawl
2933
+
2934
+ Link Following:
2935
+ * allowBackwardLinks - DEPRECATED: Use crawlEntireDomain instead
2936
+ * crawlEntireDomain - Follow parent directory links
2937
+ * allowExternalLinks - Follow external domain links
2938
+ * ignoreSitemap - Skip sitemap.xml processing
2939
+
2940
+ Advanced:
2941
+ * scrapeOptions - Page scraping configuration
2942
+ * webhook - Notification webhook settings
2943
+ * deduplicateSimilarURLs - Remove similar URLs
2944
+ * ignoreQueryParameters - Ignore URL parameters
2945
+ * regexOnFullURL - Apply regex to full URLs
2946
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2947
+
2948
+ Returns:
2949
+ AsyncV1CrawlWatcher: An instance to monitor the crawl job via WebSocket
2950
+
2951
+ Raises:
2952
+ Exception: If crawl job fails to start
2953
+ """
2954
+         crawl_response = await self.async_crawl_url(url, idempotency_key=idempotency_key, **(params.dict(exclude_none=True) if params else {}))
2955
+ if crawl_response.get('success') and 'id' in crawl_response:
2956
+ return AsyncV1CrawlWatcher(crawl_response['id'], self)
2957
+ else:
2958
+ raise Exception("Crawl job failed to start")
2959
+
2960
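A hedged usage sketch for crawl_url_and_watch, assuming AsyncV1FirecrawlApp is importable from the firecrawl package and that AsyncV1CrawlWatcher (defined elsewhere in this module) mirrors V1CrawlWatcher's add_event_listener()/connect() interface:

```python
import asyncio
from firecrawl import AsyncV1FirecrawlApp  # import path assumed

async def main() -> None:
    app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")
    watcher = await app.crawl_url_and_watch("https://example.com")
    # Event names match the watcher's handler table: 'document', 'done', 'error'.
    watcher.add_event_listener("document", lambda d: print("got a document for job", d["id"]))
    watcher.add_event_listener("done", lambda d: print("crawl finished with status", d["status"]))
    await watcher.connect()

asyncio.run(main())
```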
+ async def batch_scrape_urls_and_watch(
2961
+ self,
2962
+ urls: List[str],
2963
+ params: Optional[V1ScrapeParams] = None,
2964
+ idempotency_key: Optional[str] = None) -> 'AsyncV1CrawlWatcher':
2965
+ """
2966
+ Initiate an async batch scrape job and return an AsyncV1CrawlWatcher to monitor progress.
2967
+
2968
+ Args:
2969
+ urls (List[str]): List of URLs to scrape
2970
+ params (Optional[V1ScrapeParams]): See V1ScrapeParams model for configuration:
2971
+
2972
+ Content Options:
2973
+ * formats - Content formats to retrieve
2974
+ * includeTags - HTML tags to include
2975
+ * excludeTags - HTML tags to exclude
2976
+ * onlyMainContent - Extract main content only
2977
+
2978
+ Request Options:
2979
+ * headers - Custom HTTP headers
2980
+ * timeout - Request timeout (ms)
2981
+ * mobile - Use mobile user agent
2982
+ * proxy - Proxy type
2983
+
2984
+ Extraction Options:
2985
+ * extract - Content extraction config
2986
+ * jsonOptions - JSON extraction config
2987
+ * actions - Actions to perform
2988
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2989
+
2990
+ Returns:
2991
+ AsyncV1CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
2992
+
2993
+ Raises:
2994
+ Exception: If batch scrape job fails to start
2995
+ """
2996
+         batch_response = await self.async_batch_scrape_urls(urls, idempotency_key=idempotency_key, **(params.dict(exclude_none=True) if params else {}))
2997
+ if batch_response.get('success') and 'id' in batch_response:
2998
+ return AsyncV1CrawlWatcher(batch_response['id'], self)
2999
+ else:
3000
+ raise Exception("Batch scrape job failed to start")
3001
+
3002
+ async def scrape_url(
3003
+ self,
3004
+ url: str,
3005
+ *,
3006
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
3007
+ headers: Optional[Dict[str, str]] = None,
3008
+ include_tags: Optional[List[str]] = None,
3009
+ exclude_tags: Optional[List[str]] = None,
3010
+ only_main_content: Optional[bool] = None,
3011
+ wait_for: Optional[int] = None,
3012
+ timeout: Optional[int] = 30000,
3013
+ location: Optional[V1LocationConfig] = None,
3014
+ mobile: Optional[bool] = None,
3015
+ skip_tls_verification: Optional[bool] = None,
3016
+ remove_base64_images: Optional[bool] = None,
3017
+ block_ads: Optional[bool] = None,
3018
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
3019
+ parse_pdf: Optional[bool] = None,
3020
+ extract: Optional[V1JsonConfig] = None,
3021
+ json_options: Optional[V1JsonConfig] = None,
3022
+ actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
3023
+ **kwargs) -> V1ScrapeResponse[Any]:
3024
+ """
3025
+ Scrape a single URL asynchronously.
3026
+
3027
+ Args:
3028
+ url (str): Target URL to scrape
3029
+             formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]]): Content types to retrieve (markdown/html/etc)
3030
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
3031
+ include_tags (Optional[List[str]]): HTML tags to include
3032
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
3033
+ only_main_content (Optional[bool]): Extract main content only
3034
+             wait_for (Optional[int]): Time in milliseconds to wait for the page before scraping
3035
+ timeout (Optional[int]): Request timeout (ms)
3036
+ location (Optional[V1LocationConfig]): Location configuration
3037
+ mobile (Optional[bool]): Use mobile user agent
3038
+ skip_tls_verification (Optional[bool]): Skip TLS verification
3039
+ remove_base64_images (Optional[bool]): Remove base64 images
3040
+ block_ads (Optional[bool]): Block ads
3041
+             proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth/auto)
3042
+ extract (Optional[V1JsonConfig]): Content extraction settings
3043
+ json_options (Optional[V1JsonConfig]): JSON extraction settings
3044
+ actions (Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]]): Actions to perform
3045
+ **kwargs: Additional parameters to pass to the API
3046
+
3047
+ Returns:
3048
+ V1ScrapeResponse with:
3049
+ * success - Whether scrape was successful
3050
+ * markdown - Markdown content if requested
3051
+ * html - HTML content if requested
3052
+ * rawHtml - Raw HTML content if requested
3053
+ * links - Extracted links if requested
3054
+ * screenshot - Screenshot if requested
3055
+ * extract - Extracted data if requested
3056
+ * json - JSON data if requested
3057
+ * error - Error message if scrape failed
3058
+
3059
+ Raises:
3060
+ Exception: If scraping fails
3061
+ """
3062
+ # Validate any additional kwargs
3063
+ self._validate_kwargs(kwargs, "scrape_url")
3064
+
3065
+ _headers = self._prepare_headers()
3066
+
3067
+ # Build scrape parameters
3068
+ scrape_params = {
3069
+ 'url': url,
3070
+ 'origin': f"python-sdk@{version}"
3071
+ }
3072
+
3073
+ # Add optional parameters if provided and not None
3074
+ if formats:
3075
+ scrape_params['formats'] = formats
3076
+ if headers:
3077
+ scrape_params['headers'] = headers
3078
+ if include_tags:
3079
+ scrape_params['includeTags'] = include_tags
3080
+ if exclude_tags:
3081
+ scrape_params['excludeTags'] = exclude_tags
3082
+ if only_main_content is not None:
3083
+ scrape_params['onlyMainContent'] = only_main_content
3084
+ if wait_for:
3085
+ scrape_params['waitFor'] = wait_for
3086
+ if timeout:
3087
+ scrape_params['timeout'] = timeout
3088
+ if location:
3089
+ scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
3090
+ if mobile is not None:
3091
+ scrape_params['mobile'] = mobile
3092
+ if skip_tls_verification is not None:
3093
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3094
+ if remove_base64_images is not None:
3095
+ scrape_params['removeBase64Images'] = remove_base64_images
3096
+ if block_ads is not None:
3097
+ scrape_params['blockAds'] = block_ads
3098
+ if proxy:
3099
+ scrape_params['proxy'] = proxy
3100
+ if parse_pdf is not None:
3101
+ scrape_params['parsePDF'] = parse_pdf
3102
+ if extract is not None:
3103
+ extract = self._ensure_schema_dict(extract)
3104
+ if isinstance(extract, dict) and "schema" in extract:
3105
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
3106
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
3107
+ if json_options is not None:
3108
+ json_options = self._ensure_schema_dict(json_options)
3109
+ if isinstance(json_options, dict) and "schema" in json_options:
3110
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3111
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
3112
+ if actions:
3113
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(by_alias=True, exclude_none=True) for action in actions]
3114
+ if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
3115
+ scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
3116
+ if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
3117
+ scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
3118
+
3119
+ # Make async request
3120
+ endpoint = f'/v1/scrape'
3121
+ response = await self._async_post_request(
3122
+ f'{self.api_url}{endpoint}',
3123
+ scrape_params,
3124
+ _headers
3125
+ )
3126
+
3127
+ if response.get('success') and 'data' in response:
3128
+ return V1ScrapeResponse(**response['data'])
3129
+ elif "error" in response:
3130
+ raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
3131
+ else:
3132
+ # Use the response content directly if possible, otherwise a generic message
3133
+ error_content = response.get('error', str(response))
3134
+ raise Exception(f'Failed to scrape URL. Error: {error_content}')
3135
+
3136
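A minimal scrape_url sketch, assuming AsyncV1FirecrawlApp is importable from the firecrawl package and that V1ScrapeResponse exposes the fields listed in the docstring (markdown, links, ...) as attributes:

```python
import asyncio
from firecrawl import AsyncV1FirecrawlApp  # import path assumed

async def main() -> None:
    app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")
    doc = await app.scrape_url(
        "https://example.com",
        formats=["markdown", "links"],   # request markdown plus extracted links
        only_main_content=True,          # trim navigation/boilerplate
        timeout=30000,                   # milliseconds, matching the signature default
    )
    print((doc.markdown or "")[:200])
    print("links found:", len(doc.links or []))

asyncio.run(main())
```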
+ async def batch_scrape_urls(
3137
+ self,
3138
+ urls: List[str],
3139
+ *,
3140
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3141
+ headers: Optional[Dict[str, str]] = None,
3142
+ include_tags: Optional[List[str]] = None,
3143
+ exclude_tags: Optional[List[str]] = None,
3144
+ only_main_content: Optional[bool] = None,
3145
+ wait_for: Optional[int] = None,
3146
+ timeout: Optional[int] = 30000,
3147
+ location: Optional[V1LocationConfig] = None,
3148
+ mobile: Optional[bool] = None,
3149
+ skip_tls_verification: Optional[bool] = None,
3150
+ remove_base64_images: Optional[bool] = None,
3151
+ block_ads: Optional[bool] = None,
3152
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
3153
+ extract: Optional[V1JsonConfig] = None,
3154
+ json_options: Optional[V1JsonConfig] = None,
3155
+ actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
3156
+ agent: Optional[V1AgentOptions] = None,
3157
+ poll_interval: Optional[int] = 2,
3158
+ idempotency_key: Optional[str] = None,
3159
+ **kwargs
3160
+ ) -> V1BatchScrapeStatusResponse:
3161
+ """
3162
+ Asynchronously scrape multiple URLs and monitor until completion.
3163
+
3164
+ Args:
3165
+ urls (List[str]): URLs to scrape
3166
+ formats (Optional[List[Literal]]): Content formats to retrieve
3167
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
3168
+ include_tags (Optional[List[str]]): HTML tags to include
3169
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
3170
+ only_main_content (Optional[bool]): Extract main content only
3171
+ wait_for (Optional[int]): Wait time in milliseconds
3172
+ timeout (Optional[int]): Request timeout in milliseconds
3173
+ location (Optional[LocationConfig]): Location configuration
3174
+ mobile (Optional[bool]): Use mobile user agent
3175
+ skip_tls_verification (Optional[bool]): Skip TLS verification
3176
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
3177
+ block_ads (Optional[bool]): Block advertisements
3178
+ proxy (Optional[Literal]): Proxy type to use
3179
+ extract (Optional[JsonConfig]): Content extraction config
3180
+ json_options (Optional[JsonConfig]): JSON extraction config
3181
+ actions (Optional[List[Union]]): Actions to perform
3182
+ agent (Optional[AgentOptions]): Agent configuration
3183
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
3184
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3185
+ **kwargs: Additional parameters to pass to the API
3186
+
3187
+ Returns:
3188
+ V1BatchScrapeStatusResponse with:
3189
+ * Scraping status and progress
3190
+ * Scraped content for each URL
3191
+ * Success/error information
3192
+
3193
+ Raises:
3194
+ Exception: If batch scrape fails
3195
+ """
3196
+ # Validate any additional kwargs
3197
+ self._validate_kwargs(kwargs, "batch_scrape_urls")
3198
+
3199
+ scrape_params = {}
3200
+
3201
+ # Add individual parameters
3202
+ if formats is not None:
3203
+ scrape_params['formats'] = formats
3204
+ if headers is not None:
3205
+ scrape_params['headers'] = headers
3206
+ if include_tags is not None:
3207
+ scrape_params['includeTags'] = include_tags
3208
+ if exclude_tags is not None:
3209
+ scrape_params['excludeTags'] = exclude_tags
3210
+ if only_main_content is not None:
3211
+ scrape_params['onlyMainContent'] = only_main_content
3212
+ if wait_for is not None:
3213
+ scrape_params['waitFor'] = wait_for
3214
+ if timeout is not None:
3215
+ scrape_params['timeout'] = timeout
3216
+ if location is not None:
3217
+ scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
3218
+ if mobile is not None:
3219
+ scrape_params['mobile'] = mobile
3220
+ if skip_tls_verification is not None:
3221
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3222
+ if remove_base64_images is not None:
3223
+ scrape_params['removeBase64Images'] = remove_base64_images
3224
+ if block_ads is not None:
3225
+ scrape_params['blockAds'] = block_ads
3226
+ if proxy is not None:
3227
+ scrape_params['proxy'] = proxy
3228
+ if extract is not None:
3229
+ extract = self._ensure_schema_dict(extract)
3230
+ if isinstance(extract, dict) and "schema" in extract:
3231
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
3232
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
3233
+ if json_options is not None:
3234
+ json_options = self._ensure_schema_dict(json_options)
3235
+ if isinstance(json_options, dict) and "schema" in json_options:
3236
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3237
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
3238
+ if actions is not None:
3239
+ scrape_params['actions'] = [action.dict(by_alias=True, exclude_none=True) for action in actions]
3240
+ if agent is not None:
3241
+ scrape_params['agent'] = agent.dict(by_alias=True, exclude_none=True)
3242
+
3243
+ # Add any additional kwargs
3244
+ scrape_params.update(kwargs)
3245
+
3246
+ # Create final params object
3247
+ final_params = V1ScrapeParams(**scrape_params)
3248
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
3249
+ params_dict['urls'] = urls
3250
+ params_dict['origin'] = f"python-sdk@{version}"
3251
+
3252
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3253
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3254
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3255
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3256
+
3257
+ # Make request
3258
+ headers = self._prepare_headers(idempotency_key)
3259
+ response = await self._async_post_request(
3260
+ f'{self.api_url}/v1/batch/scrape',
3261
+ params_dict,
3262
+ headers
3263
+ )
3264
+
3265
+ if response.get('success'):
3266
+ try:
3267
+ id = response.get('id')
3268
+ except:
3269
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3270
+ return await self._async_monitor_job_status(id, headers, poll_interval)
3271
+ else:
3272
+             await self._handle_error(response, 'start batch scrape job')
3273
+
3274
+
3275
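batch_scrape_urls starts the job and then waits (asynchronously) in _async_monitor_job_status until every URL is done. A usage sketch under the same import-path assumption as above, assuming the returned status model exposes the status/completed/total fields its docstring lists:

```python
import asyncio
from firecrawl import AsyncV1FirecrawlApp  # import path assumed

async def main() -> None:
    app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")
    job = await app.batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
        poll_interval=5,   # seconds between status checks while waiting
    )
    print(job.status, f"{job.completed}/{job.total} pages scraped")

asyncio.run(main())
```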
+ async def async_batch_scrape_urls(
3276
+ self,
3277
+ urls: List[str],
3278
+ *,
3279
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3280
+ headers: Optional[Dict[str, str]] = None,
3281
+ include_tags: Optional[List[str]] = None,
3282
+ exclude_tags: Optional[List[str]] = None,
3283
+ only_main_content: Optional[bool] = None,
3284
+ wait_for: Optional[int] = None,
3285
+ timeout: Optional[int] = 30000,
3286
+ location: Optional[V1LocationConfig] = None,
3287
+ mobile: Optional[bool] = None,
3288
+ skip_tls_verification: Optional[bool] = None,
3289
+ remove_base64_images: Optional[bool] = None,
3290
+ block_ads: Optional[bool] = None,
3291
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
3292
+ extract: Optional[V1JsonConfig] = None,
3293
+ json_options: Optional[V1JsonConfig] = None,
3294
+ actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
3295
+ agent: Optional[V1AgentOptions] = None,
3296
+ zero_data_retention: Optional[bool] = None,
3297
+ idempotency_key: Optional[str] = None,
3298
+ **kwargs
3299
+ ) -> V1BatchScrapeResponse:
3300
+ """
3301
+ Initiate a batch scrape job asynchronously.
3302
+
3303
+ Args:
3304
+ urls (List[str]): URLs to scrape
3305
+ formats (Optional[List[Literal]]): Content formats to retrieve
3306
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
3307
+ include_tags (Optional[List[str]]): HTML tags to include
3308
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
3309
+ only_main_content (Optional[bool]): Extract main content only
3310
+ wait_for (Optional[int]): Wait time in milliseconds
3311
+ timeout (Optional[int]): Request timeout in milliseconds
3312
+ location (Optional[LocationConfig]): Location configuration
3313
+ mobile (Optional[bool]): Use mobile user agent
3314
+ skip_tls_verification (Optional[bool]): Skip TLS verification
3315
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
3316
+ block_ads (Optional[bool]): Block advertisements
3317
+ proxy (Optional[Literal]): Proxy type to use
3318
+ extract (Optional[JsonConfig]): Content extraction config
3319
+ json_options (Optional[JsonConfig]): JSON extraction config
3320
+ actions (Optional[List[Union]]): Actions to perform
3321
+ agent (Optional[AgentOptions]): Agent configuration
3322
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
3323
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3324
+ **kwargs: Additional parameters to pass to the API
3325
+
3326
+ Returns:
3327
+ V1BatchScrapeResponse with:
3328
+ * success - Whether job started successfully
3329
+ * id - Unique identifier for the job
3330
+ * url - Status check URL
3331
+ * error - Error message if start failed
3332
+
3333
+ Raises:
3334
+ Exception: If job initiation fails
3335
+ """
3336
+ # Validate any additional kwargs
3337
+ self._validate_kwargs(kwargs, "async_batch_scrape_urls")
3338
+
3339
+ scrape_params = {}
3340
+
3341
+ # Add individual parameters
3342
+ if formats is not None:
3343
+ scrape_params['formats'] = formats
3344
+ if headers is not None:
3345
+ scrape_params['headers'] = headers
3346
+ if include_tags is not None:
3347
+ scrape_params['includeTags'] = include_tags
3348
+ if exclude_tags is not None:
3349
+ scrape_params['excludeTags'] = exclude_tags
3350
+ if only_main_content is not None:
3351
+ scrape_params['onlyMainContent'] = only_main_content
3352
+ if wait_for is not None:
3353
+ scrape_params['waitFor'] = wait_for
3354
+ if timeout is not None:
3355
+ scrape_params['timeout'] = timeout
3356
+ if location is not None:
3357
+ scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
3358
+ if mobile is not None:
3359
+ scrape_params['mobile'] = mobile
3360
+ if skip_tls_verification is not None:
3361
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3362
+ if remove_base64_images is not None:
3363
+ scrape_params['removeBase64Images'] = remove_base64_images
3364
+ if block_ads is not None:
3365
+ scrape_params['blockAds'] = block_ads
3366
+ if proxy is not None:
3367
+ scrape_params['proxy'] = proxy
3368
+ if extract is not None:
3369
+ extract = self._ensure_schema_dict(extract)
3370
+ if isinstance(extract, dict) and "schema" in extract:
3371
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
3372
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
3373
+ if json_options is not None:
3374
+ json_options = self._ensure_schema_dict(json_options)
3375
+ if isinstance(json_options, dict) and "schema" in json_options:
3376
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3377
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
3378
+ if actions:
3379
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(by_alias=True, exclude_none=True) for action in actions]
3380
+ if agent is not None:
3381
+ scrape_params['agent'] = agent.dict(by_alias=True, exclude_none=True)
3382
+ if zero_data_retention is not None:
3383
+ scrape_params['zeroDataRetention'] = zero_data_retention
3384
+
3385
+ # Add any additional kwargs
3386
+ scrape_params.update(kwargs)
3387
+
3388
+ # Create final params object
3389
+ final_params = V1ScrapeParams(**scrape_params)
3390
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
3391
+ params_dict['urls'] = urls
3392
+ params_dict['origin'] = f"python-sdk@{version}"
3393
+
3394
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3395
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3396
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3397
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3398
+
3399
+ # Make request
3400
+ headers = self._prepare_headers(idempotency_key)
3401
+ response = await self._async_post_request(
3402
+ f'{self.api_url}/v1/batch/scrape',
3403
+ params_dict,
3404
+ headers
3405
+ )
3406
+
3407
+         if response.get('success'):
3408
+ try:
3409
+                 return V1BatchScrapeResponse(**response)
3410
+ except:
3411
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3412
+ else:
3413
+ await self._handle_error(response, 'start batch scrape job')
3414
+
3415
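async_batch_scrape_urls is the fire-and-forget variant: it returns the job id immediately, and check_batch_scrape_status (defined further down) can be called later with that id. A sketch, again assuming the firecrawl import path and the attribute names given in the response docstrings:

```python
import asyncio
from firecrawl import AsyncV1FirecrawlApp  # import path assumed

async def main() -> None:
    app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")
    started = await app.async_batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
    )
    print("batch scrape job id:", started.id)

    # Later (or from another task/process), poll the job by id.
    status = await app.check_batch_scrape_status(started.id)
    print(status.status, f"{status.completed}/{status.total}")

asyncio.run(main())
```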
+ async def crawl_url(
3416
+ self,
3417
+ url: str,
3418
+ *,
3419
+ include_paths: Optional[List[str]] = None,
3420
+ exclude_paths: Optional[List[str]] = None,
3421
+ max_depth: Optional[int] = None,
3422
+ max_discovery_depth: Optional[int] = None,
3423
+ limit: Optional[int] = None,
3424
+ allow_backward_links: Optional[bool] = None,
3425
+ crawl_entire_domain: Optional[bool] = None,
3426
+ allow_external_links: Optional[bool] = None,
3427
+ ignore_sitemap: Optional[bool] = None,
3428
+ scrape_options: Optional[V1ScrapeOptions] = None,
3429
+ webhook: Optional[Union[str, V1WebhookConfig]] = None,
3430
+ deduplicate_similar_urls: Optional[bool] = None,
3431
+ ignore_query_parameters: Optional[bool] = None,
3432
+ regex_on_full_url: Optional[bool] = None,
3433
+ delay: Optional[int] = None,
3434
+ allow_subdomains: Optional[bool] = None,
3435
+ poll_interval: Optional[int] = 2,
3436
+ idempotency_key: Optional[str] = None,
3437
+ **kwargs
3438
+ ) -> V1CrawlStatusResponse:
3439
+ """
3440
+ Crawl a website starting from a URL.
3441
+
3442
+ Args:
3443
+ url (str): Target URL to start crawling from
3444
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3445
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3446
+ max_depth (Optional[int]): Maximum crawl depth
3447
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3448
+ limit (Optional[int]): Maximum pages to crawl
3449
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
3450
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
3451
+ allow_external_links (Optional[bool]): Follow external domain links
3452
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3453
+ scrape_options (Optional[V1ScrapeOptions]): Page scraping configuration
3454
+ webhook (Optional[Union[str, V1WebhookConfig]]): Notification webhook settings
3455
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3456
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3457
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3458
+ delay (Optional[int]): Delay in seconds between scrapes
3459
+ allow_subdomains (Optional[bool]): Follow subdomains
3460
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
3461
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3462
+ **kwargs: Additional parameters to pass to the API
3463
+
3464
+ Returns:
3465
+ V1CrawlStatusResponse with:
3466
+ * Crawling status and progress
3467
+ * Crawled page contents
3468
+ * Success/error information
3469
+
3470
+ Raises:
3471
+ Exception: If crawl fails
3472
+ """
3473
+ # Validate any additional kwargs
3474
+ self._validate_kwargs(kwargs, "crawl_url")
3475
+
3476
+ crawl_params = {}
3477
+
3478
+ # Add individual parameters
3479
+ if include_paths is not None:
3480
+ crawl_params['includePaths'] = include_paths
3481
+ if exclude_paths is not None:
3482
+ crawl_params['excludePaths'] = exclude_paths
3483
+ if max_depth is not None:
3484
+ crawl_params['maxDepth'] = max_depth
3485
+ if max_discovery_depth is not None:
3486
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3487
+ if limit is not None:
3488
+ crawl_params['limit'] = limit
3489
+ if crawl_entire_domain is not None:
3490
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
3491
+ elif allow_backward_links is not None:
3492
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3493
+ if allow_external_links is not None:
3494
+ crawl_params['allowExternalLinks'] = allow_external_links
3495
+ if ignore_sitemap is not None:
3496
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3497
+ if scrape_options is not None:
3498
+ crawl_params['scrapeOptions'] = scrape_options.dict(by_alias=True, exclude_none=True)
3499
+ if webhook is not None:
3500
+ crawl_params['webhook'] = webhook
3501
+ if deduplicate_similar_urls is not None:
3502
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3503
+ if ignore_query_parameters is not None:
3504
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3505
+ if regex_on_full_url is not None:
3506
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3507
+ if delay is not None:
3508
+ crawl_params['delay'] = delay
3509
+ if allow_subdomains is not None:
3510
+ crawl_params['allowSubdomains'] = allow_subdomains
3511
+
3512
+ # Add any additional kwargs
3513
+ crawl_params.update(kwargs)
3514
+
3515
+ # Create final params object
3516
+ final_params = V1CrawlParams(**crawl_params)
3517
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
3518
+ params_dict['url'] = url
3519
+ params_dict['origin'] = f"python-sdk@{version}"
3520
+ # Make request
3521
+ headers = self._prepare_headers(idempotency_key)
3522
+ response = await self._async_post_request(
3523
+ f'{self.api_url}/v1/crawl', params_dict, headers)
3524
+
3525
+ if response.get('success'):
3526
+ try:
3527
+ id = response.get('id')
3528
+ except:
3529
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3530
+ return await self._async_monitor_job_status(id, headers, poll_interval)
3531
+ else:
3532
+ await self._handle_error(response, 'start crawl job')
3533
+
3534
+
3535
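A crawl_url sketch that waits for the crawl to finish, with the same assumptions as the earlier examples:

```python
import asyncio
from firecrawl import AsyncV1FirecrawlApp  # import path assumed

async def main() -> None:
    app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")
    result = await app.crawl_url(
        "https://example.com",
        include_paths=["/blog/.*"],  # only follow blog URLs
        limit=25,                    # stop after 25 pages
        poll_interval=3,             # seconds between status checks
    )
    print(f"crawled {result.completed} of {result.total} pages,",
          len(result.data or []), "documents returned")

asyncio.run(main())
```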
+ async def async_crawl_url(
3536
+ self,
3537
+ url: str,
3538
+ *,
3539
+ include_paths: Optional[List[str]] = None,
3540
+ exclude_paths: Optional[List[str]] = None,
3541
+ max_depth: Optional[int] = None,
3542
+ max_discovery_depth: Optional[int] = None,
3543
+ limit: Optional[int] = None,
3544
+ allow_backward_links: Optional[bool] = None,
3545
+ crawl_entire_domain: Optional[bool] = None,
3546
+ allow_external_links: Optional[bool] = None,
3547
+ ignore_sitemap: Optional[bool] = None,
3548
+ scrape_options: Optional[V1ScrapeOptions] = None,
3549
+ webhook: Optional[Union[str, V1WebhookConfig]] = None,
3550
+ deduplicate_similar_urls: Optional[bool] = None,
3551
+ ignore_query_parameters: Optional[bool] = None,
3552
+ regex_on_full_url: Optional[bool] = None,
3553
+ delay: Optional[int] = None,
3554
+ allow_subdomains: Optional[bool] = None,
3555
+ poll_interval: Optional[int] = 2,
3556
+ idempotency_key: Optional[str] = None,
3557
+ **kwargs
3558
+ ) -> V1CrawlResponse:
3559
+ """
3560
+ Start an asynchronous crawl job.
3561
+
3562
+ Args:
3563
+ url (str): Target URL to start crawling from
3564
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3565
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3566
+ max_depth (Optional[int]): Maximum crawl depth
3567
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3568
+ limit (Optional[int]): Maximum pages to crawl
3569
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
3570
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
3571
+ allow_external_links (Optional[bool]): Follow external domain links
3572
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3573
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3574
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3575
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3576
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3577
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3578
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3579
+ **kwargs: Additional parameters to pass to the API
3580
+
3581
+ Returns:
3582
+ V1CrawlResponse with:
3583
+ * success - Whether crawl started successfully
3584
+ * id - Unique identifier for the crawl job
3585
+ * url - Status check URL for the crawl
3586
+ * error - Error message if start failed
3587
+
3588
+ Raises:
3589
+ Exception: If crawl initiation fails
3590
+ """
3591
+ crawl_params = {}
3592
+
3593
+ # Add individual parameters
3594
+ if include_paths is not None:
3595
+ crawl_params['includePaths'] = include_paths
3596
+ if exclude_paths is not None:
3597
+ crawl_params['excludePaths'] = exclude_paths
3598
+ if max_depth is not None:
3599
+ crawl_params['maxDepth'] = max_depth
3600
+ if max_discovery_depth is not None:
3601
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3602
+ if limit is not None:
3603
+ crawl_params['limit'] = limit
3604
+ if crawl_entire_domain is not None:
3605
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
3606
+ elif allow_backward_links is not None:
3607
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3608
+ if allow_external_links is not None:
3609
+ crawl_params['allowExternalLinks'] = allow_external_links
3610
+ if ignore_sitemap is not None:
3611
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3612
+ if scrape_options is not None:
3613
+ crawl_params['scrapeOptions'] = scrape_options.dict(by_alias=True, exclude_none=True)
3614
+ if webhook is not None:
3615
+ crawl_params['webhook'] = webhook
3616
+ if deduplicate_similar_urls is not None:
3617
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3618
+ if ignore_query_parameters is not None:
3619
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3620
+ if regex_on_full_url is not None:
3621
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3622
+ if delay is not None:
3623
+ crawl_params['delay'] = delay
3624
+ if allow_subdomains is not None:
3625
+ crawl_params['allowSubdomains'] = allow_subdomains
3626
+
3627
+ # Add any additional kwargs
3628
+ crawl_params.update(kwargs)
3629
+
3630
+ # Create final params object
3631
+ final_params = V1CrawlParams(**crawl_params)
3632
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
3633
+ params_dict['url'] = url
3634
+ params_dict['origin'] = f"python-sdk@{version}"
3635
+
3636
+ # Make request
3637
+ headers = self._prepare_headers(idempotency_key)
3638
+ response = await self._async_post_request(
3639
+ f'{self.api_url}/v1/crawl',
3640
+ params_dict,
3641
+ headers
3642
+ )
3643
+
3644
+ if response.get('success'):
3645
+ try:
3646
+ return V1CrawlResponse(**response)
3647
+ except:
3648
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3649
+ else:
3650
+ await self._handle_error(response, 'start crawl job')
3651
+
3652
+ async def check_crawl_status(self, id: str) -> V1CrawlStatusResponse:
3653
+ """
3654
+ Check the status and results of an asynchronous crawl job.
3655
+
3656
+ Args:
3657
+ id (str): Unique identifier for the crawl job
3658
+
3659
+ Returns:
3660
+ V1CrawlStatusResponse containing:
3661
+ Status Information:
3662
+ * status - Current state (scraping/completed/failed/cancelled)
3663
+ * completed - Number of pages crawled
3664
+ * total - Total pages to crawl
3665
+ * creditsUsed - API credits consumed
3666
+ * expiresAt - Data expiration timestamp
3667
+
3668
+ Results:
3669
+ * data - List of crawled documents
3670
+ * next - URL for next page of results (if paginated)
3671
+ * success - Whether status check succeeded
3672
+ * error - Error message if failed
3673
+
3674
+ Raises:
3675
+ Exception: If status check fails
3676
+ """
3677
+ headers = self._prepare_headers()
3678
+ endpoint = f'/v1/crawl/{id}'
3679
+
3680
+ status_data = await self._async_get_request(
3681
+ f'{self.api_url}{endpoint}',
3682
+ headers
3683
+ )
3684
+
3685
+ if status_data.get('status') == 'completed':
3686
+ if 'data' in status_data:
3687
+ data = status_data['data']
3688
+ while 'next' in status_data:
3689
+ if len(status_data['data']) == 0:
3690
+ break
3691
+ next_url = status_data.get('next')
3692
+ if not next_url:
3693
+ logger.warning("Expected 'next' URL is missing.")
3694
+ break
3695
+ next_data = await self._async_get_request(next_url, headers)
3696
+ data.extend(next_data.get('data', []))
3697
+ status_data = next_data
3698
+ status_data['data'] = data
3699
+ # Create V1CrawlStatusResponse object from status data
3700
+ response = V1CrawlStatusResponse(
3701
+ status=status_data.get('status'),
3702
+ total=status_data.get('total'),
3703
+ completed=status_data.get('completed'),
3704
+ creditsUsed=status_data.get('creditsUsed'),
3705
+ expiresAt=status_data.get('expiresAt'),
3706
+ data=status_data.get('data'),
3707
+ success=False if 'error' in status_data else True
3708
+ )
3709
+
3710
+ if 'error' in status_data:
3711
+ response.error = status_data.get('error')
3712
+
3713
+ if 'next' in status_data:
3714
+ response.next = status_data.get('next')
3715
+
3716
+ return response
3717
+
3718
+ async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> V1CrawlStatusResponse:
3719
+ """
3720
+ Monitor the status of an asynchronous job until completion.
3721
+
3722
+ Args:
3723
+ id (str): The ID of the job to monitor
3724
+ headers (Dict[str, str]): Headers to include in status check requests
3725
+ poll_interval (int): Seconds between status checks (default: 2)
3726
+
3727
+ Returns:
3728
+ V1CrawlStatusResponse: The job results if completed successfully
3729
+
3730
+ Raises:
3731
+ Exception: If the job fails or an error occurs during status checks
3732
+ """
3733
+ while True:
3734
+ status_data = await self._async_get_request(
3735
+ f'{self.api_url}/v1/crawl/{id}',
3736
+ headers
3737
+ )
3738
+
3739
+ if status_data.get('status') == 'completed':
3740
+ if 'data' in status_data:
3741
+ data = status_data['data']
3742
+ while 'next' in status_data:
3743
+ if len(status_data['data']) == 0:
3744
+ break
3745
+ next_url = status_data.get('next')
3746
+ if not next_url:
3747
+ logger.warning("Expected 'next' URL is missing.")
3748
+ break
3749
+ next_data = await self._async_get_request(next_url, headers)
3750
+ data.extend(next_data.get('data', []))
3751
+ status_data = next_data
3752
+ status_data['data'] = data
3753
+ return V1CrawlStatusResponse(**status_data)
3754
+ else:
3755
+ raise Exception('Job completed but no data was returned')
3756
+ elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
3757
+ await asyncio.sleep(max(poll_interval, 2))
3758
+ else:
3759
+ raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
3760
+
3761
+ async def map_url(
3762
+ self,
3763
+ url: str,
3764
+ *,
3765
+ search: Optional[str] = None,
3766
+ ignore_sitemap: Optional[bool] = None,
3767
+ include_subdomains: Optional[bool] = None,
3768
+ sitemap_only: Optional[bool] = None,
3769
+ limit: Optional[int] = None,
3770
+ timeout: Optional[int] = 30000,
3771
+ params: Optional[V1MapParams] = None) -> V1MapResponse:
3772
+ """
3773
+ Asynchronously map and discover links from a URL.
3774
+
3775
+ Args:
3776
+ url (str): Target URL to map
3777
+ params (Optional[V1MapParams]): See V1MapParams model:
3778
+ Discovery Options:
3779
+ * search - Filter pattern for URLs
3780
+ * ignoreSitemap - Skip sitemap.xml
3781
+ * includeSubdomains - Include subdomain links
3782
+ * sitemapOnly - Only use sitemap.xml
3783
+
3784
+ Limits:
3785
+ * limit - Max URLs to return
3786
+ * timeout - Request timeout (ms)
3787
+
3788
+ Returns:
3789
+ V1MapResponse with:
3790
+ * Discovered URLs
3791
+ * Success/error status
3792
+
3793
+ Raises:
3794
+ Exception: If mapping fails
3795
+ """
3796
+ map_params = {}
3797
+ if params:
3798
+ map_params.update(params.dict(by_alias=True, exclude_none=True))
3799
+
3800
+ # Add individual parameters
3801
+ if search is not None:
3802
+ map_params['search'] = search
3803
+ if ignore_sitemap is not None:
3804
+ map_params['ignoreSitemap'] = ignore_sitemap
3805
+ if include_subdomains is not None:
3806
+ map_params['includeSubdomains'] = include_subdomains
3807
+ if sitemap_only is not None:
3808
+ map_params['sitemapOnly'] = sitemap_only
3809
+ if limit is not None:
3810
+ map_params['limit'] = limit
3811
+ if timeout is not None:
3812
+ map_params['timeout'] = timeout
3813
+
3814
+ # Create final params object
3815
+ final_params = V1MapParams(**map_params)
3816
+ params_dict = final_params.dict(by_alias=True, exclude_none=True)
3817
+ params_dict['url'] = url
3818
+ params_dict['origin'] = f"python-sdk@{version}"
3819
+
3820
+ # Make request
3821
+ endpoint = f'/v1/map'
3822
+ response = await self._async_post_request(
3823
+ f'{self.api_url}{endpoint}',
3824
+ params_dict,
3825
+ headers={"Authorization": f"Bearer {self.api_key}"}
3826
+ )
3827
+
3828
+ if response.get('success') and 'links' in response:
3829
+ return V1MapResponse(**response)
3830
+ elif 'error' in response:
3831
+ raise Exception(f'Failed to map URL. Error: {response["error"]}')
3832
+ else:
3833
+ raise Exception(f'Failed to map URL. Error: {response}')
3834
+
3835
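map_url only discovers URLs and returns no page content. A sketch, assuming the response model exposes the discovered URLs on a links attribute, as the check for 'links' in the payload suggests:

```python
import asyncio
from firecrawl import AsyncV1FirecrawlApp  # import path assumed

async def main() -> None:
    app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")
    mapped = await app.map_url("https://example.com", search="docs", limit=100)
    for link in (mapped.links or [])[:10]:
        print(link)

asyncio.run(main())
```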
+ async def extract(
3836
+ self,
3837
+ urls: Optional[List[str]] = None,
3838
+ *,
3839
+ prompt: Optional[str] = None,
3840
+ schema: Optional[Any] = None,
3841
+ system_prompt: Optional[str] = None,
3842
+ allow_external_links: Optional[bool] = False,
3843
+ enable_web_search: Optional[bool] = False,
3844
+ show_sources: Optional[bool] = False,
3845
+ agent: Optional[Dict[str, Any]] = None) -> V1ExtractResponse[Any]:
3846
+
3847
+ """
3848
+ Asynchronously extract structured information from URLs.
3849
+
3850
+ Args:
3851
+ urls (Optional[List[str]]): URLs to extract from
3852
+ prompt (Optional[str]): Custom extraction prompt
3853
+ schema (Optional[Any]): JSON schema/Pydantic model
3854
+ system_prompt (Optional[str]): System context
3855
+ allow_external_links (Optional[bool]): Follow external links
3856
+ enable_web_search (Optional[bool]): Enable web search
3857
+ show_sources (Optional[bool]): Include source URLs
3858
+ agent (Optional[Dict[str, Any]]): Agent configuration
3859
+
3860
+ Returns:
3861
+ V1ExtractResponse with:
3862
+ * Structured data matching schema
3863
+ * Source information if requested
3864
+ * Success/error status
3865
+
3866
+ Raises:
3867
+ ValueError: If prompt/schema missing or extraction fails
3868
+ """
3869
+ headers = self._prepare_headers()
3870
+
3871
+ if not prompt and not schema:
3872
+ raise ValueError("Either prompt or schema is required")
3873
+
3874
+ if not urls and not prompt:
3875
+ raise ValueError("Either urls or prompt is required")
3876
+
3877
+ if schema:
3878
+ schema = self._ensure_schema_dict(schema)
3879
+
3880
+ request_data = {
3881
+ 'urls': urls or [],
3882
+ 'allowExternalLinks': allow_external_links,
3883
+ 'enableWebSearch': enable_web_search,
3884
+ 'showSources': show_sources,
3885
+ 'schema': schema,
3886
+ 'origin': f'python-sdk@{get_version()}'
3887
+ }
3888
+
3889
+ # Only add prompt and systemPrompt if they exist
3890
+ if prompt:
3891
+ request_data['prompt'] = prompt
3892
+ if system_prompt:
3893
+ request_data['systemPrompt'] = system_prompt
3894
+
3895
+ if agent:
3896
+ request_data['agent'] = agent
3897
+
3898
+ response = await self._async_post_request(
3899
+ f'{self.api_url}/v1/extract',
3900
+ request_data,
3901
+ headers
3902
+ )
3903
+
3904
+ if response.get('success'):
3905
+ job_id = response.get('id')
3906
+ if not job_id:
3907
+ raise Exception('Job ID not returned from extract request.')
3908
+
3909
+ while True:
3910
+ status_data = await self._async_get_request(
3911
+ f'{self.api_url}/v1/extract/{job_id}',
3912
+ headers
3913
+ )
3914
+
3915
+ if status_data['status'] == 'completed':
3916
+ return V1ExtractResponse(**status_data)
3917
+ elif status_data['status'] in ['failed', 'cancelled']:
3918
+                     raise Exception(f'Extract job {status_data["status"]}. Error: {status_data.get("error")}')
3919
+
3920
+ await asyncio.sleep(2)
3921
+ else:
3922
+ raise Exception(f'Failed to extract. Error: {response.get("error")}')
3923
+
3924
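extract() accepts either a JSON-schema dict or a Pydantic model class for schema (normalized by _ensure_schema_dict) and then polls the extract job until it completes or fails. A sketch with an illustrative model, under the usual import-path assumption:

```python
import asyncio
from pydantic import BaseModel
from firecrawl import AsyncV1FirecrawlApp  # import path assumed

class ReleaseNote(BaseModel):
    version: str
    summary: str

async def main() -> None:
    app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")
    result = await app.extract(
        urls=["https://example.com/changelog"],
        prompt="Return the latest release version and a one-sentence summary.",
        schema=ReleaseNote,   # the model class is converted to a JSON schema before sending
    )
    print(result.data)

asyncio.run(main())
```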
+ async def check_batch_scrape_status(self, id: str) -> V1BatchScrapeStatusResponse:
3925
+ """
3926
+ Check the status of an asynchronous batch scrape job.
3927
+
3928
+ Args:
3929
+ id (str): The ID of the batch scrape job
3930
+
3931
+ Returns:
3932
+ V1BatchScrapeStatusResponse containing:
3933
+ Status Information:
3934
+ * status - Current state (scraping/completed/failed/cancelled)
3935
+ * completed - Number of URLs scraped
3936
+ * total - Total URLs to scrape
3937
+ * creditsUsed - API credits consumed
3938
+ * expiresAt - Data expiration timestamp
3939
+
3940
+ Results:
3941
+ * data - List of scraped documents
3942
+ * next - URL for next page of results (if paginated)
3943
+ * success - Whether status check succeeded
3944
+ * error - Error message if failed
3945
+
3946
+ Raises:
3947
+ Exception: If status check fails
3948
+ """
3949
+ headers = self._prepare_headers()
3950
+ endpoint = f'/v1/batch/scrape/{id}'
3951
+
3952
+ status_data = await self._async_get_request(
3953
+ f'{self.api_url}{endpoint}',
3954
+ headers
3955
+ )
3956
+
3957
+ if status_data['status'] == 'completed':
3958
+ if 'data' in status_data:
3959
+ data = status_data['data']
3960
+ while 'next' in status_data:
3961
+ if len(status_data['data']) == 0:
3962
+ break
3963
+ next_url = status_data.get('next')
3964
+ if not next_url:
3965
+ logger.warning("Expected 'next' URL is missing.")
3966
+ break
3967
+ next_data = await self._async_get_request(next_url, headers)
3968
+ data.extend(next_data.get('data', []))
3969
+ status_data = next_data
3970
+ status_data['data'] = data
3971
+
3972
+ response = V1BatchScrapeStatusResponse(
3973
+ status=status_data.get('status'),
3974
+ total=status_data.get('total'),
3975
+ completed=status_data.get('completed'),
3976
+ creditsUsed=status_data.get('creditsUsed'),
3977
+ expiresAt=status_data.get('expiresAt'),
3978
+ data=status_data.get('data')
3979
+ )
3980
+
3981
+             # Mirror check_crawl_status: set optional fields via attribute access
3982
+             if 'error' in status_data:
3983
+                 response.error = status_data['error']
3984
+
3985
+             if 'next' in status_data:
3986
+                 response.next = status_data['next']
3987
+
3988
+             response.success = 'error' not in status_data
3989
+
3990
+             return response
3991
+
3992
+    async def check_batch_scrape_errors(self, id: str) -> V1CrawlErrorsResponse:
+        """
+        Get information about errors from an asynchronous batch scrape job.
+
+        Args:
+            id (str): The ID of the batch scrape job
+
+        Returns:
+            V1CrawlErrorsResponse containing:
+            errors (List[Dict[str, str]]): List of errors with fields:
+            * id (str): Error ID
+            * timestamp (str): When the error occurred
+            * url (str): URL that caused the error
+            * error (str): Error message
+            * robotsBlocked (List[str]): List of URLs blocked by robots.txt
+
+        Raises:
+            Exception: If error check fails
+        """
+        headers = self._prepare_headers()
+        return await self._async_get_request(
+            f'{self.api_url}/v1/batch/scrape/{id}/errors',
+            headers
+        )
+
+    async def check_crawl_errors(self, id: str) -> V1CrawlErrorsResponse:
+        """
+        Get information about errors from an asynchronous crawl job.
+
+        Args:
+            id (str): The ID of the crawl job
+
+        Returns:
+            V1CrawlErrorsResponse containing:
+            * errors (List[Dict[str, str]]): List of errors with fields:
+                - id (str): Error ID
+                - timestamp (str): When the error occurred
+                - url (str): URL that caused the error
+                - error (str): Error message
+            * robotsBlocked (List[str]): List of URLs blocked by robots.txt
+
+        Raises:
+            Exception: If error check fails
+        """
+        headers = self._prepare_headers()
+        return await self._async_get_request(
+            f'{self.api_url}/v1/crawl/{id}/errors',
+            headers
+        )
+
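Both error endpoints return the same shape, so one sketch covers them; import path and constructor signature are assumptions, and the response is treated as the plain dict the GET helper returns:

```python
import asyncio

from firecrawl.v1.client import AsyncV1FirecrawlApp  # import path assumed


async def main() -> None:
    app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")  # constructor signature assumed
    report = await app.check_crawl_errors("crawl-job-id")  # or check_batch_scrape_errors(...)

    for err in report.get("errors", []):
        print(err["url"], "->", err["error"])
    print("Blocked by robots.txt:", report.get("robotsBlocked", []))


asyncio.run(main())
```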
+    async def cancel_crawl(self, id: str) -> Dict[str, Any]:
+        """
+        Cancel an asynchronous crawl job.
+
+        Args:
+            id (str): The ID of the crawl job to cancel
+
+        Returns:
+            Dict[str, Any] containing:
+            * success (bool): Whether cancellation was successful
+            * error (str, optional): Error message if cancellation failed
+
+        Raises:
+            Exception: If cancellation fails
+        """
+        headers = self._prepare_headers()
+        async with aiohttp.ClientSession() as session:
+            async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
+                return await response.json()
+
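A short sketch of cancelling a running crawl with the method above; client construction is assumed, and the JSON body is read exactly as documented:

```python
import asyncio

from firecrawl.v1.client import AsyncV1FirecrawlApp  # import path assumed


async def main() -> None:
    app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")  # constructor signature assumed
    result = await app.cancel_crawl("crawl-job-id")
    print("cancelled" if result.get("success") else f"cancel failed: {result.get('error')}")


asyncio.run(main())
```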
+    async def get_extract_status(self, job_id: str) -> V1ExtractResponse[Any]:
+        """
+        Check the status of an asynchronous extraction job.
+
+        Args:
+            job_id (str): The ID of the extraction job
+
+        Returns:
+            V1ExtractResponse[Any] with:
+            * success (bool): Whether request succeeded
+            * data (Optional[Any]): Extracted data matching schema
+            * error (Optional[str]): Error message if any
+            * warning (Optional[str]): Warning message if any
+            * sources (Optional[List[str]]): Source URLs if requested
+
+        Raises:
+            ValueError: If status check fails
+        """
+        headers = self._prepare_headers()
+        try:
+            return await self._async_get_request(
+                f'{self.api_url}/v1/extract/{job_id}',
+                headers
+            )
+        except Exception as e:
+            raise ValueError(str(e))
+
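A one-off status check with `get_extract_status`, without the polling loop shown earlier; the `status` key is assumed to be present in the raw endpoint payload (the waiter above relies on it), everything else follows the docstring:

```python
import asyncio

from firecrawl.v1.client import AsyncV1FirecrawlApp  # import path assumed


async def main() -> None:
    app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")  # constructor signature assumed
    status = await app.get_extract_status("extract-job-id")
    print(status.get("status"), status.get("warning"))
    if status.get("status") == "completed":
        print(status.get("data"))


asyncio.run(main())
```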
+    async def async_extract(
+            self,
+            urls: Optional[List[str]] = None,
+            *,
+            prompt: Optional[str] = None,
+            schema: Optional[Any] = None,
+            system_prompt: Optional[str] = None,
+            allow_external_links: Optional[bool] = False,
+            enable_web_search: Optional[bool] = False,
+            show_sources: Optional[bool] = False,
+            agent: Optional[Dict[str, Any]] = None) -> V1ExtractResponse[Any]:
+        """
+        Initiate an asynchronous extraction job without waiting for completion.
+
+        Args:
+            urls (Optional[List[str]]): URLs to extract from
+            prompt (Optional[str]): Custom extraction prompt
+            schema (Optional[Any]): JSON schema/Pydantic model
+            system_prompt (Optional[str]): System context
+            allow_external_links (Optional[bool]): Follow external links
+            enable_web_search (Optional[bool]): Enable web search
+            show_sources (Optional[bool]): Include source URLs
+            agent (Optional[Dict[str, Any]]): Agent configuration
+            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+
+        Returns:
+            V1ExtractResponse[Any] with:
+            * success (bool): Whether request succeeded
+            * data (Optional[Any]): Extracted data matching schema
+            * error (Optional[str]): Error message if any
+
+        Raises:
+            ValueError: If job initiation fails
+        """
+        headers = self._prepare_headers()
+
+        if not prompt and not schema:
+            raise ValueError("Either prompt or schema is required")
+
+        if not urls and not prompt:
+            raise ValueError("Either urls or prompt is required")
+
+        if schema:
+            schema = self._ensure_schema_dict(schema)
+
+        request_data = V1ExtractResponse(
+            urls=urls or [],
+            allowExternalLinks=allow_external_links,
+            enableWebSearch=enable_web_search,
+            showSources=show_sources,
+            schema=schema,
+            origin=f'python-sdk@{version}'
+        )
+
+        if prompt:
+            request_data['prompt'] = prompt
+        if system_prompt:
+            request_data['systemPrompt'] = system_prompt
+        if agent:
+            request_data['agent'] = agent
+
+        try:
+            return await self._async_post_request(
+                f'{self.api_url}/v1/extract',
+                request_data,
+                headers
+            )
+        except Exception as e:
+            raise ValueError(str(e))
+
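A sketch of starting a fire-and-forget extraction with `async_extract` and a plain JSON schema (the docstring also allows a Pydantic model); import path, constructor signature, and the example URL are assumptions:

```python
import asyncio

from firecrawl.v1.client import AsyncV1FirecrawlApp  # import path assumed

# Hypothetical schema for illustration; any JSON schema dict should work per the docstring.
article_schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "summary": {"type": "string"},
    },
    "required": ["title"],
}


async def main() -> None:
    app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")  # constructor signature assumed
    job = await app.async_extract(
        ["https://example.com/blog"],
        prompt="Extract the article title and a one-sentence summary.",
        schema=article_schema,
        show_sources=True,
    )
    # The returned payload carries the job id that get_extract_status expects.
    print("queued extract job:", job.get("id"))


asyncio.run(main())
```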
+    async def generate_llms_text(
+            self,
+            url: str,
+            *,
+            max_urls: Optional[int] = None,
+            show_full_text: Optional[bool] = None,
+            experimental_stream: Optional[bool] = None) -> V1GenerateLLMsTextStatusResponse:
+        """
+        Generate LLMs.txt for a given URL and monitor until completion.
+
+        Args:
+            url (str): Target URL to generate LLMs.txt from
+            max_urls (Optional[int]): Maximum URLs to process (default: 10)
+            show_full_text (Optional[bool]): Include full text in output (default: False)
+            experimental_stream (Optional[bool]): Enable experimental streaming
+
+        Returns:
+            V1GenerateLLMsTextStatusResponse containing:
+            * success (bool): Whether generation completed successfully
+            * status (str): Status of generation (processing/completed/failed)
+            * data (Dict[str, str], optional): Generated text with fields:
+                - llmstxt (str): Generated LLMs.txt content
+                - llmsfulltxt (str, optional): Full version if requested
+            * error (str, optional): Error message if generation failed
+            * expiresAt (str): When the generated data expires
+
+        Raises:
+            Exception: If generation fails
+        """
+        params = {}
+        if max_urls is not None:
+            params['maxUrls'] = max_urls
+        if show_full_text is not None:
+            params['showFullText'] = show_full_text
+        if experimental_stream is not None:
+            params['__experimental_stream'] = experimental_stream
+
+        response = await self.async_generate_llms_text(
+            url,
+            max_urls=max_urls,
+            show_full_text=show_full_text,
+            experimental_stream=experimental_stream
+        )
+        if not response.get('success') or 'id' not in response:
+            return response
+
+        job_id = response['id']
+        while True:
+            status = await self.check_generate_llms_text_status(job_id)
+
+            if status['status'] == 'completed':
+                return status
+            elif status['status'] == 'failed':
+                raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
+            elif status['status'] != 'processing':
+                break
+
+            await asyncio.sleep(2)
+
+        return V1GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly', status='failed', expiresAt='')
+
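A usage sketch for the blocking `generate_llms_text` waiter above; the client construction is assumed and the completed payload is read via the documented `data.llmstxt` field:

```python
import asyncio

from firecrawl.v1.client import AsyncV1FirecrawlApp  # import path assumed


async def main() -> None:
    app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")  # constructor signature assumed
    result = await app.generate_llms_text(
        "https://example.com",
        max_urls=5,
        show_full_text=True,
    )
    if result.get("success"):
        print(result["data"]["llmstxt"][:500])  # first 500 characters of the generated file


asyncio.run(main())
```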
+    async def async_generate_llms_text(
+            self,
+            url: str,
+            *,
+            max_urls: Optional[int] = None,
+            show_full_text: Optional[bool] = None,
+            cache: Optional[bool] = None,
+            experimental_stream: Optional[bool] = None) -> V1GenerateLLMsTextResponse:
+        """
+        Initiate an asynchronous LLMs.txt generation job without waiting for completion.
+
+        Args:
+            url (str): Target URL to generate LLMs.txt from
+            max_urls (Optional[int]): Maximum URLs to process (default: 10)
+            show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
+            experimental_stream (Optional[bool]): Enable experimental streaming
+
+        Returns:
+            V1GenerateLLMsTextResponse containing:
+            * success (bool): Whether job started successfully
+            * id (str): Unique identifier for the job
+            * error (str, optional): Error message if start failed
+
+        Raises:
+            ValueError: If job initiation fails
+        """
+        params = {}
+        if max_urls is not None:
+            params['maxUrls'] = max_urls
+        if show_full_text is not None:
+            params['showFullText'] = show_full_text
+        if experimental_stream is not None:
+            params['__experimental_stream'] = experimental_stream
+
+        params = V1GenerateLLMsTextParams(
+            maxUrls=max_urls,
+            showFullText=show_full_text,
+            cache=cache,
+            __experimental_stream=experimental_stream
+        )
+
+        headers = self._prepare_headers()
+        json_data = {'url': url, **params.dict(by_alias=True, exclude_none=True)}
+        json_data['origin'] = f"python-sdk@{version}"
+
+        try:
+            return await self._async_post_request(
+                f'{self.api_url}/v1/llmstxt',
+                json_data,
+                headers
+            )
+        except Exception as e:
+            raise ValueError(str(e))
+
+    async def check_generate_llms_text_status(self, id: str) -> V1GenerateLLMsTextStatusResponse:
+        """
+        Check the status of an asynchronous LLMs.txt generation job.
+
+        Args:
+            id (str): The ID of the generation job
+
+        Returns:
+            V1GenerateLLMsTextStatusResponse containing:
+            * success (bool): Whether generation completed successfully
+            * status (str): Status of generation (processing/completed/failed)
+            * data (Dict[str, str], optional): Generated text with fields:
+                - llmstxt (str): Generated LLMs.txt content
+                - llmsfulltxt (str, optional): Full version if requested
+            * error (str, optional): Error message if generation failed
+            * expiresAt (str): When the generated data expires
+
+        Raises:
+            ValueError: If status check fails
+        """
+        headers = self._prepare_headers()
+        try:
+            return await self._async_get_request(
+                f'{self.api_url}/v1/llmstxt/{id}',
+                headers
+            )
+        except Exception as e:
+            raise ValueError(str(e))
+
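The start/poll pair above can also be driven manually, which is useful when the caller wants its own backoff policy. A sketch under the same import and constructor assumptions as the earlier examples:

```python
import asyncio

from firecrawl.v1.client import AsyncV1FirecrawlApp  # import path assumed


async def main() -> None:
    app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")  # constructor signature assumed

    started = await app.async_generate_llms_text("https://example.com", max_urls=5)
    if not started.get("success"):
        raise RuntimeError(started.get("error"))

    # Poll on our own schedule instead of using the blocking generate_llms_text waiter.
    while True:
        status = await app.check_generate_llms_text_status(started["id"])
        if status["status"] != "processing":
            break
        await asyncio.sleep(2)

    print(status["status"], status.get("error"))


asyncio.run(main())
```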
+    async def deep_research(
+            self,
+            query: str,
+            *,
+            max_depth: Optional[int] = None,
+            time_limit: Optional[int] = None,
+            max_urls: Optional[int] = None,
+            analysis_prompt: Optional[str] = None,
+            system_prompt: Optional[str] = None,
+            __experimental_stream_steps: Optional[bool] = None,
+            on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
+            on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> V1DeepResearchStatusResponse:
+        """
+        Initiates a deep research operation on a given query and polls until completion.
+
+        Args:
+            query (str): Research query or topic to investigate
+            max_depth (Optional[int]): Maximum depth of research exploration
+            time_limit (Optional[int]): Time limit in seconds for research
+            max_urls (Optional[int]): Maximum number of URLs to process
+            analysis_prompt (Optional[str]): Custom prompt for analysis
+            system_prompt (Optional[str]): Custom system prompt
+            __experimental_stream_steps (Optional[bool]): Enable experimental streaming
+            on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
+            on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
+
+        Returns:
+            DeepResearchStatusResponse containing:
+            * success (bool): Whether research completed successfully
+            * status (str): Current state (processing/completed/failed)
+            * error (Optional[str]): Error message if failed
+            * id (str): Unique identifier for the research job
+            * data (Any): Research findings and analysis
+            * sources (List[Dict]): List of discovered sources
+            * activities (List[Dict]): Research progress log
+            * summaries (List[str]): Generated research summaries
+
+        Raises:
+            Exception: If research fails
+        """
+        research_params = {}
+        if max_depth is not None:
+            research_params['maxDepth'] = max_depth
+        if time_limit is not None:
+            research_params['timeLimit'] = time_limit
+        if max_urls is not None:
+            research_params['maxUrls'] = max_urls
+        if analysis_prompt is not None:
+            research_params['analysisPrompt'] = analysis_prompt
+        if system_prompt is not None:
+            research_params['systemPrompt'] = system_prompt
+        if __experimental_stream_steps is not None:
+            research_params['__experimental_streamSteps'] = __experimental_stream_steps
+        research_params = V1DeepResearchParams(**research_params)
+
+        response = await self.async_deep_research(
+            query,
+            max_depth=max_depth,
+            time_limit=time_limit,
+            max_urls=max_urls,
+            analysis_prompt=analysis_prompt,
+            system_prompt=system_prompt
+        )
+        if not response.get('success') or 'id' not in response:
+            return response
+
+        job_id = response['id']
+        last_activity_count = 0
+        last_source_count = 0
+
+        while True:
+            status = await self.check_deep_research_status(job_id)
+
+            if on_activity and 'activities' in status:
+                new_activities = status['activities'][last_activity_count:]
+                for activity in new_activities:
+                    on_activity(activity)
+                last_activity_count = len(status['activities'])
+
+            if on_source and 'sources' in status:
+                new_sources = status['sources'][last_source_count:]
+                for source in new_sources:
+                    on_source(source)
+                last_source_count = len(status['sources'])
+
+            if status['status'] == 'completed':
+                return status
+            elif status['status'] == 'failed':
+                raise Exception(f'Deep research failed. Error: {status.get("error")}')
+            elif status['status'] != 'processing':
+                break
+
+            await asyncio.sleep(2)
+
+        return V1DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')
+
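A sketch of calling the blocking `deep_research` waiter with the two progress callbacks described in the docstring; the query, limits, import path, and constructor are all placeholder assumptions:

```python
import asyncio

from firecrawl.v1.client import AsyncV1FirecrawlApp  # import path assumed


def log_activity(activity: dict) -> None:
    # Receives {type, status, message, timestamp, depth} per the docstring above.
    print(f"[{activity.get('type')}] {activity.get('message')}")


def log_source(source: dict) -> None:
    print("source:", source.get("url"))


async def main() -> None:
    app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")  # constructor signature assumed
    result = await app.deep_research(
        "What are the main approaches to web-scale data extraction?",
        max_depth=3,
        time_limit=120,
        max_urls=15,
        on_activity=log_activity,
        on_source=log_source,
    )
    print(result.get("status"), len(result.get("sources", [])), "sources")


asyncio.run(main())
```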
+    async def async_deep_research(
+            self,
+            query: str,
+            *,
+            max_depth: Optional[int] = None,
+            time_limit: Optional[int] = None,
+            max_urls: Optional[int] = None,
+            analysis_prompt: Optional[str] = None,
+            system_prompt: Optional[str] = None,
+            __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
+        """
+        Initiates an asynchronous deep research operation.
+
+        Args:
+            query (str): Research query or topic to investigate
+            max_depth (Optional[int]): Maximum depth of research exploration
+            time_limit (Optional[int]): Time limit in seconds for research
+            max_urls (Optional[int]): Maximum number of URLs to process
+            analysis_prompt (Optional[str]): Custom prompt for analysis
+            system_prompt (Optional[str]): Custom system prompt
+            __experimental_stream_steps (Optional[bool]): Enable experimental streaming
+
+        Returns:
+            Dict[str, Any]: A response containing:
+            * success (bool): Whether the research initiation was successful
+            * id (str): The unique identifier for the research job
+            * error (str, optional): Error message if initiation failed
+
+        Raises:
+            Exception: If the research initiation fails.
+        """
+        research_params = {}
+        if max_depth is not None:
+            research_params['maxDepth'] = max_depth
+        if time_limit is not None:
+            research_params['timeLimit'] = time_limit
+        if max_urls is not None:
+            research_params['maxUrls'] = max_urls
+        if analysis_prompt is not None:
+            research_params['analysisPrompt'] = analysis_prompt
+        if system_prompt is not None:
+            research_params['systemPrompt'] = system_prompt
+        if __experimental_stream_steps is not None:
+            research_params['__experimental_streamSteps'] = __experimental_stream_steps
+        research_params = V1DeepResearchParams(**research_params)
+
+        headers = self._prepare_headers()
+
+        json_data = {'query': query, **research_params.dict(by_alias=True, exclude_none=True)}
+        json_data['origin'] = f"python-sdk@{version}"
+
+        try:
+            return await self._async_post_request(
+                f'{self.api_url}/v1/deep-research',
+                json_data,
+                headers
+            )
+        except Exception as e:
+            raise ValueError(str(e))
+
+    async def check_deep_research_status(self, id: str) -> V1DeepResearchStatusResponse:
+        """
+        Check the status of a deep research operation.
+
+        Args:
+            id (str): The ID of the deep research operation.
+
+        Returns:
+            DeepResearchResponse containing:
+
+            Status:
+            * success - Whether research completed successfully
+            * status - Current state (processing/completed/failed)
+            * error - Error message if failed
+
+            Results:
+            * id - Unique identifier for the research job
+            * data - Research findings and analysis
+            * sources - List of discovered sources
+            * activities - Research progress log
+            * summaries - Generated research summaries
+
+        Raises:
+            Exception: If the status check fails.
+        """
+        headers = self._prepare_headers()
+        try:
+            return await self._async_get_request(
+                f'{self.api_url}/v1/deep-research/{id}',
+                headers
+            )
+        except Exception as e:
+            raise ValueError(str(e))
+
+    async def search(
+            self,
+            query: str,
+            *,
+            limit: Optional[int] = None,
+            tbs: Optional[str] = None,
+            filter: Optional[str] = None,
+            lang: Optional[str] = None,
+            country: Optional[str] = None,
+            location: Optional[str] = None,
+            timeout: Optional[int] = 30000,
+            scrape_options: Optional[V1ScrapeOptions] = None,
+            params: Optional[Union[Dict[str, Any], V1SearchParams]] = None,
+            **kwargs) -> V1SearchResponse:
+        """
+        Asynchronously search for content using Firecrawl.
+
+        Args:
+            query (str): Search query string
+            limit (Optional[int]): Max results (default: 5)
+            tbs (Optional[str]): Time filter (e.g. "qdr:d")
+            filter (Optional[str]): Custom result filter
+            lang (Optional[str]): Language code (default: "en")
+            country (Optional[str]): Country code (default: "us")
+            location (Optional[str]): Geo-targeting
+            timeout (Optional[int]): Request timeout in milliseconds
+            scrape_options (Optional[ScrapeOptions]): Result scraping configuration
+            params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
+            **kwargs: Additional keyword arguments for future compatibility
+
+        Returns:
+            SearchResponse: Response containing:
+            * success (bool): Whether request succeeded
+            * data (List[FirecrawlDocument]): Search results
+            * warning (Optional[str]): Warning message if any
+            * error (Optional[str]): Error message if any
+
+        Raises:
+            Exception: If search fails or response cannot be parsed
+        """
+        # Build search parameters
+        search_params = {}
+        if params:
+            if isinstance(params, dict):
+                search_params.update(params)
+            else:
+                search_params.update(params.dict(by_alias=True, exclude_none=True))
+
+        # Add individual parameters
+        if limit is not None:
+            search_params['limit'] = limit
+        if tbs is not None:
+            search_params['tbs'] = tbs
+        if filter is not None:
+            search_params['filter'] = filter
+        if lang is not None:
+            search_params['lang'] = lang
+        if country is not None:
+            search_params['country'] = country
+        if location is not None:
+            search_params['location'] = location
+        if timeout is not None:
+            search_params['timeout'] = timeout
+        if scrape_options is not None:
+            search_params['scrapeOptions'] = scrape_options.dict(by_alias=True, exclude_none=True)
+
+        # Add any additional kwargs
+        search_params.update(kwargs)
+
+        # Create final params object
+        final_params = V1SearchParams(query=query, **search_params)
+        params_dict = final_params.dict(by_alias=True, exclude_none=True)
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        return await self._async_post_request(
+            f"{self.api_url}/v1/search",
+            params_dict,
+            {"Authorization": f"Bearer {self.api_key}"}
+        )
+
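A sketch of the async `search` call with a few of the documented keyword filters; import path, constructor signature, and the result fields read from each document are assumptions consistent with the docstring above:

```python
import asyncio

from firecrawl.v1.client import AsyncV1FirecrawlApp  # import path assumed


async def main() -> None:
    app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")  # constructor signature assumed
    results = await app.search(
        "firecrawl python sdk",
        limit=3,
        lang="en",
        country="us",
        tbs="qdr:w",  # time filter: results from the past week
    )
    for doc in results.get("data", []):
        print(doc.get("title"), "-", doc.get("url"))


asyncio.run(main())
```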
+class AsyncV1CrawlWatcher(V1CrawlWatcher):
+    """
+    Async version of V1CrawlWatcher that properly handles async operations.
+    """
+    def __init__(self, id: str, app: AsyncV1FirecrawlApp):
+        super().__init__(id, app)
+
+    async def connect(self) -> None:
+        """
+        Establishes async WebSocket connection and starts listening for messages.
+        """
+        async with websockets.connect(
+            self.ws_url,
+            additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
+        ) as websocket:
+            await self._listen(websocket)
+
+    async def _listen(self, websocket) -> None:
+        """
+        Listens for incoming WebSocket messages and handles them asynchronously.
+
+        Args:
+            websocket: The WebSocket connection object
+        """
+        async for message in websocket:
+            msg = json.loads(message)
+            await self._handle_message(msg)
+
+    async def _handle_message(self, msg: Dict[str, Any]) -> None:
+        """
+        Handles incoming WebSocket messages based on their type asynchronously.
+
+        Args:
+            msg (Dict[str, Any]): The message to handle
+        """
+        if msg['type'] == 'done':
+            self.status = 'completed'
+            self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
+        elif msg['type'] == 'error':
+            self.status = 'failed'
+            self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
+        elif msg['type'] == 'catchup':
+            self.status = msg['data']['status']
+            self.data.extend(msg['data'].get('data', []))
+            for doc in self.data:
+                self.dispatch_event('document', {'data': doc, 'id': self.id})
+        elif msg['type'] == 'document':
+            self.data.append(msg['data'])
+            self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
+
+    async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
+        """
+        Handle errors from async API responses.
+        """
+        try:
+            error_data = await response.json()
+            error_message = error_data.get('error', 'No error message provided.')
+            error_details = error_data.get('details', 'No additional error details provided.')
+        except:
+            raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
+
+        # Use the app's method to get the error message
+        message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
+
+        raise aiohttp.ClientError(message)
+
+    async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
+        """
+        Generate a standardized error message based on HTTP status code for async operations.
+
+        Args:
+            status_code (int): The HTTP status code from the response
+            action (str): Description of the action that was being performed
+            error_message (str): The error message from the API response
+            error_details (str): Additional error details from the API response
+
+        Returns:
+            str: A formatted error message
+        """
+        return self._get_error_message(status_code, action, error_message, error_details)
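A sketch of watching a crawl over WebSocket with `AsyncV1CrawlWatcher`. The listener-registration API (`add_event_listener`) and event payload shape are assumed to come from the parent `V1CrawlWatcher`, which is not shown in this hunk; the crawl id, import path, and constructor signature are likewise placeholders:

```python
import asyncio

from firecrawl.v1.client import AsyncV1FirecrawlApp, AsyncV1CrawlWatcher  # import path assumed


async def main() -> None:
    app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")  # constructor signature assumed
    watcher = AsyncV1CrawlWatcher("crawl-job-id", app)  # id of an already-started crawl

    # Handlers receive the detail dicts dispatched by _handle_message above;
    # add_event_listener is assumed to be provided by V1CrawlWatcher.
    watcher.add_event_listener("document", lambda detail: print("document event:", detail))
    watcher.add_event_listener("done", lambda detail: print("crawl finished:", detail))

    await watcher.connect()  # runs until the server closes the WebSocket stream


asyncio.run(main())
```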