firecrawl-py 2.6.0__py3-none-any.whl → 2.7.1__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of firecrawl-py has been flagged as potentially problematic.

Files changed (52)
  1. build/lib/firecrawl/__init__.py +1 -1
  2. build/lib/firecrawl/firecrawl.py +11 -10
  3. firecrawl/__init__.py +1 -1
  4. firecrawl/firecrawl.py +11 -10
  5. {firecrawl_py-2.6.0.dist-info → firecrawl_py-2.7.1.dist-info}/METADATA +1 -1
  6. firecrawl_py-2.7.1.dist-info/RECORD +19 -0
  7. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
  8. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  9. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  10. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  11. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  12. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4466
  13. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
  14. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
  15. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  16. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  17. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  18. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  19. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4466
  20. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
  21. build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
  22. build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  23. build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  24. build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  25. build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  26. build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4466
  27. build/lib/build/lib/build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
  28. build/lib/build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
  29. build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  30. build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  31. build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  32. build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  33. build/lib/build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4466
  34. build/lib/build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
  35. build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
  36. build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  37. build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  38. build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  39. build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  40. build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4466
  41. build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
  42. build/lib/build/lib/firecrawl/__init__.py +0 -79
  43. build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  44. build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  45. build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  46. build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  47. build/lib/build/lib/firecrawl/firecrawl.py +0 -4466
  48. build/lib/build/lib/tests/test_change_tracking.py +0 -98
  49. firecrawl_py-2.6.0.dist-info/RECORD +0 -61
  50. {firecrawl_py-2.6.0.dist-info → firecrawl_py-2.7.1.dist-info}/LICENSE +0 -0
  51. {firecrawl_py-2.6.0.dist-info → firecrawl_py-2.7.1.dist-info}/WHEEL +0 -0
  52. {firecrawl_py-2.6.0.dist-info → firecrawl_py-2.7.1.dist-info}/top_level.txt +0 -0
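
Most of the entries above remove recursively nested build/lib copies of the package that shipped inside the 2.6.0 wheel; the hunk below is the contents of one of those deleted duplicate firecrawl.py files. For orientation only, here is a minimal usage sketch of the FirecrawlApp API defined in that file. The example is not part of the diff; it assumes the package exports FirecrawlApp from firecrawl/__init__.py and that FIRECRAWL_API_KEY is set in the environment.

from firecrawl import FirecrawlApp  # assumed export; __init__.py is not shown in full here

app = FirecrawlApp()  # per the constructor below, falls back to FIRECRAWL_API_KEY / FIRECRAWL_API_URL
page = app.scrape_url("https://example.com", formats=["markdown", "links"])  # keyword-only formats, as defined below
print(page.markdown)
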
@@ -1,4466 +0,0 @@
1
- """
2
- FirecrawlApp Module
3
-
4
- This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
5
- It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
6
- and check the status of these jobs. The module uses requests for HTTP communication
7
- and handles retries for certain HTTP status codes.
8
-
9
- Classes:
10
- - FirecrawlApp: Main class for interacting with the Firecrawl API.
11
- """
12
- import logging
13
- import os
14
- import time
15
- from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
16
- import json
17
- from datetime import datetime
18
- import re
19
- import warnings
20
- import requests
21
- import pydantic
22
- import websockets
23
- import aiohttp
24
- import asyncio
25
- from pydantic import Field
26
-
27
- # Suppress Pydantic warnings about attribute shadowing
28
- warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
29
- warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
30
- warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
31
- warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
32
- warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingOptions\" shadows an attribute in parent \"BaseModel\"")
33
-
34
- def get_version():
35
- try:
36
- from pathlib import Path
37
- package_path = os.path.dirname(__file__)
38
- version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
39
- version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
40
- if version_match:
41
- return version_match.group(1).strip()
42
- except Exception:
43
- print("Failed to get version from __init__.py")
44
- return None
45
-
46
- version = get_version()
47
-
48
- logger : logging.Logger = logging.getLogger("firecrawl")
49
-
50
- T = TypeVar('T')
51
-
52
- # class FirecrawlDocumentMetadata(pydantic.BaseModel):
53
- # """Metadata for a Firecrawl document."""
54
- # title: Optional[str] = None
55
- # description: Optional[str] = None
56
- # language: Optional[str] = None
57
- # keywords: Optional[str] = None
58
- # robots: Optional[str] = None
59
- # ogTitle: Optional[str] = None
60
- # ogDescription: Optional[str] = None
61
- # ogUrl: Optional[str] = None
62
- # ogImage: Optional[str] = None
63
- # ogAudio: Optional[str] = None
64
- # ogDeterminer: Optional[str] = None
65
- # ogLocale: Optional[str] = None
66
- # ogLocaleAlternate: Optional[List[str]] = None
67
- # ogSiteName: Optional[str] = None
68
- # ogVideo: Optional[str] = None
69
- # dctermsCreated: Optional[str] = None
70
- # dcDateCreated: Optional[str] = None
71
- # dcDate: Optional[str] = None
72
- # dctermsType: Optional[str] = None
73
- # dcType: Optional[str] = None
74
- # dctermsAudience: Optional[str] = None
75
- # dctermsSubject: Optional[str] = None
76
- # dcSubject: Optional[str] = None
77
- # dcDescription: Optional[str] = None
78
- # dctermsKeywords: Optional[str] = None
79
- # modifiedTime: Optional[str] = None
80
- # publishedTime: Optional[str] = None
81
- # articleTag: Optional[str] = None
82
- # articleSection: Optional[str] = None
83
- # sourceURL: Optional[str] = None
84
- # statusCode: Optional[int] = None
85
- # error: Optional[str] = None
86
-
87
- class AgentOptions(pydantic.BaseModel):
88
- """Configuration for the agent."""
89
- model: Literal["FIRE-1"] = "FIRE-1"
90
- prompt: Optional[str] = None
91
-
92
- class AgentOptionsExtract(pydantic.BaseModel):
93
- """Configuration for the agent in extract operations."""
94
- model: Literal["FIRE-1"] = "FIRE-1"
95
-
96
- class ActionsResult(pydantic.BaseModel):
97
- """Result of actions performed during scraping."""
98
- screenshots: List[str]
99
-
100
- class ChangeTrackingData(pydantic.BaseModel):
101
- """
102
- Data for the change tracking format.
103
- """
104
- previousScrapeAt: Optional[str] = None
105
- changeStatus: str # "new" | "same" | "changed" | "removed"
106
- visibility: str # "visible" | "hidden"
107
- diff: Optional[Dict[str, Any]] = None
108
- json: Optional[Any] = None
109
-
110
- class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
111
- """Document retrieved or processed by Firecrawl."""
112
- url: Optional[str] = None
113
- markdown: Optional[str] = None
114
- html: Optional[str] = None
115
- rawHtml: Optional[str] = None
116
- links: Optional[List[str]] = None
117
- extract: Optional[T] = None
118
- json: Optional[T] = None
119
- screenshot: Optional[str] = None
120
- metadata: Optional[Any] = None
121
- actions: Optional[ActionsResult] = None
122
- title: Optional[str] = None # v1 search only
123
- description: Optional[str] = None # v1 search only
124
- changeTracking: Optional[ChangeTrackingData] = None
125
-
126
- class LocationConfig(pydantic.BaseModel):
127
- """Location configuration for scraping."""
128
- country: Optional[str] = None
129
- languages: Optional[List[str]] = None
130
-
131
- class WebhookConfig(pydantic.BaseModel):
132
- """Configuration for webhooks."""
133
- url: str
134
- headers: Optional[Dict[str, str]] = None
135
- metadata: Optional[Dict[str, str]] = None
136
- events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
137
-
138
- class ChangeTrackingOptions(pydantic.BaseModel):
139
- """Configuration for change tracking."""
140
- modes: Optional[List[Literal["git-diff", "json"]]] = None
141
- schema: Optional[Any] = None
142
- prompt: Optional[str] = None
143
-
144
- class ScrapeOptions(pydantic.BaseModel):
145
- """Parameters for scraping operations."""
146
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
147
- headers: Optional[Dict[str, str]] = None
148
- includeTags: Optional[List[str]] = None
149
- excludeTags: Optional[List[str]] = None
150
- onlyMainContent: Optional[bool] = None
151
- waitFor: Optional[int] = None
152
- timeout: Optional[int] = None
153
- location: Optional[LocationConfig] = None
154
- mobile: Optional[bool] = None
155
- skipTlsVerification: Optional[bool] = None
156
- removeBase64Images: Optional[bool] = None
157
- blockAds: Optional[bool] = None
158
- proxy: Optional[Literal["basic", "stealth"]] = None
159
- changeTrackingOptions: Optional[ChangeTrackingOptions] = None
160
-
161
- class WaitAction(pydantic.BaseModel):
162
- """Wait action to perform during scraping."""
163
- type: Literal["wait"]
164
- milliseconds: Optional[int] = None
165
- selector: Optional[str] = None
166
-
167
- class ScreenshotAction(pydantic.BaseModel):
168
- """Screenshot action to perform during scraping."""
169
- type: Literal["screenshot"]
170
- fullPage: Optional[bool] = None
171
-
172
- class ClickAction(pydantic.BaseModel):
173
- """Click action to perform during scraping."""
174
- type: Literal["click"]
175
- selector: str
176
-
177
- class WriteAction(pydantic.BaseModel):
178
- """Write action to perform during scraping."""
179
- type: Literal["write"]
180
- text: str
181
-
182
- class PressAction(pydantic.BaseModel):
183
- """Press action to perform during scraping."""
184
- type: Literal["press"]
185
- key: str
186
-
187
- class ScrollAction(pydantic.BaseModel):
188
- """Scroll action to perform during scraping."""
189
- type: Literal["scroll"]
190
- direction: Literal["up", "down"]
191
- selector: Optional[str] = None
192
-
193
- class ScrapeAction(pydantic.BaseModel):
194
- """Scrape action to perform during scraping."""
195
- type: Literal["scrape"]
196
-
197
- class ExecuteJavascriptAction(pydantic.BaseModel):
198
- """Execute javascript action to perform during scraping."""
199
- type: Literal["executeJavascript"]
200
- script: str
201
-
202
-
203
- class ExtractAgent(pydantic.BaseModel):
204
- """Configuration for the agent in extract operations."""
205
- model: Literal["FIRE-1"] = "FIRE-1"
206
-
207
- class JsonConfig(pydantic.BaseModel):
208
- """Configuration for extraction."""
209
- prompt: Optional[str] = None
210
- schema: Optional[Any] = None
211
- systemPrompt: Optional[str] = None
212
- agent: Optional[ExtractAgent] = None
213
-
214
- class ScrapeParams(ScrapeOptions):
215
- """Parameters for scraping operations."""
216
- extract: Optional[JsonConfig] = None
217
- jsonOptions: Optional[JsonConfig] = None
218
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
219
- agent: Optional[AgentOptions] = None
220
- webhook: Optional[WebhookConfig] = None
221
-
222
- class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
223
- """Response from scraping operations."""
224
- success: bool = True
225
- warning: Optional[str] = None
226
- error: Optional[str] = None
227
-
228
- class BatchScrapeResponse(pydantic.BaseModel):
229
- """Response from batch scrape operations."""
230
- id: Optional[str] = None
231
- url: Optional[str] = None
232
- success: bool = True
233
- error: Optional[str] = None
234
- invalidURLs: Optional[List[str]] = None
235
-
236
- class BatchScrapeStatusResponse(pydantic.BaseModel):
237
- """Response from batch scrape status checks."""
238
- success: bool = True
239
- status: Literal["scraping", "completed", "failed", "cancelled"]
240
- completed: int
241
- total: int
242
- creditsUsed: int
243
- expiresAt: datetime
244
- next: Optional[str] = None
245
- data: List[FirecrawlDocument]
246
-
247
- class CrawlParams(pydantic.BaseModel):
248
- """Parameters for crawling operations."""
249
- includePaths: Optional[List[str]] = None
250
- excludePaths: Optional[List[str]] = None
251
- maxDepth: Optional[int] = None
252
- maxDiscoveryDepth: Optional[int] = None
253
- limit: Optional[int] = None
254
- allowBackwardLinks: Optional[bool] = None
255
- allowExternalLinks: Optional[bool] = None
256
- ignoreSitemap: Optional[bool] = None
257
- scrapeOptions: Optional[ScrapeOptions] = None
258
- webhook: Optional[Union[str, WebhookConfig]] = None
259
- deduplicateSimilarURLs: Optional[bool] = None
260
- ignoreQueryParameters: Optional[bool] = None
261
- regexOnFullURL: Optional[bool] = None
262
- delay: Optional[int] = None # Delay in seconds between scrapes
263
-
264
- class CrawlResponse(pydantic.BaseModel):
265
- """Response from crawling operations."""
266
- id: Optional[str] = None
267
- url: Optional[str] = None
268
- success: bool = True
269
- error: Optional[str] = None
270
-
271
- class CrawlStatusResponse(pydantic.BaseModel):
272
- """Response from crawl status checks."""
273
- success: bool = True
274
- status: Literal["scraping", "completed", "failed", "cancelled"]
275
- completed: int
276
- total: int
277
- creditsUsed: int
278
- expiresAt: datetime
279
- next: Optional[str] = None
280
- data: List[FirecrawlDocument]
281
-
282
- class CrawlErrorsResponse(pydantic.BaseModel):
283
- """Response from crawl/batch scrape error monitoring."""
284
- errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str}
285
- robotsBlocked: List[str]
286
-
287
- class MapParams(pydantic.BaseModel):
288
- """Parameters for mapping operations."""
289
- search: Optional[str] = None
290
- ignoreSitemap: Optional[bool] = None
291
- includeSubdomains: Optional[bool] = None
292
- sitemapOnly: Optional[bool] = None
293
- limit: Optional[int] = None
294
- timeout: Optional[int] = None
295
-
296
- class MapResponse(pydantic.BaseModel):
297
- """Response from mapping operations."""
298
- success: bool = True
299
- links: Optional[List[str]] = None
300
- error: Optional[str] = None
301
-
302
- class ExtractParams(pydantic.BaseModel):
303
- """Parameters for extracting information from URLs."""
304
- prompt: Optional[str] = None
305
- schema: Optional[Any] = None
306
- systemPrompt: Optional[str] = None
307
- allowExternalLinks: Optional[bool] = None
308
- enableWebSearch: Optional[bool] = None
309
- includeSubdomains: Optional[bool] = None
310
- origin: Optional[str] = None
311
- showSources: Optional[bool] = None
312
- scrapeOptions: Optional[ScrapeOptions] = None
313
-
314
- class ExtractResponse(pydantic.BaseModel, Generic[T]):
315
- """Response from extract operations."""
316
- id: Optional[str] = None
317
- status: Optional[Literal["processing", "completed", "failed"]] = None
318
- expiresAt: Optional[datetime] = None
319
- success: bool = True
320
- data: Optional[T] = None
321
- error: Optional[str] = None
322
- warning: Optional[str] = None
323
- sources: Optional[List[str]] = None
324
-
325
- class SearchParams(pydantic.BaseModel):
326
- query: str
327
- limit: Optional[int] = 5
328
- tbs: Optional[str] = None
329
- filter: Optional[str] = None
330
- lang: Optional[str] = "en"
331
- country: Optional[str] = "us"
332
- location: Optional[str] = None
333
- origin: Optional[str] = "api"
334
- timeout: Optional[int] = 60000
335
- scrapeOptions: Optional[ScrapeOptions] = None
336
-
337
- class SearchResponse(pydantic.BaseModel):
338
- """Response from search operations."""
339
- success: bool = True
340
- data: List[FirecrawlDocument]
341
- warning: Optional[str] = None
342
- error: Optional[str] = None
343
-
344
- class GenerateLLMsTextParams(pydantic.BaseModel):
345
- """
346
- Parameters for the LLMs.txt generation operation.
347
- """
348
- maxUrls: Optional[int] = 10
349
- showFullText: Optional[bool] = False
350
- cache: Optional[bool] = True
351
- __experimental_stream: Optional[bool] = None
352
-
353
- class DeepResearchParams(pydantic.BaseModel):
354
- """
355
- Parameters for the deep research operation.
356
- """
357
- maxDepth: Optional[int] = 7
358
- timeLimit: Optional[int] = 270
359
- maxUrls: Optional[int] = 20
360
- analysisPrompt: Optional[str] = None
361
- systemPrompt: Optional[str] = None
362
- __experimental_streamSteps: Optional[bool] = None
363
-
364
- class DeepResearchResponse(pydantic.BaseModel):
365
- """
366
- Response from the deep research operation.
367
- """
368
- success: bool
369
- id: str
370
- error: Optional[str] = None
371
-
372
- class DeepResearchStatusResponse(pydantic.BaseModel):
373
- """
374
- Status response from the deep research operation.
375
- """
376
- success: bool
377
- data: Optional[Dict[str, Any]] = None
378
- status: str
379
- error: Optional[str] = None
380
- expiresAt: str
381
- currentDepth: int
382
- maxDepth: int
383
- activities: List[Dict[str, Any]]
384
- sources: List[Dict[str, Any]]
385
- summaries: List[str]
386
-
387
- class GenerateLLMsTextResponse(pydantic.BaseModel):
388
- """Response from LLMs.txt generation operations."""
389
- success: bool = True
390
- id: str
391
- error: Optional[str] = None
392
-
393
- class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
394
- llmstxt: str
395
- llmsfulltxt: Optional[str] = None
396
-
397
- class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
398
- """Status response from LLMs.txt generation operations."""
399
- success: bool = True
400
- data: Optional[GenerateLLMsTextStatusResponseData] = None
401
- status: Literal["processing", "completed", "failed"]
402
- error: Optional[str] = None
403
- expiresAt: str
404
-
405
- class SearchResponse(pydantic.BaseModel):
406
- """
407
- Response from the search operation.
408
- """
409
- success: bool
410
- data: List[Dict[str, Any]]
411
- warning: Optional[str] = None
412
- error: Optional[str] = None
413
-
414
- class ExtractParams(pydantic.BaseModel):
415
- """
416
- Parameters for the extract operation.
417
- """
418
- prompt: Optional[str] = None
419
- schema: Optional[Any] = pydantic.Field(None, alias='schema')
420
- system_prompt: Optional[str] = None
421
- allow_external_links: Optional[bool] = False
422
- enable_web_search: Optional[bool] = False
423
- # Just for backwards compatibility
424
- enableWebSearch: Optional[bool] = False
425
- show_sources: Optional[bool] = False
426
- agent: Optional[Dict[str, Any]] = None
427
-
428
- class FirecrawlApp:
429
- def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
430
- """
431
- Initialize the FirecrawlApp instance with API key, API URL.
432
-
433
- Args:
434
- api_key (Optional[str]): API key for authenticating with the Firecrawl API.
435
- api_url (Optional[str]): Base URL for the Firecrawl API.
436
- """
437
- self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
438
- self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
439
-
440
- # Only require API key when using cloud service
441
- if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
442
- logger.warning("No API key provided for cloud service")
443
- raise ValueError('No API key provided')
444
-
445
- logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
446
-
447
- def scrape_url(
448
- self,
449
- url: str,
450
- *,
451
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
452
- include_tags: Optional[List[str]] = None,
453
- exclude_tags: Optional[List[str]] = None,
454
- only_main_content: Optional[bool] = None,
455
- wait_for: Optional[int] = None,
456
- timeout: Optional[int] = None,
457
- location: Optional[LocationConfig] = None,
458
- mobile: Optional[bool] = None,
459
- skip_tls_verification: Optional[bool] = None,
460
- remove_base64_images: Optional[bool] = None,
461
- block_ads: Optional[bool] = None,
462
- proxy: Optional[Literal["basic", "stealth"]] = None,
463
- extract: Optional[JsonConfig] = None,
464
- json_options: Optional[JsonConfig] = None,
465
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
466
- change_tracking_options: Optional[ChangeTrackingOptions] = None,
467
- **kwargs) -> ScrapeResponse[Any]:
468
- """
469
- Scrape and extract content from a URL.
470
-
471
- Args:
472
- url (str): Target URL to scrape
473
- formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
474
- include_tags (Optional[List[str]]): HTML tags to include
475
- exclude_tags (Optional[List[str]]): HTML tags to exclude
476
- only_main_content (Optional[bool]): Extract main content only
477
- wait_for (Optional[int]): Wait for a specific element to appear
478
- timeout (Optional[int]): Request timeout (ms)
479
- location (Optional[LocationConfig]): Location configuration
480
- mobile (Optional[bool]): Use mobile user agent
481
- skip_tls_verification (Optional[bool]): Skip TLS verification
482
- remove_base64_images (Optional[bool]): Remove base64 images
483
- block_ads (Optional[bool]): Block ads
484
- proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
485
- extract (Optional[JsonConfig]): Content extraction settings
486
- json_options (Optional[JsonConfig]): JSON extraction settings
487
- actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
488
- change_tracking_options (Optional[ChangeTrackingOptions]): Change tracking settings
489
-
490
-
491
- Returns:
492
- ScrapeResponse with:
493
- * Requested content formats
494
- * Page metadata
495
- * Extraction results
496
- * Success/error status
497
-
498
- Raises:
499
- Exception: If scraping fails
500
- """
501
- headers = self._prepare_headers()
502
-
503
- # Build scrape parameters
504
- scrape_params = {
505
- 'url': url,
506
- 'origin': f"python-sdk@{version}"
507
- }
508
-
509
- # Add optional parameters if provided
510
- if formats:
511
- scrape_params['formats'] = formats
512
- if include_tags:
513
- scrape_params['includeTags'] = include_tags
514
- if exclude_tags:
515
- scrape_params['excludeTags'] = exclude_tags
516
- if only_main_content is not None:
517
- scrape_params['onlyMainContent'] = only_main_content
518
- if wait_for:
519
- scrape_params['waitFor'] = wait_for
520
- if timeout:
521
- scrape_params['timeout'] = timeout
522
- if location:
523
- scrape_params['location'] = location.dict(exclude_none=True)
524
- if mobile is not None:
525
- scrape_params['mobile'] = mobile
526
- if skip_tls_verification is not None:
527
- scrape_params['skipTlsVerification'] = skip_tls_verification
528
- if remove_base64_images is not None:
529
- scrape_params['removeBase64Images'] = remove_base64_images
530
- if block_ads is not None:
531
- scrape_params['blockAds'] = block_ads
532
- if proxy:
533
- scrape_params['proxy'] = proxy
534
- if extract is not None:
535
- extract = self._ensure_schema_dict(extract)
536
- if isinstance(extract, dict) and "schema" in extract:
537
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
538
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
539
- if json_options is not None:
540
- json_options = self._ensure_schema_dict(json_options)
541
- if isinstance(json_options, dict) and "schema" in json_options:
542
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
543
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
544
- if actions:
545
- scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
546
- if change_tracking_options:
547
- scrape_params['changeTrackingOptions'] = change_tracking_options if isinstance(change_tracking_options, dict) else change_tracking_options.dict(exclude_none=True)
548
-
549
- scrape_params.update(kwargs)
550
-
551
- if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
552
- scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
553
- if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
554
- scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
555
-
556
- # Make request
557
- response = requests.post(
558
- f'{self.api_url}/v1/scrape',
559
- headers=headers,
560
- json=scrape_params,
561
- timeout=(timeout + 5000 if timeout else None)
562
- )
563
-
564
- if response.status_code == 200:
565
- try:
566
- response_json = response.json()
567
- if response_json.get('success') and 'data' in response_json:
568
- return ScrapeResponse(**response_json['data'])
569
- elif "error" in response_json:
570
- raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
571
- else:
572
- raise Exception(f'Failed to scrape URL. Error: {response_json}')
573
- except ValueError:
574
- raise Exception('Failed to parse Firecrawl response as JSON.')
575
- else:
576
- self._handle_error(response, 'scrape URL')
577
-
578
- def search(
579
- self,
580
- query: str,
581
- *,
582
- limit: Optional[int] = None,
583
- tbs: Optional[str] = None,
584
- filter: Optional[str] = None,
585
- lang: Optional[str] = None,
586
- country: Optional[str] = None,
587
- location: Optional[str] = None,
588
- timeout: Optional[int] = None,
589
- scrape_options: Optional[ScrapeOptions] = None,
590
- **kwargs) -> SearchResponse:
591
- """
592
- Search for content using Firecrawl.
593
-
594
- Args:
595
- query (str): Search query string
596
- limit (Optional[int]): Max results (default: 5)
597
- tbs (Optional[str]): Time filter (e.g. "qdr:d")
598
- filter (Optional[str]): Custom result filter
599
- lang (Optional[str]): Language code (default: "en")
600
- country (Optional[str]): Country code (default: "us")
601
- location (Optional[str]): Geo-targeting
602
- timeout (Optional[int]): Request timeout in milliseconds
603
- scrape_options (Optional[ScrapeOptions]): Result scraping configuration
604
- **kwargs: Additional keyword arguments for future compatibility
605
-
606
- Returns:
607
- SearchResponse: Response containing:
608
- * success (bool): Whether request succeeded
609
- * data (List[FirecrawlDocument]): Search results
610
- * warning (Optional[str]): Warning message if any
611
- * error (Optional[str]): Error message if any
612
-
613
- Raises:
614
- Exception: If search fails or response cannot be parsed
615
- """
616
- # Validate any additional kwargs
617
- self._validate_kwargs(kwargs, "search")
618
-
619
- # Build search parameters
620
- search_params = {}
621
-
622
- # Add individual parameters
623
- if limit is not None:
624
- search_params['limit'] = limit
625
- if tbs is not None:
626
- search_params['tbs'] = tbs
627
- if filter is not None:
628
- search_params['filter'] = filter
629
- if lang is not None:
630
- search_params['lang'] = lang
631
- if country is not None:
632
- search_params['country'] = country
633
- if location is not None:
634
- search_params['location'] = location
635
- if timeout is not None:
636
- search_params['timeout'] = timeout
637
- if scrape_options is not None:
638
- search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
639
-
640
- # Add any additional kwargs
641
- search_params.update(kwargs)
642
-
643
- # Create final params object
644
- final_params = SearchParams(query=query, **search_params)
645
- params_dict = final_params.dict(exclude_none=True)
646
- params_dict['origin'] = f"python-sdk@{version}"
647
-
648
- # Make request
649
- response = requests.post(
650
- f"{self.api_url}/v1/search",
651
- headers={"Authorization": f"Bearer {self.api_key}"},
652
- json=params_dict
653
- )
654
-
655
- if response.status_code == 200:
656
- try:
657
- response_json = response.json()
658
- if response_json.get('success') and 'data' in response_json:
659
- return SearchResponse(**response_json)
660
- elif "error" in response_json:
661
- raise Exception(f'Search failed. Error: {response_json["error"]}')
662
- else:
663
- raise Exception(f'Search failed. Error: {response_json}')
664
- except ValueError:
665
- raise Exception('Failed to parse Firecrawl response as JSON.')
666
- else:
667
- self._handle_error(response, 'search')
668
-
669
- def crawl_url(
670
- self,
671
- url: str,
672
- *,
673
- include_paths: Optional[List[str]] = None,
674
- exclude_paths: Optional[List[str]] = None,
675
- max_depth: Optional[int] = None,
676
- max_discovery_depth: Optional[int] = None,
677
- limit: Optional[int] = None,
678
- allow_backward_links: Optional[bool] = None,
679
- allow_external_links: Optional[bool] = None,
680
- ignore_sitemap: Optional[bool] = None,
681
- scrape_options: Optional[ScrapeOptions] = None,
682
- webhook: Optional[Union[str, WebhookConfig]] = None,
683
- deduplicate_similar_urls: Optional[bool] = None,
684
- ignore_query_parameters: Optional[bool] = None,
685
- regex_on_full_url: Optional[bool] = None,
686
- delay: Optional[int] = None,
687
- poll_interval: Optional[int] = 2,
688
- idempotency_key: Optional[str] = None,
689
- **kwargs
690
- ) -> CrawlStatusResponse:
691
- """
692
- Crawl a website starting from a URL.
693
-
694
- Args:
695
- url (str): Target URL to start crawling from
696
- include_paths (Optional[List[str]]): Patterns of URLs to include
697
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
698
- max_depth (Optional[int]): Maximum crawl depth
699
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
700
- limit (Optional[int]): Maximum pages to crawl
701
- allow_backward_links (Optional[bool]): Follow parent directory links
702
- allow_external_links (Optional[bool]): Follow external domain links
703
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
704
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
705
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
706
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
707
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
708
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
709
- delay (Optional[int]): Delay in seconds between scrapes
710
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
711
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
712
- **kwargs: Additional parameters to pass to the API
713
-
714
- Returns:
715
- CrawlStatusResponse with:
716
- * Crawling status and progress
717
- * Crawled page contents
718
- * Success/error information
719
-
720
- Raises:
721
- Exception: If crawl fails
722
- """
723
- # Validate any additional kwargs
724
- self._validate_kwargs(kwargs, "crawl_url")
725
-
726
- crawl_params = {}
727
-
728
- # Add individual parameters
729
- if include_paths is not None:
730
- crawl_params['includePaths'] = include_paths
731
- if exclude_paths is not None:
732
- crawl_params['excludePaths'] = exclude_paths
733
- if max_depth is not None:
734
- crawl_params['maxDepth'] = max_depth
735
- if max_discovery_depth is not None:
736
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
737
- if limit is not None:
738
- crawl_params['limit'] = limit
739
- if allow_backward_links is not None:
740
- crawl_params['allowBackwardLinks'] = allow_backward_links
741
- if allow_external_links is not None:
742
- crawl_params['allowExternalLinks'] = allow_external_links
743
- if ignore_sitemap is not None:
744
- crawl_params['ignoreSitemap'] = ignore_sitemap
745
- if scrape_options is not None:
746
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
747
- if webhook is not None:
748
- crawl_params['webhook'] = webhook
749
- if deduplicate_similar_urls is not None:
750
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
751
- if ignore_query_parameters is not None:
752
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
753
- if regex_on_full_url is not None:
754
- crawl_params['regexOnFullURL'] = regex_on_full_url
755
- if delay is not None:
756
- crawl_params['delay'] = delay
757
-
758
- # Add any additional kwargs
759
- crawl_params.update(kwargs)
760
-
761
- # Create final params object
762
- final_params = CrawlParams(**crawl_params)
763
- params_dict = final_params.dict(exclude_none=True)
764
- params_dict['url'] = url
765
- params_dict['origin'] = f"python-sdk@{version}"
766
-
767
- # Make request
768
- headers = self._prepare_headers(idempotency_key)
769
- response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
770
-
771
- if response.status_code == 200:
772
- try:
773
- id = response.json().get('id')
774
- except:
775
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
776
- return self._monitor_job_status(id, headers, poll_interval)
777
- else:
778
- self._handle_error(response, 'start crawl job')
779
-
780
- def async_crawl_url(
781
- self,
782
- url: str,
783
- *,
784
- include_paths: Optional[List[str]] = None,
785
- exclude_paths: Optional[List[str]] = None,
786
- max_depth: Optional[int] = None,
787
- max_discovery_depth: Optional[int] = None,
788
- limit: Optional[int] = None,
789
- allow_backward_links: Optional[bool] = None,
790
- allow_external_links: Optional[bool] = None,
791
- ignore_sitemap: Optional[bool] = None,
792
- scrape_options: Optional[ScrapeOptions] = None,
793
- webhook: Optional[Union[str, WebhookConfig]] = None,
794
- deduplicate_similar_urls: Optional[bool] = None,
795
- ignore_query_parameters: Optional[bool] = None,
796
- regex_on_full_url: Optional[bool] = None,
797
- delay: Optional[int] = None,
798
- idempotency_key: Optional[str] = None,
799
- **kwargs
800
- ) -> CrawlResponse:
801
- """
802
- Start an asynchronous crawl job.
803
-
804
- Args:
805
- url (str): Target URL to start crawling from
806
- include_paths (Optional[List[str]]): Patterns of URLs to include
807
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
808
- max_depth (Optional[int]): Maximum crawl depth
809
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
810
- limit (Optional[int]): Maximum pages to crawl
811
- allow_backward_links (Optional[bool]): Follow parent directory links
812
- allow_external_links (Optional[bool]): Follow external domain links
813
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
814
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
815
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
816
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
817
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
818
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
819
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
820
- **kwargs: Additional parameters to pass to the API
821
-
822
- Returns:
823
- CrawlResponse with:
824
- * success - Whether crawl started successfully
825
- * id - Unique identifier for the crawl job
826
- * url - Status check URL for the crawl
827
- * error - Error message if start failed
828
-
829
- Raises:
830
- Exception: If crawl initiation fails
831
- """
832
- # Validate any additional kwargs
833
- self._validate_kwargs(kwargs, "async_crawl_url")
834
-
835
- crawl_params = {}
836
-
837
- # Add individual parameters
838
- if include_paths is not None:
839
- crawl_params['includePaths'] = include_paths
840
- if exclude_paths is not None:
841
- crawl_params['excludePaths'] = exclude_paths
842
- if max_depth is not None:
843
- crawl_params['maxDepth'] = max_depth
844
- if max_discovery_depth is not None:
845
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
846
- if limit is not None:
847
- crawl_params['limit'] = limit
848
- if allow_backward_links is not None:
849
- crawl_params['allowBackwardLinks'] = allow_backward_links
850
- if allow_external_links is not None:
851
- crawl_params['allowExternalLinks'] = allow_external_links
852
- if ignore_sitemap is not None:
853
- crawl_params['ignoreSitemap'] = ignore_sitemap
854
- if scrape_options is not None:
855
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
856
- if webhook is not None:
857
- crawl_params['webhook'] = webhook
858
- if deduplicate_similar_urls is not None:
859
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
860
- if ignore_query_parameters is not None:
861
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
862
- if regex_on_full_url is not None:
863
- crawl_params['regexOnFullURL'] = regex_on_full_url
864
- if delay is not None:
865
- crawl_params['delay'] = delay
866
-
867
- # Add any additional kwargs
868
- crawl_params.update(kwargs)
869
-
870
- # Create final params object
871
- final_params = CrawlParams(**crawl_params)
872
- params_dict = final_params.dict(exclude_none=True)
873
- params_dict['url'] = url
874
- params_dict['origin'] = f"python-sdk@{version}"
875
-
876
- # Make request
877
- headers = self._prepare_headers(idempotency_key)
878
- response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
879
-
880
- if response.status_code == 200:
881
- try:
882
- return CrawlResponse(**response.json())
883
- except:
884
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
885
- else:
886
- self._handle_error(response, 'start crawl job')
887
-
888
- def check_crawl_status(self, id: str) -> CrawlStatusResponse:
889
- """
890
- Check the status and results of a crawl job.
891
-
892
- Args:
893
- id: Unique identifier for the crawl job
894
-
895
- Returns:
896
- CrawlStatusResponse containing:
897
-
898
- Status Information:
899
- * status - Current state (scraping/completed/failed/cancelled)
900
- * completed - Number of pages crawled
901
- * total - Total pages to crawl
902
- * creditsUsed - API credits consumed
903
- * expiresAt - Data expiration timestamp
904
-
905
- Results:
906
- * data - List of crawled documents
907
- * next - URL for next page of results (if paginated)
908
- * success - Whether status check succeeded
909
- * error - Error message if failed
910
-
911
- Raises:
912
- Exception: If status check fails
913
- """
914
- endpoint = f'/v1/crawl/{id}'
915
-
916
- headers = self._prepare_headers()
917
- response = self._get_request(f'{self.api_url}{endpoint}', headers)
918
- if response.status_code == 200:
919
- try:
920
- status_data = response.json()
921
- except:
922
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
923
- if status_data['status'] == 'completed':
924
- if 'data' in status_data:
925
- data = status_data['data']
926
- while 'next' in status_data:
927
- if len(status_data['data']) == 0:
928
- break
929
- next_url = status_data.get('next')
930
- if not next_url:
931
- logger.warning("Expected 'next' URL is missing.")
932
- break
933
- try:
934
- status_response = self._get_request(next_url, headers)
935
- if status_response.status_code != 200:
936
- logger.error(f"Failed to fetch next page: {status_response.status_code}")
937
- break
938
- try:
939
- next_data = status_response.json()
940
- except:
941
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
942
- data.extend(next_data.get('data', []))
943
- status_data = next_data
944
- except Exception as e:
945
- logger.error(f"Error during pagination request: {e}")
946
- break
947
- status_data['data'] = data
948
-
949
- response = {
950
- 'status': status_data.get('status'),
951
- 'total': status_data.get('total'),
952
- 'completed': status_data.get('completed'),
953
- 'creditsUsed': status_data.get('creditsUsed'),
954
- 'expiresAt': status_data.get('expiresAt'),
955
- 'data': status_data.get('data')
956
- }
957
-
958
- if 'error' in status_data:
959
- response['error'] = status_data['error']
960
-
961
- if 'next' in status_data:
962
- response['next'] = status_data['next']
963
-
964
- return CrawlStatusResponse(
965
- success=False if 'error' in status_data else True,
966
- **response
967
- )
968
- else:
969
- self._handle_error(response, 'check crawl status')
970
-
971
- def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
972
- """
973
- Returns information about crawl errors.
974
-
975
- Args:
976
- id (str): The ID of the crawl job
977
-
978
- Returns:
979
- CrawlErrorsResponse containing:
980
- * errors (List[Dict[str, str]]): List of errors with fields:
981
- - id (str): Error ID
982
- - timestamp (str): When the error occurred
983
- - url (str): URL that caused the error
984
- - error (str): Error message
985
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
986
-
987
- Raises:
988
- Exception: If error check fails
989
- """
990
- headers = self._prepare_headers()
991
- response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
992
- if response.status_code == 200:
993
- try:
994
- return CrawlErrorsResponse(**response.json())
995
- except:
996
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
997
- else:
998
- self._handle_error(response, "check crawl errors")
999
-
1000
- def cancel_crawl(self, id: str) -> Dict[str, Any]:
1001
- """
1002
- Cancel an asynchronous crawl job.
1003
-
1004
- Args:
1005
- id (str): The ID of the crawl job to cancel
1006
-
1007
- Returns:
1008
- Dict[str, Any] containing:
1009
- * success (bool): Whether cancellation was successful
1010
- * error (str, optional): Error message if cancellation failed
1011
-
1012
- Raises:
1013
- Exception: If cancellation fails
1014
- """
1015
- headers = self._prepare_headers()
1016
- response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
1017
- if response.status_code == 200:
1018
- try:
1019
- return response.json()
1020
- except:
1021
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1022
- else:
1023
- self._handle_error(response, "cancel crawl job")
1024
-
1025
- def crawl_url_and_watch(
1026
- self,
1027
- url: str,
1028
- *,
1029
- include_paths: Optional[List[str]] = None,
1030
- exclude_paths: Optional[List[str]] = None,
1031
- max_depth: Optional[int] = None,
1032
- max_discovery_depth: Optional[int] = None,
1033
- limit: Optional[int] = None,
1034
- allow_backward_links: Optional[bool] = None,
1035
- allow_external_links: Optional[bool] = None,
1036
- ignore_sitemap: Optional[bool] = None,
1037
- scrape_options: Optional[ScrapeOptions] = None,
1038
- webhook: Optional[Union[str, WebhookConfig]] = None,
1039
- deduplicate_similar_urls: Optional[bool] = None,
1040
- ignore_query_parameters: Optional[bool] = None,
1041
- regex_on_full_url: Optional[bool] = None,
1042
- idempotency_key: Optional[str] = None,
1043
- **kwargs
1044
- ) -> 'CrawlWatcher':
1045
- """
1046
- Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
1047
-
1048
- Args:
1049
- url (str): Target URL to start crawling from
1050
- include_paths (Optional[List[str]]): Patterns of URLs to include
1051
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
1052
- max_depth (Optional[int]): Maximum crawl depth
1053
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
1054
- limit (Optional[int]): Maximum pages to crawl
1055
- allow_backward_links (Optional[bool]): Follow parent directory links
1056
- allow_external_links (Optional[bool]): Follow external domain links
1057
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1058
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
1059
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
1060
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
1061
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
1062
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
1063
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1064
- **kwargs: Additional parameters to pass to the API
1065
-
1066
- Returns:
1067
- CrawlWatcher: An instance to monitor the crawl job via WebSocket
1068
-
1069
- Raises:
1070
- Exception: If crawl job fails to start
1071
- """
1072
- crawl_response = self.async_crawl_url(
1073
- url,
1074
- include_paths=include_paths,
1075
- exclude_paths=exclude_paths,
1076
- max_depth=max_depth,
1077
- max_discovery_depth=max_discovery_depth,
1078
- limit=limit,
1079
- allow_backward_links=allow_backward_links,
1080
- allow_external_links=allow_external_links,
1081
- ignore_sitemap=ignore_sitemap,
1082
- scrape_options=scrape_options,
1083
- webhook=webhook,
1084
- deduplicate_similar_urls=deduplicate_similar_urls,
1085
- ignore_query_parameters=ignore_query_parameters,
1086
- regex_on_full_url=regex_on_full_url,
1087
- idempotency_key=idempotency_key,
1088
- **kwargs
1089
- )
1090
- if crawl_response.success and crawl_response.id:
1091
- return CrawlWatcher(crawl_response.id, self)
1092
- else:
1093
- raise Exception("Crawl job failed to start")
1094
-
1095
- def map_url(
1096
- self,
1097
- url: str,
1098
- *,
1099
- search: Optional[str] = None,
1100
- ignore_sitemap: Optional[bool] = None,
1101
- include_subdomains: Optional[bool] = None,
1102
- sitemap_only: Optional[bool] = None,
1103
- limit: Optional[int] = None,
1104
- timeout: Optional[int] = None,
1105
- **kwargs) -> MapResponse:
1106
- """
1107
- Map and discover links from a URL.
1108
-
1109
- Args:
1110
- url (str): Target URL to map
1111
- search (Optional[str]): Filter pattern for URLs
1112
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1113
- include_subdomains (Optional[bool]): Include subdomain links
1114
- sitemap_only (Optional[bool]): Only use sitemap.xml
1115
- limit (Optional[int]): Maximum URLs to return
1116
- timeout (Optional[int]): Request timeout in milliseconds
1117
- **kwargs: Additional parameters to pass to the API
1118
-
1119
- Returns:
1120
- MapResponse: Response containing:
1121
- * success (bool): Whether request succeeded
1122
- * links (List[str]): Discovered URLs
1123
- * error (Optional[str]): Error message if any
1124
-
1125
- Raises:
1126
- Exception: If mapping fails or response cannot be parsed
1127
- """
1128
- # Validate any additional kwargs
1129
- self._validate_kwargs(kwargs, "map_url")
1130
-
1131
- # Build map parameters
1132
- map_params = {}
1133
-
1134
- # Add individual parameters
1135
- if search is not None:
1136
- map_params['search'] = search
1137
- if ignore_sitemap is not None:
1138
- map_params['ignoreSitemap'] = ignore_sitemap
1139
- if include_subdomains is not None:
1140
- map_params['includeSubdomains'] = include_subdomains
1141
- if sitemap_only is not None:
1142
- map_params['sitemapOnly'] = sitemap_only
1143
- if limit is not None:
1144
- map_params['limit'] = limit
1145
- if timeout is not None:
1146
- map_params['timeout'] = timeout
1147
-
1148
- # Add any additional kwargs
1149
- map_params.update(kwargs)
1150
-
1151
- # Create final params object
1152
- final_params = MapParams(**map_params)
1153
- params_dict = final_params.dict(exclude_none=True)
1154
- params_dict['url'] = url
1155
- params_dict['origin'] = f"python-sdk@{version}"
1156
-
1157
- # Make request
1158
- response = requests.post(
1159
- f"{self.api_url}/v1/map",
1160
- headers={"Authorization": f"Bearer {self.api_key}"},
1161
- json=params_dict
1162
- )
1163
-
1164
- if response.status_code == 200:
1165
- try:
1166
- response_json = response.json()
1167
- if response_json.get('success') and 'links' in response_json:
1168
- return MapResponse(**response_json)
1169
- elif "error" in response_json:
1170
- raise Exception(f'Map failed. Error: {response_json["error"]}')
1171
- else:
1172
- raise Exception(f'Map failed. Error: {response_json}')
1173
- except ValueError:
1174
- raise Exception('Failed to parse Firecrawl response as JSON.')
1175
- else:
1176
- self._handle_error(response, 'map')
1177
-
1178
- def batch_scrape_urls(
1179
- self,
1180
- urls: List[str],
1181
- *,
1182
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1183
- headers: Optional[Dict[str, str]] = None,
1184
- include_tags: Optional[List[str]] = None,
1185
- exclude_tags: Optional[List[str]] = None,
1186
- only_main_content: Optional[bool] = None,
1187
- wait_for: Optional[int] = None,
1188
- timeout: Optional[int] = None,
1189
- location: Optional[LocationConfig] = None,
1190
- mobile: Optional[bool] = None,
1191
- skip_tls_verification: Optional[bool] = None,
1192
- remove_base64_images: Optional[bool] = None,
1193
- block_ads: Optional[bool] = None,
1194
- proxy: Optional[Literal["basic", "stealth"]] = None,
1195
- extract: Optional[JsonConfig] = None,
1196
- json_options: Optional[JsonConfig] = None,
1197
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1198
- agent: Optional[AgentOptions] = None,
1199
- poll_interval: Optional[int] = 2,
1200
- idempotency_key: Optional[str] = None,
1201
- **kwargs
1202
- ) -> BatchScrapeStatusResponse:
1203
- """
1204
- Batch scrape multiple URLs and monitor until completion.
1205
-
1206
- Args:
1207
- urls (List[str]): URLs to scrape
1208
- formats (Optional[List[Literal]]): Content formats to retrieve
1209
- headers (Optional[Dict[str, str]]): Custom HTTP headers
1210
- include_tags (Optional[List[str]]): HTML tags to include
1211
- exclude_tags (Optional[List[str]]): HTML tags to exclude
1212
- only_main_content (Optional[bool]): Extract main content only
1213
- wait_for (Optional[int]): Wait time in milliseconds
1214
- timeout (Optional[int]): Request timeout in milliseconds
1215
- location (Optional[LocationConfig]): Location configuration
1216
- mobile (Optional[bool]): Use mobile user agent
1217
- skip_tls_verification (Optional[bool]): Skip TLS verification
1218
- remove_base64_images (Optional[bool]): Remove base64 encoded images
1219
- block_ads (Optional[bool]): Block advertisements
1220
- proxy (Optional[Literal]): Proxy type to use
1221
- extract (Optional[JsonConfig]): Content extraction config
1222
- json_options (Optional[JsonConfig]): JSON extraction config
1223
- actions (Optional[List[Union]]): Actions to perform
1224
- agent (Optional[AgentOptions]): Agent configuration
1225
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
1226
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1227
- **kwargs: Additional parameters to pass to the API
1228
-
1229
- Returns:
1230
- BatchScrapeStatusResponse with:
1231
- * Scraping status and progress
1232
- * Scraped content for each URL
1233
- * Success/error information
1234
-
1235
- Raises:
1236
- Exception: If batch scrape fails
1237
- """
1238
- # Validate any additional kwargs
1239
- self._validate_kwargs(kwargs, "batch_scrape_urls")
1240
-
1241
- scrape_params = {}
1242
-
1243
- # Add individual parameters
1244
- if formats is not None:
1245
- scrape_params['formats'] = formats
1246
- if headers is not None:
1247
- scrape_params['headers'] = headers
1248
- if include_tags is not None:
1249
- scrape_params['includeTags'] = include_tags
1250
- if exclude_tags is not None:
1251
- scrape_params['excludeTags'] = exclude_tags
1252
- if only_main_content is not None:
1253
- scrape_params['onlyMainContent'] = only_main_content
1254
- if wait_for is not None:
1255
- scrape_params['waitFor'] = wait_for
1256
- if timeout is not None:
1257
- scrape_params['timeout'] = timeout
1258
- if location is not None:
1259
- scrape_params['location'] = location.dict(exclude_none=True)
1260
- if mobile is not None:
1261
- scrape_params['mobile'] = mobile
1262
- if skip_tls_verification is not None:
1263
- scrape_params['skipTlsVerification'] = skip_tls_verification
1264
- if remove_base64_images is not None:
1265
- scrape_params['removeBase64Images'] = remove_base64_images
1266
- if block_ads is not None:
1267
- scrape_params['blockAds'] = block_ads
1268
- if proxy is not None:
1269
- scrape_params['proxy'] = proxy
1270
- if extract is not None:
1271
- extract = self._ensure_schema_dict(extract)
1272
- if isinstance(extract, dict) and "schema" in extract:
1273
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
1274
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1275
- if json_options is not None:
1276
- json_options = self._ensure_schema_dict(json_options)
1277
- if isinstance(json_options, dict) and "schema" in json_options:
1278
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1279
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1280
- if actions is not None:
1281
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1282
- if agent is not None:
1283
- scrape_params['agent'] = agent.dict(exclude_none=True)
1284
-
1285
- # Add any additional kwargs
1286
- scrape_params.update(kwargs)
1287
-
1288
- # Create final params object
1289
- final_params = ScrapeParams(**scrape_params)
1290
- params_dict = final_params.dict(exclude_none=True)
1291
- params_dict['urls'] = urls
1292
- params_dict['origin'] = f"python-sdk@{version}"
1293
-
1294
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1295
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1296
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1297
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1298
-
1299
- # Make request
1300
- headers = self._prepare_headers(idempotency_key)
1301
- response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1302
-
1303
- if response.status_code == 200:
1304
- try:
1305
- id = response.json().get('id')
1306
- except:
1307
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1308
- return self._monitor_job_status(id, headers, poll_interval)
1309
- else:
1310
- self._handle_error(response, 'start batch scrape job')
1311
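A minimal usage sketch for the blocking batch scrape above. The constructor call, API key, and URLs are placeholder assumptions; only keyword arguments from the signature above are used, and the status fields referenced are those listed in the docstring.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor; replace with a real key

# Blocks until the batch scrape job finishes, polling every 2 seconds.
status = app.batch_scrape_urls(
    ["https://example.com", "https://example.org"],  # placeholder URLs
    formats=["markdown", "html"],
    only_main_content=True,
    poll_interval=2,
)

print(status.status, f"{status.completed}/{status.total} pages scraped")
for doc in status.data or []:
    print(doc)  # one scraped document per URL
```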
-
1312
- def async_batch_scrape_urls(
1313
- self,
1314
- urls: List[str],
1315
- *,
1316
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1317
- headers: Optional[Dict[str, str]] = None,
1318
- include_tags: Optional[List[str]] = None,
1319
- exclude_tags: Optional[List[str]] = None,
1320
- only_main_content: Optional[bool] = None,
1321
- wait_for: Optional[int] = None,
1322
- timeout: Optional[int] = None,
1323
- location: Optional[LocationConfig] = None,
1324
- mobile: Optional[bool] = None,
1325
- skip_tls_verification: Optional[bool] = None,
1326
- remove_base64_images: Optional[bool] = None,
1327
- block_ads: Optional[bool] = None,
1328
- proxy: Optional[Literal["basic", "stealth"]] = None,
1329
- extract: Optional[JsonConfig] = None,
1330
- json_options: Optional[JsonConfig] = None,
1331
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1332
- agent: Optional[AgentOptions] = None,
1333
- idempotency_key: Optional[str] = None,
1334
- **kwargs
1335
- ) -> BatchScrapeResponse:
1336
- """
1337
- Initiate a batch scrape job asynchronously.
1338
-
1339
- Args:
1340
- urls (List[str]): URLs to scrape
1341
- formats (Optional[List[Literal]]): Content formats to retrieve
1342
- headers (Optional[Dict[str, str]]): Custom HTTP headers
1343
- include_tags (Optional[List[str]]): HTML tags to include
1344
- exclude_tags (Optional[List[str]]): HTML tags to exclude
1345
- only_main_content (Optional[bool]): Extract main content only
1346
- wait_for (Optional[int]): Wait time in milliseconds
1347
- timeout (Optional[int]): Request timeout in milliseconds
1348
- location (Optional[LocationConfig]): Location configuration
1349
- mobile (Optional[bool]): Use mobile user agent
1350
- skip_tls_verification (Optional[bool]): Skip TLS verification
1351
- remove_base64_images (Optional[bool]): Remove base64 encoded images
1352
- block_ads (Optional[bool]): Block advertisements
1353
- proxy (Optional[Literal]): Proxy type to use
1354
- extract (Optional[JsonConfig]): Content extraction config
1355
- json_options (Optional[JsonConfig]): JSON extraction config
1356
- actions (Optional[List[Union]]): Actions to perform
1357
- agent (Optional[AgentOptions]): Agent configuration
1358
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1359
- **kwargs: Additional parameters to pass to the API
1360
-
1361
- Returns:
1362
- BatchScrapeResponse with:
1363
- * success - Whether job started successfully
1364
- * id - Unique identifier for the job
1365
- * url - Status check URL
1366
- * error - Error message if start failed
1367
-
1368
- Raises:
1369
- Exception: If job initiation fails
1370
- """
1371
- # Validate any additional kwargs
1372
- self._validate_kwargs(kwargs, "async_batch_scrape_urls")
1373
-
1374
- scrape_params = {}
1375
-
1376
- # Add individual parameters
1377
- if formats is not None:
1378
- scrape_params['formats'] = formats
1379
- if headers is not None:
1380
- scrape_params['headers'] = headers
1381
- if include_tags is not None:
1382
- scrape_params['includeTags'] = include_tags
1383
- if exclude_tags is not None:
1384
- scrape_params['excludeTags'] = exclude_tags
1385
- if only_main_content is not None:
1386
- scrape_params['onlyMainContent'] = only_main_content
1387
- if wait_for is not None:
1388
- scrape_params['waitFor'] = wait_for
1389
- if timeout is not None:
1390
- scrape_params['timeout'] = timeout
1391
- if location is not None:
1392
- scrape_params['location'] = location.dict(exclude_none=True)
1393
- if mobile is not None:
1394
- scrape_params['mobile'] = mobile
1395
- if skip_tls_verification is not None:
1396
- scrape_params['skipTlsVerification'] = skip_tls_verification
1397
- if remove_base64_images is not None:
1398
- scrape_params['removeBase64Images'] = remove_base64_images
1399
- if block_ads is not None:
1400
- scrape_params['blockAds'] = block_ads
1401
- if proxy is not None:
1402
- scrape_params['proxy'] = proxy
1403
- if extract is not None:
1404
- extract = self._ensure_schema_dict(extract)
1405
- if isinstance(extract, dict) and "schema" in extract:
1406
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
1407
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1408
- if json_options is not None:
1409
- json_options = self._ensure_schema_dict(json_options)
1410
- if isinstance(json_options, dict) and "schema" in json_options:
1411
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1412
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1413
- if actions is not None:
1414
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1415
- if agent is not None:
1416
- scrape_params['agent'] = agent.dict(exclude_none=True)
1417
-
1418
- # Add any additional kwargs
1419
- scrape_params.update(kwargs)
1420
-
1421
- # Create final params object
1422
- final_params = ScrapeParams(**scrape_params)
1423
- params_dict = final_params.dict(exclude_none=True)
1424
- params_dict['urls'] = urls
1425
- params_dict['origin'] = f"python-sdk@{version}"
1426
-
1427
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1428
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1429
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1430
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1431
-
1432
- # Make request
1433
- headers = self._prepare_headers(idempotency_key)
1434
- response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1435
-
1436
- if response.status_code == 200:
1437
- try:
1438
- return BatchScrapeResponse(**response.json())
1439
- except:
1440
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1441
- else:
1442
- self._handle_error(response, 'start batch scrape job')
1443
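A sketch of the non-blocking variant, assuming the same placeholder app construction; it starts the job and checks on it once via check_batch_scrape_status (defined further below).

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor

job = app.async_batch_scrape_urls(
    ["https://example.com"],  # placeholder URL
    formats=["markdown"],
)
if job.success:
    # The job runs server-side; poll its status whenever convenient.
    status = app.check_batch_scrape_status(job.id)
    print(status.status, status.completed, status.total)
else:
    print("Failed to start:", job.error)
```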
-
1444
- def batch_scrape_urls_and_watch(
1445
- self,
1446
- urls: List[str],
1447
- *,
1448
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1449
- headers: Optional[Dict[str, str]] = None,
1450
- include_tags: Optional[List[str]] = None,
1451
- exclude_tags: Optional[List[str]] = None,
1452
- only_main_content: Optional[bool] = None,
1453
- wait_for: Optional[int] = None,
1454
- timeout: Optional[int] = None,
1455
- location: Optional[LocationConfig] = None,
1456
- mobile: Optional[bool] = None,
1457
- skip_tls_verification: Optional[bool] = None,
1458
- remove_base64_images: Optional[bool] = None,
1459
- block_ads: Optional[bool] = None,
1460
- proxy: Optional[Literal["basic", "stealth"]] = None,
1461
- extract: Optional[JsonConfig] = None,
1462
- json_options: Optional[JsonConfig] = None,
1463
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1464
- agent: Optional[AgentOptions] = None,
1465
- idempotency_key: Optional[str] = None,
1466
- **kwargs
1467
- ) -> 'CrawlWatcher':
1468
- """
1469
- Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
1470
-
1471
- Args:
1472
- urls (List[str]): URLs to scrape
1473
- formats (Optional[List[Literal]]): Content formats to retrieve
1474
- headers (Optional[Dict[str, str]]): Custom HTTP headers
1475
- include_tags (Optional[List[str]]): HTML tags to include
1476
- exclude_tags (Optional[List[str]]): HTML tags to exclude
1477
- only_main_content (Optional[bool]): Extract main content only
1478
- wait_for (Optional[int]): Wait time in milliseconds
1479
- timeout (Optional[int]): Request timeout in milliseconds
1480
- location (Optional[LocationConfig]): Location configuration
1481
- mobile (Optional[bool]): Use mobile user agent
1482
- skip_tls_verification (Optional[bool]): Skip TLS verification
1483
- remove_base64_images (Optional[bool]): Remove base64 encoded images
1484
- block_ads (Optional[bool]): Block advertisements
1485
- proxy (Optional[Literal]): Proxy type to use
1486
- extract (Optional[JsonConfig]): Content extraction config
1487
- json_options (Optional[JsonConfig]): JSON extraction config
1488
- actions (Optional[List[Union]]): Actions to perform
1489
- agent (Optional[AgentOptions]): Agent configuration
1490
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1491
- **kwargs: Additional parameters to pass to the API
1492
-
1493
- Returns:
1494
- CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
1495
-
1496
- Raises:
1497
- Exception: If batch scrape job fails to start
1498
- """
1499
- # Validate any additional kwargs
1500
- self._validate_kwargs(kwargs, "batch_scrape_urls_and_watch")
1501
-
1502
- scrape_params = {}
1503
-
1504
- # Add individual parameters
1505
- if formats is not None:
1506
- scrape_params['formats'] = formats
1507
- if headers is not None:
1508
- scrape_params['headers'] = headers
1509
- if include_tags is not None:
1510
- scrape_params['includeTags'] = include_tags
1511
- if exclude_tags is not None:
1512
- scrape_params['excludeTags'] = exclude_tags
1513
- if only_main_content is not None:
1514
- scrape_params['onlyMainContent'] = only_main_content
1515
- if wait_for is not None:
1516
- scrape_params['waitFor'] = wait_for
1517
- if timeout is not None:
1518
- scrape_params['timeout'] = timeout
1519
- if location is not None:
1520
- scrape_params['location'] = location.dict(exclude_none=True)
1521
- if mobile is not None:
1522
- scrape_params['mobile'] = mobile
1523
- if skip_tls_verification is not None:
1524
- scrape_params['skipTlsVerification'] = skip_tls_verification
1525
- if remove_base64_images is not None:
1526
- scrape_params['removeBase64Images'] = remove_base64_images
1527
- if block_ads is not None:
1528
- scrape_params['blockAds'] = block_ads
1529
- if proxy is not None:
1530
- scrape_params['proxy'] = proxy
1531
- if extract is not None:
1532
- extract = self._ensure_schema_dict(extract)
1533
- if isinstance(extract, dict) and "schema" in extract:
1534
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
1535
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1536
- if json_options is not None:
1537
- json_options = self._ensure_schema_dict(json_options)
1538
- if isinstance(json_options, dict) and "schema" in json_options:
1539
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1540
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1541
- if actions is not None:
1542
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1543
- if agent is not None:
1544
- scrape_params['agent'] = agent.dict(exclude_none=True)
1545
-
1546
- # Add any additional kwargs
1547
- scrape_params.update(kwargs)
1548
-
1549
- # Create final params object
1550
- final_params = ScrapeParams(**scrape_params)
1551
- params_dict = final_params.dict(exclude_none=True)
1552
- params_dict['urls'] = urls
1553
- params_dict['origin'] = f"python-sdk@{version}"
1554
-
1555
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1556
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1557
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1558
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1559
-
1560
- # Make request
1561
- headers = self._prepare_headers(idempotency_key)
1562
- response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1563
-
1564
- if response.status_code == 200:
1565
- try:
1566
- crawl_response = BatchScrapeResponse(**response.json())
1567
- except:
1568
- raise Exception('Failed to parse Firecrawl response as JSON.')
1569
- if crawl_response.success and crawl_response.id:
1570
- return CrawlWatcher(crawl_response.id, self)
1571
- else:
1572
- raise Exception("Batch scrape job failed to start")
1573
- else:
1574
- self._handle_error(response, 'start batch scrape job')
1575
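A sketch of driving the returned CrawlWatcher (defined at the end of this module), assuming a placeholder app and URL; the event payload keys match those dispatched in CrawlWatcher._handle_message.

```python
import asyncio
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor

watcher = app.batch_scrape_urls_and_watch(
    ["https://example.com"],  # placeholder URL
    formats=["markdown"],
)
watcher.add_event_listener("document", lambda event: print("received document from job", event["id"]))
watcher.add_event_listener("done", lambda event: print("finished with", len(event["data"]), "documents"))
watcher.add_event_listener("error", lambda event: print("failed:", event["error"]))

# connect() opens the WebSocket and dispatches events until the job ends.
asyncio.run(watcher.connect())
```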
-
1576
- def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
1577
- """
1578
- Check the status of a batch scrape job using the Firecrawl API.
1579
-
1580
- Args:
1581
- id (str): The ID of the batch scrape job.
1582
-
1583
- Returns:
1584
- BatchScrapeStatusResponse: The status of the batch scrape job.
1585
-
1586
- Raises:
1587
- Exception: If the status check request fails.
1588
- """
1589
- endpoint = f'/v1/batch/scrape/{id}'
1590
-
1591
- headers = self._prepare_headers()
1592
- response = self._get_request(f'{self.api_url}{endpoint}', headers)
1593
- if response.status_code == 200:
1594
- try:
1595
- status_data = response.json()
1596
- except:
1597
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1598
- if status_data['status'] == 'completed':
1599
- if 'data' in status_data:
1600
- data = status_data['data']
1601
- while 'next' in status_data:
1602
- if len(status_data['data']) == 0:
1603
- break
1604
- next_url = status_data.get('next')
1605
- if not next_url:
1606
- logger.warning("Expected 'next' URL is missing.")
1607
- break
1608
- try:
1609
- status_response = self._get_request(next_url, headers)
1610
- if status_response.status_code != 200:
1611
- logger.error(f"Failed to fetch next page: {status_response.status_code}")
1612
- break
1613
- try:
1614
- next_data = status_response.json()
1615
- except:
1616
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1617
- data.extend(next_data.get('data', []))
1618
- status_data = next_data
1619
- except Exception as e:
1620
- logger.error(f"Error during pagination request: {e}")
1621
- break
1622
- status_data['data'] = data
1623
-
1624
- return BatchScrapeStatusResponse(**{
1625
- 'success': False if 'error' in status_data else True,
1626
- 'status': status_data.get('status'),
1627
- 'total': status_data.get('total'),
1628
- 'completed': status_data.get('completed'),
1629
- 'creditsUsed': status_data.get('creditsUsed'),
1630
- 'expiresAt': status_data.get('expiresAt'),
1631
- 'data': status_data.get('data'),
1632
- 'next': status_data.get('next'),
1633
- 'error': status_data.get('error')
1634
- })
1635
- else:
1636
- self._handle_error(response, 'check batch scrape status')
1637
-
1638
- def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
1639
- """
1640
- Returns information about batch scrape errors.
1641
-
1642
- Args:
1643
- id (str): The ID of the crawl job.
1644
-
1645
- Returns:
1646
- CrawlErrorsResponse containing:
1647
- * errors (List[Dict[str, str]]): List of errors with fields:
1648
- * id (str): Error ID
1649
- * timestamp (str): When the error occurred
1650
- * url (str): URL that caused the error
1651
- * error (str): Error message
1652
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1653
-
1654
- Raises:
1655
- Exception: If the error check request fails
1656
- """
1657
- headers = self._prepare_headers()
1658
- response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
1659
- if response.status_code == 200:
1660
- try:
1661
- return CrawlErrorsResponse(**response.json())
1662
- except:
1663
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1664
- else:
1665
- self._handle_error(response, "check batch scrape errors")
1666
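A sketch of checking on a previously started job by ID; the job ID is a placeholder and the field names follow the docstrings above.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor
job_id = "batch-job-id"                        # placeholder ID from async_batch_scrape_urls

status = app.check_batch_scrape_status(job_id)
print(status.status, f"{status.completed}/{status.total}")

if status.status != "completed":
    report = app.check_batch_scrape_errors(job_id)
    for err in report.errors:
        print(err["url"], "->", err["error"])
    print("Blocked by robots.txt:", report.robotsBlocked)
```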
-
1667
- def extract(
1668
- self,
1669
- urls: Optional[List[str]] = None,
1670
- *,
1671
- prompt: Optional[str] = None,
1672
- schema: Optional[Any] = None,
1673
- system_prompt: Optional[str] = None,
1674
- allow_external_links: Optional[bool] = False,
1675
- enable_web_search: Optional[bool] = False,
1676
- show_sources: Optional[bool] = False,
1677
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1678
- """
1679
- Extract structured information from URLs.
1680
-
1681
- Args:
1682
- urls (Optional[List[str]]): URLs to extract from
1683
- prompt (Optional[str]): Custom extraction prompt
1684
- schema (Optional[Any]): JSON schema/Pydantic model
1685
- system_prompt (Optional[str]): System context
1686
- allow_external_links (Optional[bool]): Follow external links
1687
- enable_web_search (Optional[bool]): Enable web search
1688
- show_sources (Optional[bool]): Include source URLs
1689
- agent (Optional[Dict[str, Any]]): Agent configuration
1690
-
1691
- Returns:
1692
- ExtractResponse[Any] with:
1693
- * success (bool): Whether request succeeded
1694
- * data (Optional[Any]): Extracted data matching schema
1695
- * error (Optional[str]): Error message if any
1696
-
1697
- Raises:
1698
- ValueError: If prompt/schema missing or extraction fails
1699
- """
1700
- headers = self._prepare_headers()
1701
-
1702
- if not prompt and not schema:
1703
- raise ValueError("Either prompt or schema is required")
1704
-
1705
- if not urls and not prompt:
1706
- raise ValueError("Either urls or prompt is required")
1707
-
1708
- if schema:
1709
- schema = self._ensure_schema_dict(schema)
1710
-
1711
- request_data = {
1712
- 'urls': urls or [],
1713
- 'allowExternalLinks': allow_external_links,
1714
- 'enableWebSearch': enable_web_search,
1715
- 'showSources': show_sources,
1716
- 'schema': schema,
1717
- 'origin': f'python-sdk@{get_version()}'
1718
- }
1719
-
1720
- # Only add prompt and systemPrompt if they exist
1721
- if prompt:
1722
- request_data['prompt'] = prompt
1723
- if system_prompt:
1724
- request_data['systemPrompt'] = system_prompt
1725
-
1726
- if agent:
1727
- request_data['agent'] = agent
1728
-
1729
- try:
1730
- # Send the initial extract request
1731
- response = self._post_request(
1732
- f'{self.api_url}/v1/extract',
1733
- request_data,
1734
- headers
1735
- )
1736
- if response.status_code == 200:
1737
- try:
1738
- data = response.json()
1739
- except:
1740
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1741
- if data['success']:
1742
- job_id = data.get('id')
1743
- if not job_id:
1744
- raise Exception('Job ID not returned from extract request.')
1745
-
1746
- # Poll for the extract status
1747
- while True:
1748
- status_response = self._get_request(
1749
- f'{self.api_url}/v1/extract/{job_id}',
1750
- headers
1751
- )
1752
- if status_response.status_code == 200:
1753
- try:
1754
- status_data = status_response.json()
1755
- except:
1756
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1757
- if status_data['status'] == 'completed':
1758
- return ExtractResponse(**status_data)
1759
- elif status_data['status'] in ['failed', 'cancelled']:
1760
- raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
1761
- else:
1762
- self._handle_error(status_response, "extract-status")
1763
-
1764
- time.sleep(2) # Polling interval
1765
- else:
1766
- raise Exception(f'Failed to extract. Error: {data["error"]}')
1767
- else:
1768
- self._handle_error(response, "extract")
1769
- except Exception as e:
1770
- raise ValueError(str(e), 500)
1771
-
1772
- return ExtractResponse(success=False, error="Internal server error.")
1773
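A sketch of a structured extraction call, assuming a placeholder app; the Pydantic model is hypothetical and is converted to a JSON schema by _ensure_schema_dict before being sent.

```python
from pydantic import BaseModel
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor

class ArticleInfo(BaseModel):  # hypothetical schema for illustration
    title: str
    author: str

result = app.extract(
    urls=["https://example.com/blog/post"],  # placeholder URL
    prompt="Extract the article title and author.",
    schema=ArticleInfo,
)
if result.success:
    print(result.data)   # dict shaped like ArticleInfo
else:
    print(result.error)
```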
-
1774
- def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
1775
- """
1776
- Retrieve the status of an extract job.
1777
-
1778
- Args:
1779
- job_id (str): The ID of the extract job.
1780
-
1781
- Returns:
1782
- ExtractResponse[Any]: The status of the extract job.
1783
-
1784
- Raises:
1785
- ValueError: If there is an error retrieving the status.
1786
- """
1787
- headers = self._prepare_headers()
1788
- try:
1789
- response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
1790
- if response.status_code == 200:
1791
- try:
1792
- return ExtractResponse(**response.json())
1793
- except:
1794
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1795
- else:
1796
- self._handle_error(response, "get extract status")
1797
- except Exception as e:
1798
- raise ValueError(str(e), 500)
1799
-
1800
- def async_extract(
1801
- self,
1802
- urls: Optional[List[str]] = None,
1803
- *,
1804
- prompt: Optional[str] = None,
1805
- schema: Optional[Any] = None,
1806
- system_prompt: Optional[str] = None,
1807
- allow_external_links: Optional[bool] = False,
1808
- enable_web_search: Optional[bool] = False,
1809
- show_sources: Optional[bool] = False,
1810
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1811
- """
1812
- Initiate an asynchronous extract job.
1813
-
1814
- Args:
1815
- urls (Optional[List[str]]): URLs to extract information from
1816
- prompt (Optional[str]): Custom extraction prompt
1817
- schema (Optional[Any]): JSON schema/Pydantic model
1818
- system_prompt (Optional[str]): System context
1819
- allow_external_links (Optional[bool]): Follow external links
1820
- enable_web_search (Optional[bool]): Enable web search
1821
- show_sources (Optional[bool]): Include source URLs
1822
- agent (Optional[Dict[str, Any]]): Agent configuration
1823
-
1825
- Returns:
1826
- ExtractResponse[Any] with:
1827
- * success (bool): Whether request succeeded
1828
- * data (Optional[Any]): Extracted data matching schema
1829
- * error (Optional[str]): Error message if any
1830
-
1831
- Raises:
1832
- ValueError: If job initiation fails
1833
- """
1834
- headers = self._prepare_headers()
1835
-
1836
- if schema:
1838
- schema = self._ensure_schema_dict(schema)
1839
-
1840
- request_data = {
1841
- 'urls': urls,
1842
- 'allowExternalLinks': allow_external_links,
1843
- 'enableWebSearch': enable_web_search,
1844
- 'showSources': show_sources,
1845
- 'schema': schema,
1846
- 'origin': f'python-sdk@{version}'
1847
- }
1848
-
1849
- if prompt:
1850
- request_data['prompt'] = prompt
1851
- if system_prompt:
1852
- request_data['systemPrompt'] = system_prompt
1853
- if agent:
1854
- request_data['agent'] = agent
1855
-
1856
- try:
1857
- response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
1858
- if response.status_code == 200:
1859
- try:
1860
- return ExtractResponse(**response.json())
1861
- except:
1862
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1863
- else:
1864
- self._handle_error(response, "async extract")
1865
- except Exception as e:
1866
- raise ValueError(str(e), 500)
1867
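A polling sketch built on the two methods above, assuming a placeholder app and assuming the ExtractResponse model exposes the id and status fields returned by the API (as used in extract() above).

```python
import time
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor

job = app.async_extract(
    urls=["https://example.com"],  # placeholder URL
    prompt="Summarise the main offering on this page.",
)

# Poll until the job leaves the 'processing' state.
while True:
    status = app.get_extract_status(job.id)
    if status.status in ("completed", "failed", "cancelled"):
        break
    time.sleep(2)

print(status.status, status.data)
```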
-
1868
- def generate_llms_text(
1869
- self,
1870
- url: str,
1871
- *,
1872
- max_urls: Optional[int] = None,
1873
- show_full_text: Optional[bool] = None,
1874
- cache: Optional[bool] = None,
1875
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
1876
- """
1877
- Generate LLMs.txt for a given URL and poll until completion.
1878
-
1879
- Args:
1880
- url (str): Target URL to generate LLMs.txt from
1881
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
1882
- show_full_text (Optional[bool]): Include full text in output (default: False)
1883
- cache (Optional[bool]): Whether to use cached content if available (default: True)
1884
- experimental_stream (Optional[bool]): Enable experimental streaming
1885
-
1886
- Returns:
1887
- GenerateLLMsTextStatusResponse with:
1888
- * Generated LLMs.txt content
1889
- * Full version if requested
1890
- * Generation status
1891
- * Success/error information
1892
-
1893
- Raises:
1894
- Exception: If generation fails
1895
- """
1896
- params = GenerateLLMsTextParams(
1897
- maxUrls=max_urls,
1898
- showFullText=show_full_text,
1899
- cache=cache,
1900
- __experimental_stream=experimental_stream
1901
- )
1902
-
1903
- response = self.async_generate_llms_text(
1904
- url,
1905
- max_urls=max_urls,
1906
- show_full_text=show_full_text,
1907
- cache=cache,
1908
- experimental_stream=experimental_stream
1909
- )
1910
-
1911
- if not response.success or not response.id:
1912
- return GenerateLLMsTextStatusResponse(
1913
- success=False,
1914
- error='Failed to start LLMs.txt generation',
1915
- status='failed',
1916
- expiresAt=''
1917
- )
1918
-
1919
- job_id = response.id
1920
- while True:
1921
- status = self.check_generate_llms_text_status(job_id)
1922
-
1923
- if status.status == 'completed':
1924
- return status
1925
- elif status.status == 'failed':
1926
- return status
1927
- elif status.status != 'processing':
1928
- return GenerateLLMsTextStatusResponse(
1929
- success=False,
1930
- error='LLMs.txt generation job terminated unexpectedly',
1931
- status='failed',
1932
- expiresAt=''
1933
- )
1934
-
1935
- time.sleep(2) # Polling interval
1936
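A sketch of the blocking generator above, assuming a placeholder app and URL; on success, data carries the llmstxt (and optionally llmsfulltxt) content described under check_generate_llms_text_status below.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor

result = app.generate_llms_text(
    "https://example.com",  # placeholder URL
    max_urls=5,
    show_full_text=True,
)
if result.success:
    print(result.status)
    print(result.data)  # contains 'llmstxt' and, when requested, 'llmsfulltxt'
else:
    print(result.error)
```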
-
1937
- def async_generate_llms_text(
1938
- self,
1939
- url: str,
1940
- *,
1941
- max_urls: Optional[int] = None,
1942
- show_full_text: Optional[bool] = None,
1943
- cache: Optional[bool] = None,
1944
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
1945
- """
1946
- Initiate an asynchronous LLMs.txt generation operation.
1947
-
1948
- Args:
1949
- url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
1950
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
1951
- show_full_text (Optional[bool]): Include full text in output (default: False)
1952
- cache (Optional[bool]): Whether to use cached content if available (default: True)
1953
- experimental_stream (Optional[bool]): Enable experimental streaming
1954
-
1955
- Returns:
1956
- GenerateLLMsTextResponse: A response containing:
1957
- * success (bool): Whether the generation initiation was successful
1958
- * id (str): The unique identifier for the generation job
1959
- * error (str, optional): Error message if initiation failed
1960
-
1961
- Raises:
1962
- Exception: If the generation job initiation fails.
1963
- """
1964
- params = GenerateLLMsTextParams(
1965
- maxUrls=max_urls,
1966
- showFullText=show_full_text,
1967
- cache=cache,
1968
- __experimental_stream=experimental_stream
1969
- )
1970
-
1971
- headers = self._prepare_headers()
1972
- json_data = {'url': url, **params.dict(exclude_none=True)}
1973
- json_data['origin'] = f"python-sdk@{version}"
1974
-
1975
- try:
1976
- req = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
1977
- response = req.json()
1978
- print("json_data", json_data)
1979
- print("response", response)
1980
- if response.get('success'):
1981
- try:
1982
- return GenerateLLMsTextResponse(**response)
1983
- except:
1984
- raise Exception('Failed to parse Firecrawl response as JSON.')
1985
- else:
1986
- self._handle_error(response, 'start LLMs.txt generation')
1987
- except Exception as e:
1988
- raise ValueError(str(e))
1989
-
1990
- return GenerateLLMsTextResponse(
1991
- success=False,
1992
- error='Internal server error'
1993
- )
1994
-
1995
- def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
1996
- """
1997
- Check the status of a LLMs.txt generation operation.
1998
-
1999
- Args:
2000
- id (str): The unique identifier of the LLMs.txt generation job to check status for.
2001
-
2002
- Returns:
2003
- GenerateLLMsTextStatusResponse: A response containing:
2004
- * success (bool): Whether the generation was successful
2005
- * status (str): Status of generation ("processing", "completed", "failed")
2006
- * data (Dict[str, str], optional): Generated text with fields:
2007
- * llmstxt (str): Generated LLMs.txt content
2008
- * llmsfulltxt (str, optional): Full version if requested
2009
- * error (str, optional): Error message if generation failed
2010
- * expiresAt (str): When the generated data expires
2011
-
2012
- Raises:
2013
- Exception: If the status check fails.
2014
- """
2015
- headers = self._prepare_headers()
2016
- try:
2017
- response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers)
2018
- if response.status_code == 200:
2019
- try:
2020
- json_data = response.json()
2021
- return GenerateLLMsTextStatusResponse(**json_data)
2022
- except Exception as e:
2023
- raise Exception(f'Failed to parse Firecrawl response as GenerateLLMsTextStatusResponse: {str(e)}')
2024
- elif response.status_code == 404:
2025
- raise Exception('LLMs.txt generation job not found')
2026
- else:
2027
- self._handle_error(response, 'check LLMs.txt generation status')
2028
- except Exception as e:
2029
- raise ValueError(str(e))
2030
-
2031
- return GenerateLLMsTextStatusResponse(success=False, error='Internal server error', status='failed', expiresAt='')
2032
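The non-blocking counterpart, as a sketch with the same placeholder app; the job ID returned at start is fed back into the status check.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor

job = app.async_generate_llms_text("https://example.com", max_urls=5)  # placeholder URL
if job.success:
    status = app.check_generate_llms_text_status(job.id)
    print(status.status)  # 'processing', 'completed', or 'failed'
else:
    print(job.error)
```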
-
2033
- def _prepare_headers(
2034
- self,
2035
- idempotency_key: Optional[str] = None) -> Dict[str, str]:
2036
- """
2037
- Prepare the headers for API requests.
2038
-
2039
- Args:
2040
- idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
2041
-
2042
- Returns:
2043
- Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
2044
- """
2045
- if idempotency_key:
2046
- return {
2047
- 'Content-Type': 'application/json',
2048
- 'Authorization': f'Bearer {self.api_key}',
2049
- 'x-idempotency-key': idempotency_key
2050
- }
2051
-
2052
- return {
2053
- 'Content-Type': 'application/json',
2054
- 'Authorization': f'Bearer {self.api_key}',
2055
- }
2056
-
2057
- def _post_request(
2058
- self,
2059
- url: str,
2060
- data: Dict[str, Any],
2061
- headers: Dict[str, str],
2062
- retries: int = 3,
2063
- backoff_factor: float = 0.5) -> requests.Response:
2064
- """
2065
- Make a POST request with retries.
2066
-
2067
- Args:
2068
- url (str): The URL to send the POST request to.
2069
- data (Dict[str, Any]): The JSON data to include in the POST request.
2070
- headers (Dict[str, str]): The headers to include in the POST request.
2071
- retries (int): Number of retries for the request.
2072
- backoff_factor (float): Backoff factor for retries.
2073
-
2074
- Returns:
2075
- requests.Response: The response from the POST request.
2076
-
2077
- Raises:
2078
- requests.RequestException: If the request fails after the specified retries.
2079
- """
2080
- for attempt in range(retries):
2081
- response = requests.post(url, headers=headers, json=data, timeout=((data["timeout"] + 5000) if "timeout" in data else None))
2082
- if response.status_code == 502:
2083
- time.sleep(backoff_factor * (2 ** attempt))
2084
- else:
2085
- return response
2086
- return response
2087
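For reference, the retry helpers here only retry on HTTP 502 and sleep backoff_factor * (2 ** attempt) seconds between attempts; with the defaults that gives the following schedule.

```python
# Delay before each retry with retries=3, backoff_factor=0.5 (502 responses only).
backoff_factor = 0.5
for attempt in range(3):
    print(f"attempt {attempt}: sleep {backoff_factor * (2 ** attempt):.1f}s")
# attempt 0: sleep 0.5s
# attempt 1: sleep 1.0s
# attempt 2: sleep 2.0s
```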
-
2088
- def _get_request(
2089
- self,
2090
- url: str,
2091
- headers: Dict[str, str],
2092
- retries: int = 3,
2093
- backoff_factor: float = 0.5) -> requests.Response:
2094
- """
2095
- Make a GET request with retries.
2096
-
2097
- Args:
2098
- url (str): The URL to send the GET request to.
2099
- headers (Dict[str, str]): The headers to include in the GET request.
2100
- retries (int): Number of retries for the request.
2101
- backoff_factor (float): Backoff factor for retries.
2102
-
2103
- Returns:
2104
- requests.Response: The response from the GET request.
2105
-
2106
- Raises:
2107
- requests.RequestException: If the request fails after the specified retries.
2108
- """
2109
- for attempt in range(retries):
2110
- response = requests.get(url, headers=headers)
2111
- if response.status_code == 502:
2112
- time.sleep(backoff_factor * (2 ** attempt))
2113
- else:
2114
- return response
2115
- return response
2116
-
2117
- def _delete_request(
2118
- self,
2119
- url: str,
2120
- headers: Dict[str, str],
2121
- retries: int = 3,
2122
- backoff_factor: float = 0.5) -> requests.Response:
2123
- """
2124
- Make a DELETE request with retries.
2125
-
2126
- Args:
2127
- url (str): The URL to send the DELETE request to.
2128
- headers (Dict[str, str]): The headers to include in the DELETE request.
2129
- retries (int): Number of retries for the request.
2130
- backoff_factor (float): Backoff factor for retries.
2131
-
2132
- Returns:
2133
- requests.Response: The response from the DELETE request.
2134
-
2135
- Raises:
2136
- requests.RequestException: If the request fails after the specified retries.
2137
- """
2138
- for attempt in range(retries):
2139
- response = requests.delete(url, headers=headers)
2140
- if response.status_code == 502:
2141
- time.sleep(backoff_factor * (2 ** attempt))
2142
- else:
2143
- return response
2144
- return response
2145
-
2146
- def _monitor_job_status(
2147
- self,
2148
- id: str,
2149
- headers: Dict[str, str],
2150
- poll_interval: int) -> CrawlStatusResponse:
2151
- """
2152
- Monitor the status of a crawl job until completion.
2153
-
2154
- Args:
2155
- id (str): The ID of the crawl job.
2156
- headers (Dict[str, str]): The headers to include in the status check requests.
2157
- poll_interval (int): Seconds between status checks.
2158
-
2159
- Returns:
2160
- CrawlStatusResponse: The crawl results if the job is completed successfully.
2161
-
2162
- Raises:
2163
- Exception: If the job fails or an error occurs during status checks.
2164
- """
2165
- while True:
2166
- api_url = f'{self.api_url}/v1/crawl/{id}'
2167
-
2168
- status_response = self._get_request(api_url, headers)
2169
- if status_response.status_code == 200:
2170
- try:
2171
- status_data = status_response.json()
2172
- except:
2173
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
2174
- if status_data['status'] == 'completed':
2175
- if 'data' in status_data:
2176
- data = status_data['data']
2177
- while 'next' in status_data:
2178
- if len(status_data['data']) == 0:
2179
- break
2180
- status_response = self._get_request(status_data['next'], headers)
2181
- try:
2182
- status_data = status_response.json()
2183
- except:
2184
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
2185
- data.extend(status_data.get('data', []))
2186
- status_data['data'] = data
2187
- return CrawlStatusResponse(**status_data)
2188
- else:
2189
- raise Exception('Crawl job completed but no data was returned')
2190
- elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
2191
- poll_interval = max(poll_interval, 2)
2192
- time.sleep(poll_interval) # Wait for the specified interval before checking again
2193
- else:
2194
- raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
2195
- else:
2196
- self._handle_error(status_response, 'check crawl status')
2197
-
2198
- def _handle_error(
2199
- self,
2200
- response: requests.Response,
2201
- action: str) -> None:
2202
- """
2203
- Handle errors from API responses.
2204
-
2205
- Args:
2206
- response (requests.Response): The response object from the API request.
2207
- action (str): Description of the action that was being performed.
2208
-
2209
- Raises:
2210
- Exception: An exception with a message containing the status code and error details from the response.
2211
- """
2212
- try:
2213
- error_message = response.json().get('error', 'No error message provided.')
2214
- error_details = response.json().get('details', 'No additional error details provided.')
2215
- except:
2216
- raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
2217
-
2218
- message = self._get_error_message(response.status_code, action, error_message, error_details)
2219
-
2220
- # Raise an HTTPError with the custom message and attach the response
2221
- raise requests.exceptions.HTTPError(message, response=response)
2222
-
2223
- def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2224
- """
2225
- Generate a standardized error message based on HTTP status code.
2226
-
2227
- Args:
2228
- status_code (int): The HTTP status code from the response
2229
- action (str): Description of the action that was being performed
2230
- error_message (str): The error message from the API response
2231
- error_details (str): Additional error details from the API response
2232
-
2233
- Returns:
2234
- str: A formatted error message
2235
- """
2236
- if status_code == 402:
2237
- return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
2238
- elif status_code == 403:
2239
- message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
2240
- elif status_code == 408:
2241
- return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
2242
- elif status_code == 409:
2243
- return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
2244
- elif status_code == 500:
2245
- return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
2246
- else:
2247
- return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
2248
-
2249
- def deep_research(
2250
- self,
2251
- query: str,
2252
- *,
2253
- max_depth: Optional[int] = None,
2254
- time_limit: Optional[int] = None,
2255
- max_urls: Optional[int] = None,
2256
- analysis_prompt: Optional[str] = None,
2257
- system_prompt: Optional[str] = None,
2258
- __experimental_stream_steps: Optional[bool] = None,
2259
- on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
2260
- on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
2261
- """
2262
- Initiates a deep research operation on a given query and polls until completion.
2263
-
2264
- Args:
2265
- query (str): Research query or topic to investigate
2266
- max_depth (Optional[int]): Maximum depth of research exploration
2267
- time_limit (Optional[int]): Time limit in seconds for research
2268
- max_urls (Optional[int]): Maximum number of URLs to process
2269
- analysis_prompt (Optional[str]): Custom prompt for analysis
2270
- system_prompt (Optional[str]): Custom system prompt
2271
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2272
- on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
2273
- on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
2274
-
2275
- Returns:
2276
- DeepResearchStatusResponse containing:
2277
- * success (bool): Whether research completed successfully
2278
- * status (str): Current state (processing/completed/failed)
2279
- * error (Optional[str]): Error message if failed
2280
- * id (str): Unique identifier for the research job
2281
- * data (Any): Research findings and analysis
2282
- * sources (List[Dict]): List of discovered sources
2283
- * activities (List[Dict]): Research progress log
2284
- * summaries (List[str]): Generated research summaries
2285
-
2286
- Raises:
2287
- Exception: If research fails
2288
- """
2289
- research_params = {}
2290
- if max_depth is not None:
2291
- research_params['maxDepth'] = max_depth
2292
- if time_limit is not None:
2293
- research_params['timeLimit'] = time_limit
2294
- if max_urls is not None:
2295
- research_params['maxUrls'] = max_urls
2296
- if analysis_prompt is not None:
2297
- research_params['analysisPrompt'] = analysis_prompt
2298
- if system_prompt is not None:
2299
- research_params['systemPrompt'] = system_prompt
2300
- if __experimental_stream_steps is not None:
2301
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
2302
- research_params = DeepResearchParams(**research_params)
2303
-
2304
- response = self.async_deep_research(
2305
- query,
2306
- max_depth=max_depth,
2307
- time_limit=time_limit,
2308
- max_urls=max_urls,
2309
- analysis_prompt=analysis_prompt,
2310
- system_prompt=system_prompt
2311
- )
2312
- if not response.get('success') or 'id' not in response:
2313
- return response
2314
-
2315
- job_id = response['id']
2316
- last_activity_count = 0
2317
- last_source_count = 0
2318
-
2319
- while True:
2320
- status = self.check_deep_research_status(job_id)
2321
-
2322
- if on_activity and 'activities' in status:
2323
- new_activities = status['activities'][last_activity_count:]
2324
- for activity in new_activities:
2325
- on_activity(activity)
2326
- last_activity_count = len(status['activities'])
2327
-
2328
- if on_source and 'sources' in status:
2329
- new_sources = status['sources'][last_source_count:]
2330
- for source in new_sources:
2331
- on_source(source)
2332
- last_source_count = len(status['sources'])
2333
-
2334
- if status['status'] == 'completed':
2335
- return status
2336
- elif status['status'] == 'failed':
2337
- raise Exception(f'Deep research failed. Error: {status.get("error")}')
2338
- elif status['status'] != 'processing':
2339
- break
2340
-
2341
- time.sleep(2) # Polling interval
2342
-
2343
- return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
2344
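A sketch of a blocking deep research call with progress callbacks, assuming a placeholder app; the callback payload keys follow the docstring above, and the completed result is the status dictionary returned by check_deep_research_status.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor

def on_activity(activity):
    print(f"[{activity['type']}] {activity['message']}")

result = app.deep_research(
    "How do open-source web crawlers handle JavaScript-heavy sites?",  # placeholder query
    max_depth=3,
    time_limit=120,
    max_urls=10,
    on_activity=on_activity,
)
if result.get("success"):
    print(result["data"])                       # research findings and analysis
    print(len(result.get("sources", [])), "sources")
```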
-
2345
- def async_deep_research(
2346
- self,
2347
- query: str,
2348
- *,
2349
- max_depth: Optional[int] = None,
2350
- time_limit: Optional[int] = None,
2351
- max_urls: Optional[int] = None,
2352
- analysis_prompt: Optional[str] = None,
2353
- system_prompt: Optional[str] = None,
2354
- __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
2355
- """
2356
- Initiates an asynchronous deep research operation.
2357
-
2358
- Args:
2359
- query (str): Research query or topic to investigate
2360
- max_depth (Optional[int]): Maximum depth of research exploration
2361
- time_limit (Optional[int]): Time limit in seconds for research
2362
- max_urls (Optional[int]): Maximum number of URLs to process
2363
- analysis_prompt (Optional[str]): Custom prompt for analysis
2364
- system_prompt (Optional[str]): Custom system prompt
2365
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2366
-
2367
- Returns:
2368
- Dict[str, Any]: A response containing:
2369
- * success (bool): Whether the research initiation was successful
2370
- * id (str): The unique identifier for the research job
2371
- * error (str, optional): Error message if initiation failed
2372
-
2373
- Raises:
2374
- Exception: If the research initiation fails.
2375
- """
2376
- research_params = {}
2377
- if max_depth is not None:
2378
- research_params['maxDepth'] = max_depth
2379
- if time_limit is not None:
2380
- research_params['timeLimit'] = time_limit
2381
- if max_urls is not None:
2382
- research_params['maxUrls'] = max_urls
2383
- if analysis_prompt is not None:
2384
- research_params['analysisPrompt'] = analysis_prompt
2385
- if system_prompt is not None:
2386
- research_params['systemPrompt'] = system_prompt
2387
- if __experimental_stream_steps is not None:
2388
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
2389
- research_params = DeepResearchParams(**research_params)
2390
-
2391
- headers = self._prepare_headers()
2392
-
2393
- json_data = {'query': query, **research_params.dict(exclude_none=True)}
2394
- json_data['origin'] = f"python-sdk@{version}"
2395
-
2396
- # Handle json options schema if present
2397
- if 'jsonOptions' in json_data:
2398
- json_opts = json_data['jsonOptions']
2399
- if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
2400
- json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
2401
-
2402
- try:
2403
- response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
2404
- if response.status_code == 200:
2405
- try:
2406
- return response.json()
2407
- except:
2408
- raise Exception('Failed to parse Firecrawl response as JSON.')
2409
- else:
2410
- self._handle_error(response, 'start deep research')
2411
- except Exception as e:
2412
- raise ValueError(str(e))
2413
-
2414
- return {'success': False, 'error': 'Internal server error'}
2415
-
2416
- def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
2417
- """
2418
- Check the status of a deep research operation.
2419
-
2420
- Args:
2421
- id (str): The ID of the deep research operation.
2422
-
2423
- Returns:
2424
- DeepResearchStatusResponse containing:
2425
-
2426
- Status:
2427
- * success - Whether research completed successfully
2428
- * status - Current state (processing/completed/failed)
2429
- * error - Error message if failed
2430
-
2431
- Results:
2432
- * id - Unique identifier for the research job
2433
- * data - Research findings and analysis
2434
- * sources - List of discovered sources
2435
- * activities - Research progress log
2436
- * summaries - Generated research summaries
2437
-
2438
- Raises:
2439
- Exception: If the status check fails.
2440
- """
2441
- headers = self._prepare_headers()
2442
- try:
2443
- response = self._get_request(f'{self.api_url}/v1/deep-research/{id}', headers)
2444
- if response.status_code == 200:
2445
- try:
2446
- return response.json()
2447
- except:
2448
- raise Exception('Failed to parse Firecrawl response as JSON.')
2449
- elif response.status_code == 404:
2450
- raise Exception('Deep research job not found')
2451
- else:
2452
- self._handle_error(response, 'check deep research status')
2453
- except Exception as e:
2454
- raise ValueError(str(e))
2455
-
2456
- return {'success': False, 'error': 'Internal server error'}
2457
-
2458
- def _validate_kwargs(self, kwargs: Dict[str, Any], method_name: str) -> None:
2459
- """
2460
- Validate additional keyword arguments before they are passed to the API.
2461
- This provides early validation before the Pydantic model validation.
2462
-
2463
- Args:
2464
- kwargs (Dict[str, Any]): Additional keyword arguments to validate
2465
- method_name (str): Name of the method these kwargs are for
2466
-
2467
- Raises:
2468
- ValueError: If kwargs contain invalid or unsupported parameters
2469
- """
2470
- if not kwargs:
2471
- return
2472
-
2473
- # Known parameter mappings for each method
2474
- method_params = {
2475
- "scrape_url": {"formats", "include_tags", "exclude_tags", "only_main_content", "wait_for",
2476
- "timeout", "location", "mobile", "skip_tls_verification", "remove_base64_images",
2477
- "block_ads", "proxy", "extract", "json_options", "actions", "change_tracking_options"},
2478
- "search": {"limit", "tbs", "filter", "lang", "country", "location", "timeout", "scrape_options"},
2479
- "crawl_url": {"include_paths", "exclude_paths", "max_depth", "max_discovery_depth", "limit",
2480
- "allow_backward_links", "allow_external_links", "ignore_sitemap", "scrape_options",
2481
- "webhook", "deduplicate_similar_urls", "ignore_query_parameters", "regex_on_full_url"},
2482
- "map_url": {"search", "ignore_sitemap", "include_subdomains", "sitemap_only", "limit", "timeout"},
2483
- "batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2484
- "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2485
- "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2486
- "actions", "agent", "webhook"},
2487
- "async_batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2488
- "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2489
- "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2490
- "actions", "agent", "webhook"},
2491
- "batch_scrape_urls_and_watch": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2492
- "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2493
- "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2494
- "actions", "agent", "webhook"}
2495
- }
2496
-
2497
- # Get allowed parameters for this method
2498
- allowed_params = method_params.get(method_name, set())
2499
-
2500
- # Check for unknown parameters
2501
- unknown_params = set(kwargs.keys()) - allowed_params
2502
- if unknown_params:
2503
- raise ValueError(f"Unsupported parameter(s) for {method_name}: {', '.join(unknown_params)}. Please refer to the API documentation for the correct parameters.")
2504
-
2505
- # Additional type validation can be added here if needed
2506
- # For now, we rely on Pydantic models for detailed type validation
2507
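The practical effect of this allow-list, sketched with a placeholder app: an unknown keyword argument is rejected with a ValueError before any request is sent. The option name below is deliberately invalid.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor

try:
    app.batch_scrape_urls(["https://example.com"], not_a_real_option=True)  # placeholder URL
except ValueError as exc:
    print(exc)  # Unsupported parameter(s) for batch_scrape_urls: not_a_real_option. ...
```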
-
2508
- def _ensure_schema_dict(self, schema):
2509
- """
2510
- Utility to ensure a schema is a dict, not a Pydantic model class. Recursively checks dicts and lists.
2511
- """
2512
- if schema is None:
2513
- return schema
2514
- if isinstance(schema, type):
2515
- # Pydantic v1/v2 model class
2516
- if hasattr(schema, 'model_json_schema'):
2517
- return schema.model_json_schema()
2518
- elif hasattr(schema, 'schema'):
2519
- return schema.schema()
2520
- if isinstance(schema, dict):
2521
- return {k: self._ensure_schema_dict(v) for k, v in schema.items()}
2522
- if isinstance(schema, (list, tuple)):
2523
- return [self._ensure_schema_dict(v) for v in schema]
2524
- return schema
2525
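Illustration of what this helper normalises: a Pydantic model class passed as schema, extract, or json_options ends up in the request body as a plain JSON-schema dict. The model below is hypothetical; model_json_schema() is the Pydantic v2 path, schema() the v1 fallback.

```python
from pydantic import BaseModel

class Product(BaseModel):  # hypothetical model for illustration
    name: str
    price: float

# Equivalent of what _ensure_schema_dict produces for a Pydantic v2 model class.
print(Product.model_json_schema())
# -> a dict with 'properties', 'required', 'title', and 'type' keys
```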
-
2526
- class CrawlWatcher:
2527
- """
2528
- A class to watch and handle crawl job events via WebSocket connection.
2529
-
2530
- Attributes:
2531
- id (str): The ID of the crawl job to watch
2532
- app (FirecrawlApp): The FirecrawlApp instance
2533
- data (List[Dict[str, Any]]): List of crawled documents/data
2534
- status (str): Current status of the crawl job
2535
- ws_url (str): WebSocket URL for the crawl job
2536
- event_handlers (dict): Dictionary of event type to list of handler functions
2537
- """
2538
- def __init__(self, id: str, app: FirecrawlApp):
2539
- self.id = id
2540
- self.app = app
2541
- self.data: List[Dict[str, Any]] = []
2542
- self.status = "scraping"
2543
- self.ws_url = f"{app.api_url.replace('http', 'ws')}/v1/crawl/{id}"
2544
- self.event_handlers = {
2545
- 'done': [],
2546
- 'error': [],
2547
- 'document': []
2548
- }
2549
-
2550
- async def connect(self) -> None:
2551
- """
2552
- Establishes WebSocket connection and starts listening for messages.
2553
- """
2554
- async with websockets.connect(
2555
- self.ws_url,
2556
- additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
2557
- ) as websocket:
2558
- await self._listen(websocket)
2559
-
2560
- async def _listen(self, websocket) -> None:
2561
- """
2562
- Listens for incoming WebSocket messages and handles them.
2563
-
2564
- Args:
2565
- websocket: The WebSocket connection object
2566
- """
2567
- async for message in websocket:
2568
- msg = json.loads(message)
2569
- await self._handle_message(msg)
2570
-
2571
- def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
2572
- """
2573
- Adds an event handler function for a specific event type.
2574
-
2575
- Args:
2576
- event_type (str): Type of event to listen for ('done', 'error', or 'document')
2577
- handler (Callable): Function to handle the event
2578
- """
2579
- if event_type in self.event_handlers:
2580
- self.event_handlers[event_type].append(handler)
2581
-
2582
- def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
2583
- """
2584
- Dispatches an event to all registered handlers for that event type.
2585
-
2586
- Args:
2587
- event_type (str): Type of event to dispatch
2588
- detail (Dict[str, Any]): Event details/data to pass to handlers
2589
- """
2590
- if event_type in self.event_handlers:
2591
- for handler in self.event_handlers[event_type]:
2592
- handler(detail)
2593
-
2594
- async def _handle_message(self, msg: Dict[str, Any]) -> None:
2595
- """
2596
- Handles incoming WebSocket messages based on their type.
2597
-
2598
- Args:
2599
- msg (Dict[str, Any]): The message to handle
2600
- """
2601
- if msg['type'] == 'done':
2602
- self.status = 'completed'
2603
- self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
2604
- elif msg['type'] == 'error':
2605
- self.status = 'failed'
2606
- self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
2607
- elif msg['type'] == 'catchup':
2608
- self.status = msg['data']['status']
2609
- self.data.extend(msg['data'].get('data', []))
2610
- for doc in self.data:
2611
- self.dispatch_event('document', {'data': doc, 'id': self.id})
2612
- elif msg['type'] == 'document':
2613
- self.data.append(msg['data'])
2614
- self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
2615
-
2616
- class AsyncFirecrawlApp(FirecrawlApp):
2617
- """
2618
- Asynchronous version of FirecrawlApp that implements async methods using aiohttp.
2619
- Provides non-blocking alternatives to all FirecrawlApp operations.
2620
- """
2621
-
2622
- async def _async_request(
2623
- self,
2624
- method: str,
2625
- url: str,
2626
- headers: Dict[str, str],
2627
- data: Optional[Dict[str, Any]] = None,
2628
- retries: int = 3,
2629
- backoff_factor: float = 0.5) -> Dict[str, Any]:
2630
- """
2631
- Generic async request method with exponential backoff retry logic.
2632
-
2633
- Args:
2634
- method (str): The HTTP method to use (e.g., "GET" or "POST").
2635
- url (str): The URL to send the request to.
2636
- headers (Dict[str, str]): Headers to include in the request.
2637
- data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
2638
- retries (int): Maximum number of retry attempts (default: 3).
2639
- backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2640
- Delay will be backoff_factor * (2 ** retry_count).
2641
-
2642
- Returns:
2643
- Dict[str, Any]: The parsed JSON response from the server.
2644
-
2645
- Raises:
2646
- aiohttp.ClientError: If the request fails after all retries.
2647
- Exception: If max retries are exceeded or other errors occur.
2648
- """
2649
- async with aiohttp.ClientSession() as session:
2650
- for attempt in range(retries):
2651
- try:
2652
- async with session.request(
2653
- method=method, url=url, headers=headers, json=data
2654
- ) as response:
2655
- if response.status == 502:
2656
- await asyncio.sleep(backoff_factor * (2 ** attempt))
2657
- continue
2658
- if response.status >= 300:
2659
- await self._handle_error(response, f"make {method} request")
2660
- return await response.json()
2661
- except aiohttp.ClientError as e:
2662
- if attempt == retries - 1:
2663
- raise e
2664
- await asyncio.sleep(backoff_factor * (2 ** attempt))
2665
- raise Exception("Max retries exceeded")
2666
-
2667
- async def _async_post_request(
2668
- self, url: str, data: Dict[str, Any], headers: Dict[str, str],
2669
- retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2670
- """
2671
- Make an async POST request with exponential backoff retry logic.
2672
-
2673
- Args:
2674
- url (str): The URL to send the POST request to.
2675
- data (Dict[str, Any]): The JSON data to include in the request body.
2676
- headers (Dict[str, str]): Headers to include in the request.
2677
- retries (int): Maximum number of retry attempts (default: 3).
2678
- backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2679
- Delay will be backoff_factor * (2 ** retry_count).
2680
-
2681
- Returns:
2682
- Dict[str, Any]: The parsed JSON response from the server.
2683
-
2684
- Raises:
2685
- aiohttp.ClientError: If the request fails after all retries.
2686
- Exception: If max retries are exceeded or other errors occur.
2687
- """
2688
- return await self._async_request("POST", url, headers, data, retries, backoff_factor)
2689
-
2690
- async def _async_get_request(
2691
- self, url: str, headers: Dict[str, str],
2692
- retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2693
- """
2694
- Make an async GET request with exponential backoff retry logic.
2695
-
2696
- Args:
2697
- url (str): The URL to send the GET request to.
2698
- headers (Dict[str, str]): Headers to include in the request.
2699
- retries (int): Maximum number of retry attempts (default: 3).
2700
- backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2701
- Delay will be backoff_factor * (2 ** retry_count).
2702
-
2703
- Returns:
2704
- Dict[str, Any]: The parsed JSON response from the server.
2705
-
2706
- Raises:
2707
- aiohttp.ClientError: If the request fails after all retries.
2708
- Exception: If max retries are exceeded or other errors occur.
2709
- """
2710
- return await self._async_request("GET", url, headers, None, retries, backoff_factor)
2711
-
2712
- async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
2713
- """
2714
- Handle errors from async API responses with detailed error messages.
2715
-
2716
- Args:
2717
- response (aiohttp.ClientResponse): The response object from the failed request
2718
- action (str): Description of the action that was being attempted
2719
-
2720
- Raises:
2721
- aiohttp.ClientError: With a detailed error message based on the response status:
2722
- - 402: Payment Required
2723
- - 408: Request Timeout
2724
- - 409: Conflict
2725
- - 500: Internal Server Error
2726
- - Other: Unexpected error with status code
2727
- """
2728
- try:
2729
- error_data = await response.json()
2730
- error_message = error_data.get('error', 'No error message provided.')
2731
- error_details = error_data.get('details', 'No additional error details provided.')
2732
- except Exception:
2733
- raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
2734
-
2735
- message = await self._get_async_error_message(response.status, action, error_message, error_details)
2736
-
2737
- raise aiohttp.ClientError(message)
2738
-
2739
- async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2740
- """
2741
- Generate a standardized error message based on HTTP status code for async operations.
2742
-
2743
- Args:
2744
- status_code (int): The HTTP status code from the response
2745
- action (str): Description of the action that was being performed
2746
- error_message (str): The error message from the API response
2747
- error_details (str): Additional error details from the API response
2748
-
2749
- Returns:
2750
- str: A formatted error message
2751
- """
2752
- return self._get_error_message(status_code, action, error_message, error_details)
2753
-
2754
- async def crawl_url_and_watch(
2755
- self,
2756
- url: str,
2757
- params: Optional[CrawlParams] = None,
2758
- idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2759
- """
2760
- Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket.
2761
-
2762
- Args:
2763
- url (str): Target URL to start crawling from
2764
- params (Optional[CrawlParams]): See CrawlParams model for configuration:
2765
- URL Discovery:
2766
- * includePaths - Patterns of URLs to include
2767
- * excludePaths - Patterns of URLs to exclude
2768
- * maxDepth - Maximum crawl depth
2769
- * maxDiscoveryDepth - Maximum depth for finding new URLs
2770
- * limit - Maximum pages to crawl
2771
-
2772
- Link Following:
2773
- * allowBackwardLinks - Follow parent directory links
2774
- * allowExternalLinks - Follow external domain links
2775
- * ignoreSitemap - Skip sitemap.xml processing
2776
-
2777
- Advanced:
2778
- * scrapeOptions - Page scraping configuration
2779
- * webhook - Notification webhook settings
2780
- * deduplicateSimilarURLs - Remove similar URLs
2781
- * ignoreQueryParameters - Ignore URL parameters
2782
- * regexOnFullURL - Apply regex to full URLs
2783
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2784
-
2785
- Returns:
2786
- AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket
2787
-
2788
- Raises:
2789
- Exception: If crawl job fails to start
2790
- """
2791
- crawl_response = await self.async_crawl_url(url, params, idempotency_key)
2792
- if crawl_response.get('success') and 'id' in crawl_response:
2793
- return AsyncCrawlWatcher(crawl_response['id'], self)
2794
- else:
2795
- raise Exception("Crawl job failed to start")
2796
-
2797
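A hedged usage sketch for the watcher-based flow above. The client class name (AsyncFirecrawlApp), its api_key constructor argument, and the example URL are assumptions; the AsyncCrawlWatcher API itself is defined elsewhere in this module.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # hypothetical key
    watcher = await app.crawl_url_and_watch("https://example.com")
    # Monitor progress through the returned AsyncCrawlWatcher
    # (its event/connection methods are defined elsewhere in this file).

asyncio.run(main())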
- async def batch_scrape_urls_and_watch(
2798
- self,
2799
- urls: List[str],
2800
- params: Optional[ScrapeParams] = None,
2801
- idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2802
- """
2803
- Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress.
2804
-
2805
- Args:
2806
- urls (List[str]): List of URLs to scrape
2807
- params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
2808
-
2809
- Content Options:
2810
- * formats - Content formats to retrieve
2811
- * includeTags - HTML tags to include
2812
- * excludeTags - HTML tags to exclude
2813
- * onlyMainContent - Extract main content only
2814
-
2815
- Request Options:
2816
- * headers - Custom HTTP headers
2817
- * timeout - Request timeout (ms)
2818
- * mobile - Use mobile user agent
2819
- * proxy - Proxy type
2820
-
2821
- Extraction Options:
2822
- * extract - Content extraction config
2823
- * jsonOptions - JSON extraction config
2824
- * actions - Actions to perform
2825
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2826
-
2827
- Returns:
2828
- AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket
2829
-
2830
- Raises:
2831
- Exception: If batch scrape job fails to start
2832
- """
2833
- batch_response = await self.async_batch_scrape_urls(urls, params, idempotency_key)
2834
- if batch_response.get('success') and 'id' in batch_response:
2835
- return AsyncCrawlWatcher(batch_response['id'], self)
2836
- else:
2837
- raise Exception("Batch scrape job failed to start")
2838
-
2839
- async def scrape_url(
2840
- self,
2841
- url: str,
2842
- *,
2843
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
2844
- include_tags: Optional[List[str]] = None,
2845
- exclude_tags: Optional[List[str]] = None,
2846
- only_main_content: Optional[bool] = None,
2847
- wait_for: Optional[int] = None,
2848
- timeout: Optional[int] = None,
2849
- location: Optional[LocationConfig] = None,
2850
- mobile: Optional[bool] = None,
2851
- skip_tls_verification: Optional[bool] = None,
2852
- remove_base64_images: Optional[bool] = None,
2853
- block_ads: Optional[bool] = None,
2854
- proxy: Optional[Literal["basic", "stealth"]] = None,
2855
- extract: Optional[JsonConfig] = None,
2856
- json_options: Optional[JsonConfig] = None,
2857
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2858
- **kwargs) -> ScrapeResponse[Any]:
2859
- """
2860
- Scrape a single URL asynchronously.
2861
-
2862
- Args:
2863
- url (str): Target URL to scrape
2864
- formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
2865
- include_tags (Optional[List[str]]): HTML tags to include
2866
- exclude_tags (Optional[List[str]]): HTML tags to exclude
2867
- only_main_content (Optional[bool]): Extract main content only
2868
- wait_for (Optional[int]): Wait time in milliseconds before capturing content
2869
- timeout (Optional[int]): Request timeout (ms)
2870
- location (Optional[LocationConfig]): Location configuration
2871
- mobile (Optional[bool]): Use mobile user agent
2872
- skip_tls_verification (Optional[bool]): Skip TLS verification
2873
- remove_base64_images (Optional[bool]): Remove base64 images
2874
- block_ads (Optional[bool]): Block ads
2875
- proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
2876
- extract (Optional[JsonConfig]): Content extraction settings
2877
- json_options (Optional[JsonConfig]): JSON extraction settings
2878
- actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
2879
- **kwargs: Additional parameters to pass to the API
2880
-
2881
- Returns:
2882
- ScrapeResponse with:
2883
- * success - Whether scrape was successful
2884
- * markdown - Markdown content if requested
2885
- * html - HTML content if requested
2886
- * rawHtml - Raw HTML content if requested
2887
- * links - Extracted links if requested
2888
- * screenshot - Screenshot if requested
2889
- * extract - Extracted data if requested
2890
- * json - JSON data if requested
2891
- * error - Error message if scrape failed
2892
-
2893
- Raises:
2894
- Exception: If scraping fails
2895
- """
2896
- # Validate any additional kwargs
2897
- self._validate_kwargs(kwargs, "scrape_url")
2898
-
2899
- headers = self._prepare_headers()
2900
-
2901
- # Build scrape parameters
2902
- scrape_params = {
2903
- 'url': url,
2904
- 'origin': f"python-sdk@{version}"
2905
- }
2906
-
2907
- # Add optional parameters if provided and not None
2908
- if formats:
2909
- scrape_params['formats'] = formats
2910
- if include_tags:
2911
- scrape_params['includeTags'] = include_tags
2912
- if exclude_tags:
2913
- scrape_params['excludeTags'] = exclude_tags
2914
- if only_main_content is not None:
2915
- scrape_params['onlyMainContent'] = only_main_content
2916
- if wait_for:
2917
- scrape_params['waitFor'] = wait_for
2918
- if timeout:
2919
- scrape_params['timeout'] = timeout
2920
- if location:
2921
- scrape_params['location'] = location.dict(exclude_none=True)
2922
- if mobile is not None:
2923
- scrape_params['mobile'] = mobile
2924
- if skip_tls_verification is not None:
2925
- scrape_params['skipTlsVerification'] = skip_tls_verification
2926
- if remove_base64_images is not None:
2927
- scrape_params['removeBase64Images'] = remove_base64_images
2928
- if block_ads is not None:
2929
- scrape_params['blockAds'] = block_ads
2930
- if proxy:
2931
- scrape_params['proxy'] = proxy
2932
- if extract is not None:
2933
- extract = self._ensure_schema_dict(extract)
2934
- if isinstance(extract, dict) and "schema" in extract:
2935
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
2936
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
2937
- if json_options is not None:
2938
- json_options = self._ensure_schema_dict(json_options)
2939
- if isinstance(json_options, dict) and "schema" in json_options:
2940
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
2941
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
2942
- if actions:
2943
- scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
2944
-
2945
- if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
2946
- scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
2947
- if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
2948
- scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
2949
-
2950
- # Make async request
2951
- endpoint = f'/v1/scrape'
2952
- response = await self._async_post_request(
2953
- f'{self.api_url}{endpoint}',
2954
- scrape_params,
2955
- headers
2956
- )
2957
-
2958
- if response.get('success') and 'data' in response:
2959
- return ScrapeResponse(**response['data'])
2960
- elif "error" in response:
2961
- raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
2962
- else:
2963
- # Use the response content directly if possible, otherwise a generic message
2964
- error_content = response.get('error', str(response))
2965
- raise Exception(f'Failed to scrape URL. Error: {error_content}')
2966
-
2967
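A minimal usage sketch for the async scrape_url method above, assuming the client is exported as AsyncFirecrawlApp and takes an api_key (both assumptions); the URL and key are placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # hypothetical key
    doc = await app.scrape_url(
        "https://example.com",
        formats=["markdown", "links"],
        only_main_content=True,
        timeout=30000,  # milliseconds, per the docstring above
    )
    print((doc.markdown or "")[:200])
    print(doc.links)

asyncio.run(main())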
- async def batch_scrape_urls(
2968
- self,
2969
- urls: List[str],
2970
- *,
2971
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
2972
- headers: Optional[Dict[str, str]] = None,
2973
- include_tags: Optional[List[str]] = None,
2974
- exclude_tags: Optional[List[str]] = None,
2975
- only_main_content: Optional[bool] = None,
2976
- wait_for: Optional[int] = None,
2977
- timeout: Optional[int] = None,
2978
- location: Optional[LocationConfig] = None,
2979
- mobile: Optional[bool] = None,
2980
- skip_tls_verification: Optional[bool] = None,
2981
- remove_base64_images: Optional[bool] = None,
2982
- block_ads: Optional[bool] = None,
2983
- proxy: Optional[Literal["basic", "stealth"]] = None,
2984
- extract: Optional[JsonConfig] = None,
2985
- json_options: Optional[JsonConfig] = None,
2986
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2987
- agent: Optional[AgentOptions] = None,
2988
- poll_interval: Optional[int] = 2,
2989
- idempotency_key: Optional[str] = None,
2990
- **kwargs
2991
- ) -> BatchScrapeStatusResponse:
2992
- """
2993
- Asynchronously scrape multiple URLs and monitor until completion.
2994
-
2995
- Args:
2996
- urls (List[str]): URLs to scrape
2997
- formats (Optional[List[Literal]]): Content formats to retrieve
2998
- headers (Optional[Dict[str, str]]): Custom HTTP headers
2999
- include_tags (Optional[List[str]]): HTML tags to include
3000
- exclude_tags (Optional[List[str]]): HTML tags to exclude
3001
- only_main_content (Optional[bool]): Extract main content only
3002
- wait_for (Optional[int]): Wait time in milliseconds
3003
- timeout (Optional[int]): Request timeout in milliseconds
3004
- location (Optional[LocationConfig]): Location configuration
3005
- mobile (Optional[bool]): Use mobile user agent
3006
- skip_tls_verification (Optional[bool]): Skip TLS verification
3007
- remove_base64_images (Optional[bool]): Remove base64 encoded images
3008
- block_ads (Optional[bool]): Block advertisements
3009
- proxy (Optional[Literal]): Proxy type to use
3010
- extract (Optional[JsonConfig]): Content extraction config
3011
- json_options (Optional[JsonConfig]): JSON extraction config
3012
- actions (Optional[List[Union]]): Actions to perform
3013
- agent (Optional[AgentOptions]): Agent configuration
3014
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
3015
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3016
- **kwargs: Additional parameters to pass to the API
3017
-
3018
- Returns:
3019
- BatchScrapeStatusResponse with:
3020
- * Scraping status and progress
3021
- * Scraped content for each URL
3022
- * Success/error information
3023
-
3024
- Raises:
3025
- Exception: If batch scrape fails
3026
- """
3027
- # Validate any additional kwargs
3028
- self._validate_kwargs(kwargs, "batch_scrape_urls")
3029
-
3030
- scrape_params = {}
3031
-
3032
- # Add individual parameters
3033
- if formats is not None:
3034
- scrape_params['formats'] = formats
3035
- if headers is not None:
3036
- scrape_params['headers'] = headers
3037
- if include_tags is not None:
3038
- scrape_params['includeTags'] = include_tags
3039
- if exclude_tags is not None:
3040
- scrape_params['excludeTags'] = exclude_tags
3041
- if only_main_content is not None:
3042
- scrape_params['onlyMainContent'] = only_main_content
3043
- if wait_for is not None:
3044
- scrape_params['waitFor'] = wait_for
3045
- if timeout is not None:
3046
- scrape_params['timeout'] = timeout
3047
- if location is not None:
3048
- scrape_params['location'] = location.dict(exclude_none=True)
3049
- if mobile is not None:
3050
- scrape_params['mobile'] = mobile
3051
- if skip_tls_verification is not None:
3052
- scrape_params['skipTlsVerification'] = skip_tls_verification
3053
- if remove_base64_images is not None:
3054
- scrape_params['removeBase64Images'] = remove_base64_images
3055
- if block_ads is not None:
3056
- scrape_params['blockAds'] = block_ads
3057
- if proxy is not None:
3058
- scrape_params['proxy'] = proxy
3059
- if extract is not None:
3060
- extract = self._ensure_schema_dict(extract)
3061
- if isinstance(extract, dict) and "schema" in extract:
3062
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
3063
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3064
- if json_options is not None:
3065
- json_options = self._ensure_schema_dict(json_options)
3066
- if isinstance(json_options, dict) and "schema" in json_options:
3067
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3068
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3069
- if actions is not None:
3070
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3071
- if agent is not None:
3072
- scrape_params['agent'] = agent.dict(exclude_none=True)
3073
-
3074
- # Add any additional kwargs
3075
- scrape_params.update(kwargs)
3076
-
3077
- # Create final params object
3078
- final_params = ScrapeParams(**scrape_params)
3079
- params_dict = final_params.dict(exclude_none=True)
3080
- params_dict['urls'] = urls
3081
- params_dict['origin'] = f"python-sdk@{version}"
3082
-
3083
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3084
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3085
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3086
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3087
-
3088
- # Make request
3089
- headers = self._prepare_headers(idempotency_key)
3090
- response = await self._async_post_request(
3091
- f'{self.api_url}/v1/batch/scrape',
3092
- params_dict,
3093
- headers
3094
- )
3095
-
3096
- if response.get('success') and 'id' in response:
3097
- job_id = response['id']
3098
- return await self._async_monitor_job_status(job_id, headers, poll_interval)
3099
- else:
3100
- # _handle_error expects an aiohttp.ClientResponse, but here we only have the
3101
- # parsed JSON body, so surface the API error directly.
3102
- error_content = response.get('error', str(response))
3103
- raise Exception(f'Failed to start batch scrape job. Error: {error_content}')
3104
-
3105
-
3106
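A short sketch of the blocking batch flow above, which starts the job and polls until completion. Class name, constructor argument, and URLs are assumptions.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # hypothetical key
    job = await app.batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
        poll_interval=5,  # seconds between status checks
    )
    print(job.status, f"{job.completed}/{job.total} pages scraped")

asyncio.run(main())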
- async def async_batch_scrape_urls(
3107
- self,
3108
- urls: List[str],
3109
- *,
3110
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3111
- headers: Optional[Dict[str, str]] = None,
3112
- include_tags: Optional[List[str]] = None,
3113
- exclude_tags: Optional[List[str]] = None,
3114
- only_main_content: Optional[bool] = None,
3115
- wait_for: Optional[int] = None,
3116
- timeout: Optional[int] = None,
3117
- location: Optional[LocationConfig] = None,
3118
- mobile: Optional[bool] = None,
3119
- skip_tls_verification: Optional[bool] = None,
3120
- remove_base64_images: Optional[bool] = None,
3121
- block_ads: Optional[bool] = None,
3122
- proxy: Optional[Literal["basic", "stealth"]] = None,
3123
- extract: Optional[JsonConfig] = None,
3124
- json_options: Optional[JsonConfig] = None,
3125
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
3126
- agent: Optional[AgentOptions] = None,
3127
- idempotency_key: Optional[str] = None,
3128
- **kwargs
3129
- ) -> BatchScrapeResponse:
3130
- """
3131
- Initiate a batch scrape job asynchronously.
3132
-
3133
- Args:
3134
- urls (List[str]): URLs to scrape
3135
- formats (Optional[List[Literal]]): Content formats to retrieve
3136
- headers (Optional[Dict[str, str]]): Custom HTTP headers
3137
- include_tags (Optional[List[str]]): HTML tags to include
3138
- exclude_tags (Optional[List[str]]): HTML tags to exclude
3139
- only_main_content (Optional[bool]): Extract main content only
3140
- wait_for (Optional[int]): Wait time in milliseconds
3141
- timeout (Optional[int]): Request timeout in milliseconds
3142
- location (Optional[LocationConfig]): Location configuration
3143
- mobile (Optional[bool]): Use mobile user agent
3144
- skip_tls_verification (Optional[bool]): Skip TLS verification
3145
- remove_base64_images (Optional[bool]): Remove base64 encoded images
3146
- block_ads (Optional[bool]): Block advertisements
3147
- proxy (Optional[Literal]): Proxy type to use
3148
- extract (Optional[JsonConfig]): Content extraction config
3149
- json_options (Optional[JsonConfig]): JSON extraction config
3150
- actions (Optional[List[Union]]): Actions to perform
3151
- agent (Optional[AgentOptions]): Agent configuration
3152
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3153
- **kwargs: Additional parameters to pass to the API
3154
-
3155
- Returns:
3156
- BatchScrapeResponse with:
3157
- * success - Whether job started successfully
3158
- * id - Unique identifier for the job
3159
- * url - Status check URL
3160
- * error - Error message if start failed
3161
-
3162
- Raises:
3163
- Exception: If job initiation fails
3164
- """
3165
- # Validate any additional kwargs
3166
- self._validate_kwargs(kwargs, "async_batch_scrape_urls")
3167
-
3168
- scrape_params = {}
3169
-
3170
- # Add individual parameters
3171
- if formats is not None:
3172
- scrape_params['formats'] = formats
3173
- if headers is not None:
3174
- scrape_params['headers'] = headers
3175
- if include_tags is not None:
3176
- scrape_params['includeTags'] = include_tags
3177
- if exclude_tags is not None:
3178
- scrape_params['excludeTags'] = exclude_tags
3179
- if only_main_content is not None:
3180
- scrape_params['onlyMainContent'] = only_main_content
3181
- if wait_for is not None:
3182
- scrape_params['waitFor'] = wait_for
3183
- if timeout is not None:
3184
- scrape_params['timeout'] = timeout
3185
- if location is not None:
3186
- scrape_params['location'] = location.dict(exclude_none=True)
3187
- if mobile is not None:
3188
- scrape_params['mobile'] = mobile
3189
- if skip_tls_verification is not None:
3190
- scrape_params['skipTlsVerification'] = skip_tls_verification
3191
- if remove_base64_images is not None:
3192
- scrape_params['removeBase64Images'] = remove_base64_images
3193
- if block_ads is not None:
3194
- scrape_params['blockAds'] = block_ads
3195
- if proxy is not None:
3196
- scrape_params['proxy'] = proxy
3197
- if extract is not None:
3198
- extract = self._ensure_schema_dict(extract)
3199
- if isinstance(extract, dict) and "schema" in extract:
3200
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
3201
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3202
- if json_options is not None:
3203
- json_options = self._ensure_schema_dict(json_options)
3204
- if isinstance(json_options, dict) and "schema" in json_options:
3205
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3206
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3207
- if actions is not None:
3208
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3209
- if agent is not None:
3210
- scrape_params['agent'] = agent.dict(exclude_none=True)
3211
-
3212
- # Add any additional kwargs
3213
- scrape_params.update(kwargs)
3214
-
3215
- # Create final params object
3216
- final_params = ScrapeParams(**scrape_params)
3217
- params_dict = final_params.dict(exclude_none=True)
3218
- params_dict['urls'] = urls
3219
- params_dict['origin'] = f"python-sdk@{version}"
3220
-
3221
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3222
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3223
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3224
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3225
-
3226
- # Make request
3227
- headers = self._prepare_headers(idempotency_key)
3228
- response = await self._async_post_request(
3229
- f'{self.api_url}/v1/batch/scrape',
3230
- params_dict,
3231
- headers
3232
- )
3233
-
3234
- if response.get('success'):
3235
- try:
3236
- return BatchScrapeResponse(**response)  # response is already the parsed JSON dict
3237
- except Exception:
3238
- raise Exception('Failed to parse Firecrawl response into BatchScrapeResponse.')
3239
- else:
3240
- raise Exception(f'Failed to start batch scrape job. Error: {response.get("error", response)}')
3241
-
3242
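A sketch of the fire-and-poll pattern using the non-blocking starter above together with check_batch_scrape_status (defined later in this file). Class name, key, and URL are assumptions.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # hypothetical key
    started = await app.async_batch_scrape_urls(
        ["https://example.com"], formats=["markdown"]
    )
    status = await app.check_batch_scrape_status(started.id)
    print(status.status)

asyncio.run(main())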
- async def crawl_url(
3243
- self,
3244
- url: str,
3245
- *,
3246
- include_paths: Optional[List[str]] = None,
3247
- exclude_paths: Optional[List[str]] = None,
3248
- max_depth: Optional[int] = None,
3249
- max_discovery_depth: Optional[int] = None,
3250
- limit: Optional[int] = None,
3251
- allow_backward_links: Optional[bool] = None,
3252
- allow_external_links: Optional[bool] = None,
3253
- ignore_sitemap: Optional[bool] = None,
3254
- scrape_options: Optional[ScrapeOptions] = None,
3255
- webhook: Optional[Union[str, WebhookConfig]] = None,
3256
- deduplicate_similar_urls: Optional[bool] = None,
3257
- ignore_query_parameters: Optional[bool] = None,
3258
- regex_on_full_url: Optional[bool] = None,
3259
- delay: Optional[int] = None,
3260
- poll_interval: Optional[int] = 2,
3261
- idempotency_key: Optional[str] = None,
3262
- **kwargs
3263
- ) -> CrawlStatusResponse:
3264
- """
3265
- Crawl a website starting from a URL.
3266
-
3267
- Args:
3268
- url (str): Target URL to start crawling from
3269
- include_paths (Optional[List[str]]): Patterns of URLs to include
3270
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3271
- max_depth (Optional[int]): Maximum crawl depth
3272
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3273
- limit (Optional[int]): Maximum pages to crawl
3274
- allow_backward_links (Optional[bool]): Follow parent directory links
3275
- allow_external_links (Optional[bool]): Follow external domain links
3276
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3277
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3278
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3279
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3280
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
3281
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
3282
- delay (Optional[int]): Delay in seconds between scrapes
3283
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
3284
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3285
- **kwargs: Additional parameters to pass to the API
3286
-
3287
- Returns:
3288
- CrawlStatusResponse with:
3289
- * Crawling status and progress
3290
- * Crawled page contents
3291
- * Success/error information
3292
-
3293
- Raises:
3294
- Exception: If crawl fails
3295
- """
3296
- # Validate any additional kwargs
3297
- self._validate_kwargs(kwargs, "crawl_url")
3298
-
3299
- crawl_params = {}
3300
-
3301
- # Add individual parameters
3302
- if include_paths is not None:
3303
- crawl_params['includePaths'] = include_paths
3304
- if exclude_paths is not None:
3305
- crawl_params['excludePaths'] = exclude_paths
3306
- if max_depth is not None:
3307
- crawl_params['maxDepth'] = max_depth
3308
- if max_discovery_depth is not None:
3309
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3310
- if limit is not None:
3311
- crawl_params['limit'] = limit
3312
- if allow_backward_links is not None:
3313
- crawl_params['allowBackwardLinks'] = allow_backward_links
3314
- if allow_external_links is not None:
3315
- crawl_params['allowExternalLinks'] = allow_external_links
3316
- if ignore_sitemap is not None:
3317
- crawl_params['ignoreSitemap'] = ignore_sitemap
3318
- if scrape_options is not None:
3319
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3320
- if webhook is not None:
3321
- crawl_params['webhook'] = webhook
3322
- if deduplicate_similar_urls is not None:
3323
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3324
- if ignore_query_parameters is not None:
3325
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3326
- if regex_on_full_url is not None:
3327
- crawl_params['regexOnFullURL'] = regex_on_full_url
3328
- if delay is not None:
3329
- crawl_params['delay'] = delay
3330
-
3331
- # Add any additional kwargs
3332
- crawl_params.update(kwargs)
3333
-
3334
- # Create final params object
3335
- final_params = CrawlParams(**crawl_params)
3336
- params_dict = final_params.dict(exclude_none=True)
3337
- params_dict['url'] = url
3338
- params_dict['origin'] = f"python-sdk@{version}"
3339
- # Make request
3340
- headers = self._prepare_headers(idempotency_key)
3341
- response = await self._async_post_request(
3342
- f'{self.api_url}/v1/crawl', params_dict, headers)
3343
-
3344
- if response.get('success') and 'id' in response:
3345
- job_id = response['id']
3346
- return await self._async_monitor_job_status(job_id, headers, poll_interval)
3347
- else:
3348
- # _handle_error expects an aiohttp.ClientResponse, but here we only have the
3349
- # parsed JSON body, so surface the API error directly.
3350
- error_content = response.get('error', str(response))
3351
- raise Exception(f'Failed to start crawl job. Error: {error_content}')
3352
-
3353
-
3354
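A minimal sketch of the blocking crawl above, which waits until the crawl completes. The client class, key, URL, and path pattern are placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # hypothetical key
    result = await app.crawl_url(
        "https://example.com",
        limit=25,
        exclude_paths=["/blog/.*"],
        poll_interval=5,
    )
    print(result.status, f"{result.completed}/{result.total} pages")

asyncio.run(main())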
- async def async_crawl_url(
3355
- self,
3356
- url: str,
3357
- *,
3358
- include_paths: Optional[List[str]] = None,
3359
- exclude_paths: Optional[List[str]] = None,
3360
- max_depth: Optional[int] = None,
3361
- max_discovery_depth: Optional[int] = None,
3362
- limit: Optional[int] = None,
3363
- allow_backward_links: Optional[bool] = None,
3364
- allow_external_links: Optional[bool] = None,
3365
- ignore_sitemap: Optional[bool] = None,
3366
- scrape_options: Optional[ScrapeOptions] = None,
3367
- webhook: Optional[Union[str, WebhookConfig]] = None,
3368
- deduplicate_similar_urls: Optional[bool] = None,
3369
- ignore_query_parameters: Optional[bool] = None,
3370
- regex_on_full_url: Optional[bool] = None,
3371
- delay: Optional[int] = None,
3372
- poll_interval: Optional[int] = 2,
3373
- idempotency_key: Optional[str] = None,
3374
- **kwargs
3375
- ) -> CrawlResponse:
3376
- """
3377
- Start an asynchronous crawl job.
3378
-
3379
- Args:
3380
- url (str): Target URL to start crawling from
3381
- include_paths (Optional[List[str]]): Patterns of URLs to include
3382
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3383
- max_depth (Optional[int]): Maximum crawl depth
3384
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3385
- limit (Optional[int]): Maximum pages to crawl
3386
- allow_backward_links (Optional[bool]): Follow parent directory links
3387
- allow_external_links (Optional[bool]): Follow external domain links
3388
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3389
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3390
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3391
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3392
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
3393
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
3394
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3395
- **kwargs: Additional parameters to pass to the API
3396
-
3397
- Returns:
3398
- CrawlResponse with:
3399
- * success - Whether crawl started successfully
3400
- * id - Unique identifier for the crawl job
3401
- * url - Status check URL for the crawl
3402
- * error - Error message if start failed
3403
-
3404
- Raises:
3405
- Exception: If crawl initiation fails
3406
- """
3407
- crawl_params = {}
3408
-
3409
- # Add individual parameters
3410
- if include_paths is not None:
3411
- crawl_params['includePaths'] = include_paths
3412
- if exclude_paths is not None:
3413
- crawl_params['excludePaths'] = exclude_paths
3414
- if max_depth is not None:
3415
- crawl_params['maxDepth'] = max_depth
3416
- if max_discovery_depth is not None:
3417
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3418
- if limit is not None:
3419
- crawl_params['limit'] = limit
3420
- if allow_backward_links is not None:
3421
- crawl_params['allowBackwardLinks'] = allow_backward_links
3422
- if allow_external_links is not None:
3423
- crawl_params['allowExternalLinks'] = allow_external_links
3424
- if ignore_sitemap is not None:
3425
- crawl_params['ignoreSitemap'] = ignore_sitemap
3426
- if scrape_options is not None:
3427
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3428
- if webhook is not None:
3429
- crawl_params['webhook'] = webhook
3430
- if deduplicate_similar_urls is not None:
3431
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3432
- if ignore_query_parameters is not None:
3433
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3434
- if regex_on_full_url is not None:
3435
- crawl_params['regexOnFullURL'] = regex_on_full_url
3436
- if delay is not None:
3437
- crawl_params['delay'] = delay
3438
-
3439
- # Add any additional kwargs
3440
- crawl_params.update(kwargs)
3441
-
3442
- # Create final params object
3443
- final_params = CrawlParams(**crawl_params)
3444
- params_dict = final_params.dict(exclude_none=True)
3445
- params_dict['url'] = url
3446
- params_dict['origin'] = f"python-sdk@{version}"
3447
-
3448
- # Make request
3449
- headers = self._prepare_headers(idempotency_key)
3450
- response = await self._async_post_request(
3451
- f'{self.api_url}/v1/crawl',
3452
- params_dict,
3453
- headers
3454
- )
3455
-
3456
- if response.get('success'):
3457
- try:
3458
- return CrawlResponse(**response)
3459
- except Exception:
3460
- raise Exception('Failed to parse Firecrawl response into CrawlResponse.')
3461
- else:
3462
- raise Exception(f'Failed to start crawl job. Error: {response.get("error", response)}')
3463
-
3464
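A sketch combining the non-blocking crawl starter with the status check and cancel helpers defined below. Class name, key, and URL are assumptions.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # hypothetical key
    started = await app.async_crawl_url("https://example.com", limit=10)
    status = await app.check_crawl_status(started.id)
    if status.status != "completed":
        await app.cancel_crawl(started.id)  # returns a plain dict with a success flag

asyncio.run(main())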
- async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
3465
- """
3466
- Check the status and results of an asynchronous crawl job.
3467
-
3468
- Args:
3469
- id (str): Unique identifier for the crawl job
3470
-
3471
- Returns:
3472
- CrawlStatusResponse containing:
3473
- Status Information:
3474
- * status - Current state (scraping/completed/failed/cancelled)
3475
- * completed - Number of pages crawled
3476
- * total - Total pages to crawl
3477
- * creditsUsed - API credits consumed
3478
- * expiresAt - Data expiration timestamp
3479
-
3480
- Results:
3481
- * data - List of crawled documents
3482
- * next - URL for next page of results (if paginated)
3483
- * success - Whether status check succeeded
3484
- * error - Error message if failed
3485
-
3486
- Raises:
3487
- Exception: If status check fails
3488
- """
3489
- headers = self._prepare_headers()
3490
- endpoint = f'/v1/crawl/{id}'
3491
-
3492
- status_data = await self._async_get_request(
3493
- f'{self.api_url}{endpoint}',
3494
- headers
3495
- )
3496
-
3497
- if status_data.get('status') == 'completed':
3498
- if 'data' in status_data:
3499
- data = status_data['data']
3500
- while 'next' in status_data:
3501
- if len(status_data['data']) == 0:
3502
- break
3503
- next_url = status_data.get('next')
3504
- if not next_url:
3505
- logger.warning("Expected 'next' URL is missing.")
3506
- break
3507
- next_data = await self._async_get_request(next_url, headers)
3508
- data.extend(next_data.get('data', []))
3509
- status_data = next_data
3510
- status_data['data'] = data
3511
- # Create CrawlStatusResponse object from status data
3512
- response = CrawlStatusResponse(
3513
- status=status_data.get('status'),
3514
- total=status_data.get('total'),
3515
- completed=status_data.get('completed'),
3516
- creditsUsed=status_data.get('creditsUsed'),
3517
- expiresAt=status_data.get('expiresAt'),
3518
- data=status_data.get('data'),
3519
- success=False if 'error' in status_data else True
3520
- )
3521
-
3522
- if 'error' in status_data:
3523
- response.error = status_data.get('error')
3524
-
3525
- if 'next' in status_data:
3526
- response.next = status_data.get('next')
3527
-
3528
- return response
3529
-
3530
- async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
3531
- """
3532
- Monitor the status of an asynchronous job until completion.
3533
-
3534
- Args:
3535
- id (str): The ID of the job to monitor
3536
- headers (Dict[str, str]): Headers to include in status check requests
3537
- poll_interval (int): Seconds between status checks (default: 2)
3538
-
3539
- Returns:
3540
- CrawlStatusResponse: The job results if completed successfully
3541
-
3542
- Raises:
3543
- Exception: If the job fails or an error occurs during status checks
3544
- """
3545
- while True:
3546
- status_data = await self._async_get_request(
3547
- f'{self.api_url}/v1/crawl/{id}',
3548
- headers
3549
- )
3550
-
3551
- if status_data.get('status') == 'completed':
3552
- if 'data' in status_data:
3553
- data = status_data['data']
3554
- while 'next' in status_data:
3555
- if len(status_data['data']) == 0:
3556
- break
3557
- next_url = status_data.get('next')
3558
- if not next_url:
3559
- logger.warning("Expected 'next' URL is missing.")
3560
- break
3561
- next_data = await self._async_get_request(next_url, headers)
3562
- data.extend(next_data.get('data', []))
3563
- status_data = next_data
3564
- status_data['data'] = data
3565
- return CrawlStatusResponse(**status_data)
3566
- else:
3567
- raise Exception('Job completed but no data was returned')
3568
- elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
3569
- await asyncio.sleep(max(poll_interval, 2))
3570
- else:
3571
- raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
3572
-
3573
- async def map_url(
3574
- self,
3575
- url: str,
3576
- *,
3577
- search: Optional[str] = None,
3578
- ignore_sitemap: Optional[bool] = None,
3579
- include_subdomains: Optional[bool] = None,
3580
- sitemap_only: Optional[bool] = None,
3581
- limit: Optional[int] = None,
3582
- timeout: Optional[int] = None,
3583
- params: Optional[MapParams] = None) -> MapResponse:
3584
- """
3585
- Asynchronously map and discover links from a URL.
3586
-
3587
- Args:
3588
- url (str): Target URL to map
3589
- params (Optional[MapParams]): See MapParams model:
3590
- Discovery Options:
3591
- * search - Filter pattern for URLs
3592
- * ignoreSitemap - Skip sitemap.xml
3593
- * includeSubdomains - Include subdomain links
3594
- * sitemapOnly - Only use sitemap.xml
3595
-
3596
- Limits:
3597
- * limit - Max URLs to return
3598
- * timeout - Request timeout (ms)
3599
-
3600
- Returns:
3601
- MapResponse with:
3602
- * Discovered URLs
3603
- * Success/error status
3604
-
3605
- Raises:
3606
- Exception: If mapping fails
3607
- """
3608
- map_params = {}
3609
- if params:
3610
- map_params.update(params.dict(exclude_none=True))
3611
-
3612
- # Add individual parameters
3613
- if search is not None:
3614
- map_params['search'] = search
3615
- if ignore_sitemap is not None:
3616
- map_params['ignoreSitemap'] = ignore_sitemap
3617
- if include_subdomains is not None:
3618
- map_params['includeSubdomains'] = include_subdomains
3619
- if sitemap_only is not None:
3620
- map_params['sitemapOnly'] = sitemap_only
3621
- if limit is not None:
3622
- map_params['limit'] = limit
3623
- if timeout is not None:
3624
- map_params['timeout'] = timeout
3625
-
3626
- # Create final params object
3627
- final_params = MapParams(**map_params)
3628
- params_dict = final_params.dict(exclude_none=True)
3629
- params_dict['url'] = url
3630
- params_dict['origin'] = f"python-sdk@{version}"
3631
-
3632
- # Make request
3633
- endpoint = f'/v1/map'
3634
- response = await self._async_post_request(
3635
- f'{self.api_url}{endpoint}',
3636
- params_dict,
3637
- headers={"Authorization": f"Bearer {self.api_key}"}
3638
- )
3639
-
3640
- if response.get('success') and 'links' in response:
3641
- return MapResponse(**response)
3642
- elif 'error' in response:
3643
- raise Exception(f'Failed to map URL. Error: {response["error"]}')
3644
- else:
3645
- raise Exception(f'Failed to map URL. Error: {response}')
3646
-
3647
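A short usage sketch for map_url above; the client class, key, and URL are placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # hypothetical key
    result = await app.map_url("https://example.com", search="docs", limit=100)
    print((result.links or [])[:10])

asyncio.run(main())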
- async def extract(
3648
- self,
3649
- urls: Optional[List[str]] = None,
3650
- *,
3651
- prompt: Optional[str] = None,
3652
- schema: Optional[Any] = None,
3653
- system_prompt: Optional[str] = None,
3654
- allow_external_links: Optional[bool] = False,
3655
- enable_web_search: Optional[bool] = False,
3656
- show_sources: Optional[bool] = False,
3657
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3658
-
3659
- """
3660
- Asynchronously extract structured information from URLs.
3661
-
3662
- Args:
3663
- urls (Optional[List[str]]): URLs to extract from
3664
- prompt (Optional[str]): Custom extraction prompt
3665
- schema (Optional[Any]): JSON schema/Pydantic model
3666
- system_prompt (Optional[str]): System context
3667
- allow_external_links (Optional[bool]): Follow external links
3668
- enable_web_search (Optional[bool]): Enable web search
3669
- show_sources (Optional[bool]): Include source URLs
3670
- agent (Optional[Dict[str, Any]]): Agent configuration
3671
-
3672
- Returns:
3673
- ExtractResponse with:
3674
- * Structured data matching schema
3675
- * Source information if requested
3676
- * Success/error status
3677
-
3678
- Raises:
3679
- ValueError: If prompt/schema missing or extraction fails
3680
- """
3681
- headers = self._prepare_headers()
3682
-
3683
- if not prompt and not schema:
3684
- raise ValueError("Either prompt or schema is required")
3685
-
3686
- if not urls and not prompt:
3687
- raise ValueError("Either urls or prompt is required")
3688
-
3689
- if schema:
3690
- schema = self._ensure_schema_dict(schema)
3691
-
3692
- request_data = {
3693
- 'urls': urls or [],
3694
- 'allowExternalLinks': allow_external_links,
3695
- 'enableWebSearch': enable_web_search,
3696
- 'showSources': show_sources,
3697
- 'schema': schema,
3698
- 'origin': f'python-sdk@{version}'
3699
- }
3700
-
3701
- # Only add prompt and systemPrompt if they exist
3702
- if prompt:
3703
- request_data['prompt'] = prompt
3704
- if system_prompt:
3705
- request_data['systemPrompt'] = system_prompt
3706
-
3707
- if agent:
3708
- request_data['agent'] = agent
3709
-
3710
- response = await self._async_post_request(
3711
- f'{self.api_url}/v1/extract',
3712
- request_data,
3713
- headers
3714
- )
3715
-
3716
- if response.get('success'):
3717
- job_id = response.get('id')
3718
- if not job_id:
3719
- raise Exception('Job ID not returned from extract request.')
3720
-
3721
- while True:
3722
- status_data = await self._async_get_request(
3723
- f'{self.api_url}/v1/extract/{job_id}',
3724
- headers
3725
- )
3726
-
3727
- if status_data['status'] == 'completed':
3728
- return ExtractResponse(**status_data)
3729
- elif status_data['status'] in ['failed', 'cancelled']:
3730
- raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
3731
-
3732
- await asyncio.sleep(2)
3733
- else:
3734
- raise Exception(f'Failed to extract. Error: {response.get("error")}')
3735
-
3736
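The docstring above notes that schema may be a Pydantic model; a hedged sketch of that path follows. The client class, key, URL, and ArticleInfo model are illustrative assumptions.

import asyncio
from pydantic import BaseModel
from firecrawl import AsyncFirecrawlApp  # assumed export name

class ArticleInfo(BaseModel):
    title: str
    author: str

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # hypothetical key
    result = await app.extract(
        ["https://example.com/post"],  # hypothetical URL
        prompt="Extract the article title and author.",
        schema=ArticleInfo,  # converted to a JSON schema by _ensure_schema_dict
    )
    print(result.data)

asyncio.run(main())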
- async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
3737
- """
3738
- Check the status of an asynchronous batch scrape job.
3739
-
3740
- Args:
3741
- id (str): The ID of the batch scrape job
3742
-
3743
- Returns:
3744
- BatchScrapeStatusResponse containing:
3745
- Status Information:
3746
- * status - Current state (scraping/completed/failed/cancelled)
3747
- * completed - Number of URLs scraped
3748
- * total - Total URLs to scrape
3749
- * creditsUsed - API credits consumed
3750
- * expiresAt - Data expiration timestamp
3751
-
3752
- Results:
3753
- * data - List of scraped documents
3754
- * next - URL for next page of results (if paginated)
3755
- * success - Whether status check succeeded
3756
- * error - Error message if failed
3757
-
3758
- Raises:
3759
- Exception: If status check fails
3760
- """
3761
- headers = self._prepare_headers()
3762
- endpoint = f'/v1/batch/scrape/{id}'
3763
-
3764
- status_data = await self._async_get_request(
3765
- f'{self.api_url}{endpoint}',
3766
- headers
3767
- )
3768
-
3769
- if status_data['status'] == 'completed':
3770
- if 'data' in status_data:
3771
- data = status_data['data']
3772
- while 'next' in status_data:
3773
- if len(status_data['data']) == 0:
3774
- break
3775
- next_url = status_data.get('next')
3776
- if not next_url:
3777
- logger.warning("Expected 'next' URL is missing.")
3778
- break
3779
- next_data = await self._async_get_request(next_url, headers)
3780
- data.extend(next_data.get('data', []))
3781
- status_data = next_data
3782
- status_data['data'] = data
3783
-
3784
- response = BatchScrapeStatusResponse(
3785
- status=status_data.get('status'),
3786
- total=status_data.get('total'),
3787
- completed=status_data.get('completed'),
3788
- creditsUsed=status_data.get('creditsUsed'),
3789
- expiresAt=status_data.get('expiresAt'),
3790
- data=status_data.get('data')
3791
- )
3792
-
3793
- if 'error' in status_data:
3794
- response.error = status_data.get('error')
3795
- 
3796
- if 'next' in status_data:
3797
- response.next = status_data.get('next')
3798
- 
3799
- # Mirror check_crawl_status: set fields on the Pydantic model and return it,
3800
- # rather than subscript-assigning and unpacking it into a plain dict.
3801
- response.success = False if 'error' in status_data else True
3802
- return response
3803
-
3804
- async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
3805
- """
3806
- Get information about errors from an asynchronous batch scrape job.
3807
-
3808
- Args:
3809
- id (str): The ID of the batch scrape job
3810
-
3811
- Returns:
3812
- CrawlErrorsResponse containing:
3813
- errors (List[Dict[str, str]]): List of errors with fields:
3814
- * id (str): Error ID
3815
- * timestamp (str): When the error occurred
3816
- * url (str): URL that caused the error
3817
- * error (str): Error message
3818
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3819
-
3820
- Raises:
3821
- Exception: If error check fails
3822
- """
3823
- headers = self._prepare_headers()
3824
- return await self._async_get_request(
3825
- f'{self.api_url}/v1/batch/scrape/{id}/errors',
3826
- headers
3827
- )
3828
-
3829
- async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
3830
- """
3831
- Get information about errors from an asynchronous crawl job.
3832
-
3833
- Args:
3834
- id (str): The ID of the crawl job
3835
-
3836
- Returns:
3837
- CrawlErrorsResponse containing:
3838
- * errors (List[Dict[str, str]]): List of errors with fields:
3839
- - id (str): Error ID
3840
- - timestamp (str): When the error occurred
3841
- - url (str): URL that caused the error
3842
- - error (str): Error message
3843
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3844
-
3845
- Raises:
3846
- Exception: If error check fails
3847
- """
3848
- headers = self._prepare_headers()
3849
- return await self._async_get_request(
3850
- f'{self.api_url}/v1/crawl/{id}/errors',
3851
- headers
3852
- )
3853
-
3854
- async def cancel_crawl(self, id: str) -> Dict[str, Any]:
3855
- """
3856
- Cancel an asynchronous crawl job.
3857
-
3858
- Args:
3859
- id (str): The ID of the crawl job to cancel
3860
-
3861
- Returns:
3862
- Dict[str, Any] containing:
3863
- * success (bool): Whether cancellation was successful
3864
- * error (str, optional): Error message if cancellation failed
3865
-
3866
- Raises:
3867
- Exception: If cancellation fails
3868
- """
3869
- headers = self._prepare_headers()
3870
- async with aiohttp.ClientSession() as session:
3871
- async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
3872
- return await response.json()
3873
-
3874
- async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
3875
- """
3876
- Check the status of an asynchronous extraction job.
3877
-
3878
- Args:
3879
- job_id (str): The ID of the extraction job
3880
-
3881
- Returns:
3882
- ExtractResponse[Any] with:
3883
- * success (bool): Whether request succeeded
3884
- * data (Optional[Any]): Extracted data matching schema
3885
- * error (Optional[str]): Error message if any
3886
- * warning (Optional[str]): Warning message if any
3887
- * sources (Optional[List[str]]): Source URLs if requested
3888
-
3889
- Raises:
3890
- ValueError: If status check fails
3891
- """
3892
- headers = self._prepare_headers()
3893
- try:
3894
- return await self._async_get_request(
3895
- f'{self.api_url}/v1/extract/{job_id}',
3896
- headers
3897
- )
3898
- except Exception as e:
3899
- raise ValueError(str(e))
3900
-
3901
- async def async_extract(
3902
- self,
3903
- urls: Optional[List[str]] = None,
3904
- *,
3905
- prompt: Optional[str] = None,
3906
- schema: Optional[Any] = None,
3907
- system_prompt: Optional[str] = None,
3908
- allow_external_links: Optional[bool] = False,
3909
- enable_web_search: Optional[bool] = False,
3910
- show_sources: Optional[bool] = False,
3911
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3912
- """
3913
- Initiate an asynchronous extraction job without waiting for completion.
3914
-
3915
- Args:
3916
- urls (Optional[List[str]]): URLs to extract from
3917
- prompt (Optional[str]): Custom extraction prompt
3918
- schema (Optional[Any]): JSON schema/Pydantic model
3919
- system_prompt (Optional[str]): System context
3920
- allow_external_links (Optional[bool]): Follow external links
3921
- enable_web_search (Optional[bool]): Enable web search
3922
- show_sources (Optional[bool]): Include source URLs
3923
- agent (Optional[Dict[str, Any]]): Agent configuration
3924
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3925
-
3926
- Returns:
3927
- ExtractResponse[Any] with:
3928
- * success (bool): Whether request succeeded
3929
- * data (Optional[Any]): Extracted data matching schema
3930
- * error (Optional[str]): Error message if any
3931
-
3932
- Raises:
3933
- ValueError: If job initiation fails
3934
- """
3935
- headers = self._prepare_headers()
3936
-
3937
- if not prompt and not schema:
3938
- raise ValueError("Either prompt or schema is required")
3939
-
3940
- if not urls and not prompt:
3941
- raise ValueError("Either urls or prompt is required")
3942
-
3943
- if schema:
3944
- schema = self._ensure_schema_dict(schema)
3945
-
3946
- request_data = {  # plain dict, as in extract(); ExtractResponse is a response model, not a request payload
3947
- 'urls': urls or [],
3948
- 'allowExternalLinks': allow_external_links,
3949
- 'enableWebSearch': enable_web_search,
3950
- 'showSources': show_sources,
3951
- 'schema': schema,
3952
- 'origin': f'python-sdk@{version}'
3953
- }
3954
-
3955
- if prompt:
3956
- request_data['prompt'] = prompt
3957
- if system_prompt:
3958
- request_data['systemPrompt'] = system_prompt
3959
- if agent:
3960
- request_data['agent'] = agent
3961
-
3962
- try:
3963
- return await self._async_post_request(
3964
- f'{self.api_url}/v1/extract',
3965
- request_data,
3966
- headers
3967
- )
3968
- except Exception as e:
3969
- raise ValueError(str(e))
3970
-
3971
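A sketch of the non-blocking extraction flow: start a job with async_extract and check it with get_extract_status above. The client class, key, and URL are assumptions, and the id field is expected in the raw start response when the job begins successfully.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # hypothetical key
    started = await app.async_extract(
        ["https://example.com"],
        prompt="Summarize the page in one sentence.",
    )
    # The raw start response is expected to carry the job id when successful.
    status = await app.get_extract_status(started["id"])
    print(status)

asyncio.run(main())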
- async def generate_llms_text(
3972
- self,
3973
- url: str,
3974
- *,
3975
- max_urls: Optional[int] = None,
3976
- show_full_text: Optional[bool] = None,
3977
- cache: Optional[bool] = None,
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
3978
- """
3979
- Generate LLMs.txt for a given URL and monitor until completion.
3980
-
3981
- Args:
3982
- url (str): Target URL to generate LLMs.txt from
3983
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
3984
- show_full_text (Optional[bool]): Include full text in output (default: False)
- cache (Optional[bool]): Whether to use cached content if available (default: True)
3985
- experimental_stream (Optional[bool]): Enable experimental streaming
3986
-
3987
- Returns:
3988
- GenerateLLMsTextStatusResponse containing:
3989
- * success (bool): Whether generation completed successfully
3990
- * status (str): Status of generation (processing/completed/failed)
3991
- * data (Dict[str, str], optional): Generated text with fields:
3992
- - llmstxt (str): Generated LLMs.txt content
3993
- - llmsfulltxt (str, optional): Full version if requested
3994
- * error (str, optional): Error message if generation failed
3995
- * expiresAt (str): When the generated data expires
3996
-
3997
- Raises:
3998
- Exception: If generation fails
3999
- """
4000
- params = {}
4001
- if max_urls is not None:
4002
- params['maxUrls'] = max_urls
4003
- if show_full_text is not None:
4004
- params['showFullText'] = show_full_text
4005
- if experimental_stream is not None:
4006
- params['__experimental_stream'] = experimental_stream
4007
-
4008
- response = await self.async_generate_llms_text(
4009
- url,
4010
- max_urls=max_urls,
4011
- show_full_text=show_full_text,
4012
- cache=cache,
4013
- experimental_stream=experimental_stream
4014
- )
4015
- if not response.get('success') or 'id' not in response:
4016
- return response
4017
-
4018
- job_id = response['id']
4019
- while True:
4020
- status = await self.check_generate_llms_text_status(job_id)
4021
-
4022
- if status['status'] == 'completed':
4023
- return status
4024
- elif status['status'] == 'failed':
4025
- raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
4026
- elif status['status'] != 'processing':
4027
- break
4028
-
4029
- await asyncio.sleep(2)
4030
-
4031
- return GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly')
4032
-
4033
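A hedged sketch of the blocking LLMs.txt generation above; the completed status is returned as the parsed JSON payload. Class name, key, and URL are placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # hypothetical key
    result = await app.generate_llms_text(
        "https://example.com", max_urls=5, show_full_text=True
    )
    if isinstance(result, dict) and result.get("status") == "completed":
        print(result["data"]["llmstxt"][:200])

asyncio.run(main())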
- async def async_generate_llms_text(
4034
- self,
4035
- url: str,
4036
- *,
4037
- max_urls: Optional[int] = None,
4038
- show_full_text: Optional[bool] = None,
4039
- cache: Optional[bool] = None,
4040
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
4041
- """
4042
- Initiate an asynchronous LLMs.txt generation job without waiting for completion.
4043
-
4044
- Args:
4045
- url (str): Target URL to generate LLMs.txt from
4046
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
4047
- show_full_text (Optional[bool]): Include full text in output (default: False)
4048
- cache (Optional[bool]): Whether to use cached content if available (default: True)
4049
- experimental_stream (Optional[bool]): Enable experimental streaming
4050
-
4051
- Returns:
4052
- GenerateLLMsTextResponse containing:
4053
- * success (bool): Whether job started successfully
4054
- * id (str): Unique identifier for the job
4055
- * error (str, optional): Error message if start failed
4056
-
4057
- Raises:
4058
- ValueError: If job initiation fails
4059
- """
4060
- params = {}
4061
- if max_urls is not None:
4062
- params['maxUrls'] = max_urls
4063
- if show_full_text is not None:
4064
- params['showFullText'] = show_full_text
4065
- if experimental_stream is not None:
4066
- params['__experimental_stream'] = experimental_stream
4067
-
4068
- params = GenerateLLMsTextParams(
4069
- maxUrls=max_urls,
4070
- showFullText=show_full_text,
4071
- cache=cache,
4072
- __experimental_stream=experimental_stream
4073
- )
4074
-
4075
- headers = self._prepare_headers()
4076
- json_data = {'url': url, **params.dict(exclude_none=True)}
4077
- json_data['origin'] = f"python-sdk@{version}"
4078
-
4079
- try:
4080
- return await self._async_post_request(
4081
- f'{self.api_url}/v1/llmstxt',
4082
- json_data,
4083
- headers
4084
- )
4085
- except Exception as e:
4086
- raise ValueError(str(e))
4087
-
4088
- async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
4089
- """
4090
- Check the status of an asynchronous LLMs.txt generation job.
4091
-
4092
- Args:
4093
- id (str): The ID of the generation job
4094
-
4095
- Returns:
4096
- GenerateLLMsTextStatusResponse containing:
4097
- * success (bool): Whether generation completed successfully
4098
- * status (str): Status of generation (processing/completed/failed)
4099
- * data (Dict[str, str], optional): Generated text with fields:
4100
- - llmstxt (str): Generated LLMs.txt content
4101
- - llmsfulltxt (str, optional): Full version if requested
4102
- * error (str, optional): Error message if generation failed
4103
- * expiresAt (str): When the generated data expires
4104
-
4105
- Raises:
4106
- ValueError: If status check fails
4107
- """
4108
- headers = self._prepare_headers()
4109
- try:
4110
- return await self._async_get_request(
4111
- f'{self.api_url}/v1/llmstxt/{id}',
4112
- headers
4113
- )
4114
- except Exception as e:
4115
- raise ValueError(str(e))
4116
-
4117
- async def deep_research(
- self,
- query: str,
- *,
- max_depth: Optional[int] = None,
- time_limit: Optional[int] = None,
- max_urls: Optional[int] = None,
- analysis_prompt: Optional[str] = None,
- system_prompt: Optional[str] = None,
- __experimental_stream_steps: Optional[bool] = None,
- on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
- on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
- """
- Initiates a deep research operation on a given query and polls until completion.
-
- Args:
- query (str): Research query or topic to investigate
- max_depth (Optional[int]): Maximum depth of research exploration
- time_limit (Optional[int]): Time limit in seconds for research
- max_urls (Optional[int]): Maximum number of URLs to process
- analysis_prompt (Optional[str]): Custom prompt for analysis
- system_prompt (Optional[str]): Custom system prompt
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
- on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
- on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
-
- Returns:
- DeepResearchStatusResponse containing:
- * success (bool): Whether research completed successfully
- * status (str): Current state (processing/completed/failed)
- * error (Optional[str]): Error message if failed
- * id (str): Unique identifier for the research job
- * data (Any): Research findings and analysis
- * sources (List[Dict]): List of discovered sources
- * activities (List[Dict]): Research progress log
- * summaries (List[str]): Generated research summaries
-
- Raises:
- Exception: If research fails
- """
- research_params = {}
- if max_depth is not None:
- research_params['maxDepth'] = max_depth
- if time_limit is not None:
- research_params['timeLimit'] = time_limit
- if max_urls is not None:
- research_params['maxUrls'] = max_urls
- if analysis_prompt is not None:
- research_params['analysisPrompt'] = analysis_prompt
- if system_prompt is not None:
- research_params['systemPrompt'] = system_prompt
- if __experimental_stream_steps is not None:
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
- research_params = DeepResearchParams(**research_params)
-
- response = await self.async_deep_research(
- query,
- max_depth=max_depth,
- time_limit=time_limit,
- max_urls=max_urls,
- analysis_prompt=analysis_prompt,
- system_prompt=system_prompt
- )
- if not response.get('success') or 'id' not in response:
- return response
-
- job_id = response['id']
- last_activity_count = 0
- last_source_count = 0
-
- while True:
- status = await self.check_deep_research_status(job_id)
-
- if on_activity and 'activities' in status:
- new_activities = status['activities'][last_activity_count:]
- for activity in new_activities:
- on_activity(activity)
- last_activity_count = len(status['activities'])
-
- if on_source and 'sources' in status:
- new_sources = status['sources'][last_source_count:]
- for source in new_sources:
- on_source(source)
- last_source_count = len(status['sources'])
-
- if status['status'] == 'completed':
- return status
- elif status['status'] == 'failed':
- raise Exception(f'Deep research failed. Error: {status.get("error")}')
- elif status['status'] != 'processing':
- break
-
- await asyncio.sleep(2)
-
- return DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')
-
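The wrapper above already owns the waiting and the callback fan-out, so a caller mostly supplies the callbacks. A hedged sketch, assuming an existing AsyncFirecrawlApp named app; the callback payload keys are the ones listed in the docstring, and the query and limits are placeholders.

    import asyncio

    def log_activity(activity: dict) -> None:
        # Keys per the docstring: type, status, message, timestamp, depth.
        print(f"[depth {activity.get('depth')}] {activity.get('message')}")

    def log_source(source: dict) -> None:
        # Keys per the docstring: url, title, description.
        print("source:", source.get("url"))

    async def run_research(app):
        result = await app.deep_research(
            "How do WebSocket heartbeats work?",  # placeholder query
            max_depth=3,
            time_limit=120,            # seconds, per the docstring
            on_activity=log_activity,
            on_source=log_source,
        )
        return result.get("data")      # findings/analysis; shape not specified in this hunk

    # asyncio.run(run_research(app))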
- async def async_deep_research(
- self,
- query: str,
- *,
- max_depth: Optional[int] = None,
- time_limit: Optional[int] = None,
- max_urls: Optional[int] = None,
- analysis_prompt: Optional[str] = None,
- system_prompt: Optional[str] = None,
- __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
- """
- Initiates an asynchronous deep research operation.
-
- Args:
- query (str): Research query or topic to investigate
- max_depth (Optional[int]): Maximum depth of research exploration
- time_limit (Optional[int]): Time limit in seconds for research
- max_urls (Optional[int]): Maximum number of URLs to process
- analysis_prompt (Optional[str]): Custom prompt for analysis
- system_prompt (Optional[str]): Custom system prompt
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
-
- Returns:
- Dict[str, Any]: A response containing:
- * success (bool): Whether the research initiation was successful
- * id (str): The unique identifier for the research job
- * error (str, optional): Error message if initiation failed
-
- Raises:
- Exception: If the research initiation fails.
- """
- research_params = {}
- if max_depth is not None:
- research_params['maxDepth'] = max_depth
- if time_limit is not None:
- research_params['timeLimit'] = time_limit
- if max_urls is not None:
- research_params['maxUrls'] = max_urls
- if analysis_prompt is not None:
- research_params['analysisPrompt'] = analysis_prompt
- if system_prompt is not None:
- research_params['systemPrompt'] = system_prompt
- if __experimental_stream_steps is not None:
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
- research_params = DeepResearchParams(**research_params)
-
- headers = self._prepare_headers()
-
- json_data = {'query': query, **research_params.dict(exclude_none=True)}
- json_data['origin'] = f"python-sdk@{version}"
-
- try:
- return await self._async_post_request(
- f'{self.api_url}/v1/deep-research',
- json_data,
- headers
- )
- except Exception as e:
- raise ValueError(str(e))
-
- async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
- """
- Check the status of a deep research operation.
-
- Args:
- id (str): The ID of the deep research operation.
-
- Returns:
- DeepResearchResponse containing:
-
- Status:
- * success - Whether research completed successfully
- * status - Current state (processing/completed/failed)
- * error - Error message if failed
-
- Results:
- * id - Unique identifier for the research job
- * data - Research findings and analysis
- * sources - List of discovered sources
- * activities - Research progress log
- * summaries - Generated research summaries
-
- Raises:
- Exception: If the status check fails.
- """
- headers = self._prepare_headers()
- try:
- return await self._async_get_request(
- f'{self.api_url}/v1/deep-research/{id}',
- headers
- )
- except Exception as e:
- raise ValueError(str(e))
-
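Callers that want to own the polling loop (for example to enforce an overall timeout) can combine the two lower-level methods above directly. A hedged sketch assuming an app instance; the 2-second sleep mirrors the interval used by the built-in loop, and the timeout handling is illustrative rather than part of the SDK.

    import asyncio

    async def deep_research_with_timeout(app, query: str, max_wait: float = 300.0):
        started = await app.async_deep_research(query, max_depth=2)
        if not started.get("success") or "id" not in started:
            raise RuntimeError(started.get("error", "could not start research"))
        job_id = started["id"]

        loop = asyncio.get_running_loop()
        deadline = loop.time() + max_wait
        while loop.time() < deadline:
            status = await app.check_deep_research_status(job_id)
            if status["status"] == "completed":
                return status
            if status["status"] == "failed":
                raise RuntimeError(status.get("error", "research failed"))
            await asyncio.sleep(2)   # same cadence as the built-in polling loop
        raise TimeoutError(f"deep research {job_id} did not finish within {max_wait}s")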
- async def search(
- self,
- query: str,
- *,
- limit: Optional[int] = None,
- tbs: Optional[str] = None,
- filter: Optional[str] = None,
- lang: Optional[str] = None,
- country: Optional[str] = None,
- location: Optional[str] = None,
- timeout: Optional[int] = None,
- scrape_options: Optional[ScrapeOptions] = None,
- params: Optional[Union[Dict[str, Any], SearchParams]] = None,
- **kwargs) -> SearchResponse:
- """
- Asynchronously search for content using Firecrawl.
-
- Args:
- query (str): Search query string
- limit (Optional[int]): Max results (default: 5)
- tbs (Optional[str]): Time filter (e.g. "qdr:d")
- filter (Optional[str]): Custom result filter
- lang (Optional[str]): Language code (default: "en")
- country (Optional[str]): Country code (default: "us")
- location (Optional[str]): Geo-targeting
- timeout (Optional[int]): Request timeout in milliseconds
- scrape_options (Optional[ScrapeOptions]): Result scraping configuration
- params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
- **kwargs: Additional keyword arguments for future compatibility
-
- Returns:
- SearchResponse: Response containing:
- * success (bool): Whether request succeeded
- * data (List[FirecrawlDocument]): Search results
- * warning (Optional[str]): Warning message if any
- * error (Optional[str]): Error message if any
-
- Raises:
- Exception: If search fails or response cannot be parsed
- """
- # Build search parameters
- search_params = {}
- if params:
- if isinstance(params, dict):
- search_params.update(params)
- else:
- search_params.update(params.dict(exclude_none=True))
-
- # Add individual parameters
- if limit is not None:
- search_params['limit'] = limit
- if tbs is not None:
- search_params['tbs'] = tbs
- if filter is not None:
- search_params['filter'] = filter
- if lang is not None:
- search_params['lang'] = lang
- if country is not None:
- search_params['country'] = country
- if location is not None:
- search_params['location'] = location
- if timeout is not None:
- search_params['timeout'] = timeout
- if scrape_options is not None:
- search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
-
- # Add any additional kwargs
- search_params.update(kwargs)
-
- # Create final params object
- final_params = SearchParams(query=query, **search_params)
- params_dict = final_params.dict(exclude_none=True)
- params_dict['origin'] = f"python-sdk@{version}"
-
- return await self._async_post_request(
- f"{self.api_url}/v1/search",
- params_dict,
- {"Authorization": f"Bearer {self.api_key}"}
- )
-
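A hedged sketch of the async search call above. It assumes ScrapeOptions is importable from the firecrawl package (the class is referenced in the signature, but its fields are not shown in this hunk, so the formats field is an assumption), and it treats the response as the plain dict the raw POST helper returns. Query, key, and option values are placeholders.

    import asyncio
    from firecrawl import AsyncFirecrawlApp, ScrapeOptions  # assumed exports

    async def search_docs(app):
        results = await app.search(
            "firecrawl python sdk",   # placeholder query
            limit=5,                   # documented default is 5
            tbs="qdr:d",               # past-day time filter, as in the docstring example
            scrape_options=ScrapeOptions(formats=["markdown"]),  # formats is an assumed field name
        )
        # The docstring documents a `data` list of search results.
        for doc in results.get("data", []):
            print(doc.get("url"), "-", doc.get("title"))

    # asyncio.run(search_docs(AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")))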
- class AsyncCrawlWatcher(CrawlWatcher):
- """
- Async version of CrawlWatcher that properly handles async operations.
- """
- def __init__(self, id: str, app: AsyncFirecrawlApp):
- super().__init__(id, app)
-
- async def connect(self) -> None:
- """
- Establishes async WebSocket connection and starts listening for messages.
- """
- async with websockets.connect(
- self.ws_url,
- additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
- ) as websocket:
- await self._listen(websocket)
-
- async def _listen(self, websocket) -> None:
- """
- Listens for incoming WebSocket messages and handles them asynchronously.
-
- Args:
- websocket: The WebSocket connection object
- """
- async for message in websocket:
- msg = json.loads(message)
- await self._handle_message(msg)
-
- async def _handle_message(self, msg: Dict[str, Any]) -> None:
- """
- Handles incoming WebSocket messages based on their type asynchronously.
-
- Args:
- msg (Dict[str, Any]): The message to handle
- """
- if msg['type'] == 'done':
- self.status = 'completed'
- self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
- elif msg['type'] == 'error':
- self.status = 'failed'
- self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
- elif msg['type'] == 'catchup':
- self.status = msg['data']['status']
- self.data.extend(msg['data'].get('data', []))
- for doc in self.data:
- self.dispatch_event('document', {'data': doc, 'id': self.id})
- elif msg['type'] == 'document':
- self.data.append(msg['data'])
- self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
-
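A hedged sketch of wiring up the watcher above. It assumes the base CrawlWatcher (not shown in this hunk) exposes an add_event_listener(event_type, handler) registration method that dispatch_event fans out to, that AsyncCrawlWatcher is importable from the firecrawl.firecrawl module this diff covers, and that a crawl id comes from a crawl started elsewhere in the SDK.

    import asyncio
    from firecrawl.firecrawl import AsyncCrawlWatcher  # assumed import location

    async def watch_crawl(app, crawl_id: str):
        watcher = AsyncCrawlWatcher(crawl_id, app)

        # Handlers receive the payloads dispatched in _handle_message above.
        def on_document(detail: dict) -> None:
            print("document received for crawl", detail["id"])

        def on_done(detail: dict) -> None:
            print("crawl finished with", len(detail["data"]), "documents")

        # add_event_listener is assumed from the unshown CrawlWatcher base class.
        watcher.add_event_listener("document", on_document)
        watcher.add_event_listener("done", on_done)

        # connect() holds the WebSocket open and dispatches events as they arrive.
        await watcher.connect()

    # asyncio.run(watch_crawl(app, "crawl-id"))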
- async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
- """
- Handle errors from async API responses.
- """
- try:
- error_data = await response.json()
- error_message = error_data.get('error', 'No error message provided.')
- error_details = error_data.get('details', 'No additional error details provided.')
- except:
- raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
-
- # Use the app's method to get the error message
- message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
-
- raise aiohttp.ClientError(message)
-
- async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
- """
- Generate a standardized error message based on HTTP status code for async operations.
-
- Args:
- status_code (int): The HTTP status code from the response
- action (str): Description of the action that was being performed
- error_message (str): The error message from the API response
- error_details (str): Additional error details from the API response
- """
- return self._get_error_message(status_code, action, error_message, error_details)
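Since HTTP failures are surfaced by _handle_error as aiohttp.ClientError and most of the public async methods above re-raise whatever they catch as ValueError(str(e)), a caller that wants a single guard can hedge for both. A small illustrative sketch; the method call is the placeholder start call from earlier.

    import aiohttp

    async def safe_start(app, url: str):
        try:
            return await app.async_generate_llms_text(url)
        except ValueError as exc:
            # Most async methods above wrap failures as ValueError(str(e)).
            print("Firecrawl request failed:", exc)
        except aiohttp.ClientError as exc:
            # Transport/HTTP errors raised directly by _handle_error.
            print("HTTP-level failure:", exc)
        return None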