firecrawl 2.5.4__py3-none-any.whl → 2.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information it contains is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.

Potentially problematic release.


This version of firecrawl might be problematic.

Files changed (50)
  1. firecrawl/__init__.py +1 -1
  2. firecrawl/firecrawl.py +13 -0
  3. {firecrawl-2.5.4.dist-info → firecrawl-2.7.0.dist-info}/LICENSE +0 -0
  4. {firecrawl-2.5.4.dist-info → firecrawl-2.7.0.dist-info}/METADATA +1 -1
  5. firecrawl-2.7.0.dist-info/RECORD +12 -0
  6. {firecrawl-2.5.4.dist-info → firecrawl-2.7.0.dist-info}/top_level.txt +0 -2
  7. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
  8. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  9. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  10. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  11. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  12. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4454
  13. build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
  14. build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
  15. build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  16. build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  17. build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  18. build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  19. build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4454
  20. build/lib/build/lib/build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
  21. build/lib/build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
  22. build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  23. build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  24. build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  25. build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  26. build/lib/build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4454
  27. build/lib/build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
  28. build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
  29. build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  30. build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  31. build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  32. build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  33. build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4454
  34. build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
  35. build/lib/build/lib/firecrawl/__init__.py +0 -79
  36. build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  37. build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  38. build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  39. build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  40. build/lib/build/lib/firecrawl/firecrawl.py +0 -4454
  41. build/lib/build/lib/tests/test_change_tracking.py +0 -98
  42. build/lib/firecrawl/__init__.py +0 -79
  43. build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  44. build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  45. build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  46. build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  47. build/lib/firecrawl/firecrawl.py +0 -4454
  48. build/lib/tests/test_change_tracking.py +0 -98
  49. firecrawl-2.5.4.dist-info/RECORD +0 -54
  50. {firecrawl-2.5.4.dist-info → firecrawl-2.7.0.dist-info}/WHEEL +0 -0
@@ -1,4454 +0,0 @@
1
- """
2
- FirecrawlApp Module
3
-
4
- This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
5
- It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
6
- and check the status of these jobs. The module uses requests for HTTP communication
7
- and handles retries for certain HTTP status codes.
8
-
9
- Classes:
10
- - FirecrawlApp: Main class for interacting with the Firecrawl API.
11
- """
12
- import logging
13
- import os
14
- import time
15
- from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
16
- import json
17
- from datetime import datetime
18
- import re
19
- import warnings
20
- import requests
21
- import pydantic
22
- import websockets
23
- import aiohttp
24
- import asyncio
25
- from pydantic import Field
26
-
27
- # Suppress Pydantic warnings about attribute shadowing
28
- warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
29
- warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
30
- warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
31
- warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
32
- warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingOptions\" shadows an attribute in parent \"BaseModel\"")
33
-
34
- def get_version():
35
- try:
36
- from pathlib import Path
37
- package_path = os.path.dirname(__file__)
38
- version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
39
- version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
40
- if version_match:
41
- return version_match.group(1).strip()
42
- except Exception:
43
- print("Failed to get version from __init__.py")
44
- return None
45
-
46
- version = get_version()
47
-
48
- logger : logging.Logger = logging.getLogger("firecrawl")
49
-
50
- T = TypeVar('T')
51
-
52
- # class FirecrawlDocumentMetadata(pydantic.BaseModel):
53
- # """Metadata for a Firecrawl document."""
54
- # title: Optional[str] = None
55
- # description: Optional[str] = None
56
- # language: Optional[str] = None
57
- # keywords: Optional[str] = None
58
- # robots: Optional[str] = None
59
- # ogTitle: Optional[str] = None
60
- # ogDescription: Optional[str] = None
61
- # ogUrl: Optional[str] = None
62
- # ogImage: Optional[str] = None
63
- # ogAudio: Optional[str] = None
64
- # ogDeterminer: Optional[str] = None
65
- # ogLocale: Optional[str] = None
66
- # ogLocaleAlternate: Optional[List[str]] = None
67
- # ogSiteName: Optional[str] = None
68
- # ogVideo: Optional[str] = None
69
- # dctermsCreated: Optional[str] = None
70
- # dcDateCreated: Optional[str] = None
71
- # dcDate: Optional[str] = None
72
- # dctermsType: Optional[str] = None
73
- # dcType: Optional[str] = None
74
- # dctermsAudience: Optional[str] = None
75
- # dctermsSubject: Optional[str] = None
76
- # dcSubject: Optional[str] = None
77
- # dcDescription: Optional[str] = None
78
- # dctermsKeywords: Optional[str] = None
79
- # modifiedTime: Optional[str] = None
80
- # publishedTime: Optional[str] = None
81
- # articleTag: Optional[str] = None
82
- # articleSection: Optional[str] = None
83
- # sourceURL: Optional[str] = None
84
- # statusCode: Optional[int] = None
85
- # error: Optional[str] = None
86
-
87
- class AgentOptions(pydantic.BaseModel):
88
- """Configuration for the agent."""
89
- model: Literal["FIRE-1"] = "FIRE-1"
90
- prompt: Optional[str] = None
91
-
92
- class AgentOptionsExtract(pydantic.BaseModel):
93
- """Configuration for the agent in extract operations."""
94
- model: Literal["FIRE-1"] = "FIRE-1"
95
-
96
- class ActionsResult(pydantic.BaseModel):
97
- """Result of actions performed during scraping."""
98
- screenshots: List[str]
99
-
100
- class ChangeTrackingData(pydantic.BaseModel):
101
- """
102
- Data for the change tracking format.
103
- """
104
- previousScrapeAt: Optional[str] = None
105
- changeStatus: str # "new" | "same" | "changed" | "removed"
106
- visibility: str # "visible" | "hidden"
107
- diff: Optional[Dict[str, Any]] = None
108
- json: Optional[Any] = None
109
-
110
- class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
111
- """Document retrieved or processed by Firecrawl."""
112
- url: Optional[str] = None
113
- markdown: Optional[str] = None
114
- html: Optional[str] = None
115
- rawHtml: Optional[str] = None
116
- links: Optional[List[str]] = None
117
- extract: Optional[T] = None
118
- json: Optional[T] = None
119
- screenshot: Optional[str] = None
120
- metadata: Optional[Any] = None
121
- actions: Optional[ActionsResult] = None
122
- title: Optional[str] = None # v1 search only
123
- description: Optional[str] = None # v1 search only
124
- changeTracking: Optional[ChangeTrackingData] = None
125
-
126
- class LocationConfig(pydantic.BaseModel):
127
- """Location configuration for scraping."""
128
- country: Optional[str] = None
129
- languages: Optional[List[str]] = None
130
-
131
- class WebhookConfig(pydantic.BaseModel):
132
- """Configuration for webhooks."""
133
- url: str
134
- headers: Optional[Dict[str, str]] = None
135
- metadata: Optional[Dict[str, str]] = None
136
- events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
137
-
138
- class ChangeTrackingOptions(pydantic.BaseModel):
139
- """Configuration for change tracking."""
140
- modes: Optional[List[Literal["git-diff", "json"]]] = None
141
- schema: Optional[Any] = None
142
- prompt: Optional[str] = None
143
-
144
- class ScrapeOptions(pydantic.BaseModel):
145
- """Parameters for scraping operations."""
146
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
147
- headers: Optional[Dict[str, str]] = None
148
- includeTags: Optional[List[str]] = None
149
- excludeTags: Optional[List[str]] = None
150
- onlyMainContent: Optional[bool] = None
151
- waitFor: Optional[int] = None
152
- timeout: Optional[int] = None
153
- location: Optional[LocationConfig] = None
154
- mobile: Optional[bool] = None
155
- skipTlsVerification: Optional[bool] = None
156
- removeBase64Images: Optional[bool] = None
157
- blockAds: Optional[bool] = None
158
- proxy: Optional[Literal["basic", "stealth"]] = None
159
- changeTrackingOptions: Optional[ChangeTrackingOptions] = None
160
-
161
- class WaitAction(pydantic.BaseModel):
162
- """Wait action to perform during scraping."""
163
- type: Literal["wait"]
164
- milliseconds: Optional[int] = None
165
- selector: Optional[str] = None
166
-
167
- class ScreenshotAction(pydantic.BaseModel):
168
- """Screenshot action to perform during scraping."""
169
- type: Literal["screenshot"]
170
- fullPage: Optional[bool] = None
171
-
172
- class ClickAction(pydantic.BaseModel):
173
- """Click action to perform during scraping."""
174
- type: Literal["click"]
175
- selector: str
176
-
177
- class WriteAction(pydantic.BaseModel):
178
- """Write action to perform during scraping."""
179
- type: Literal["write"]
180
- text: str
181
-
182
- class PressAction(pydantic.BaseModel):
183
- """Press action to perform during scraping."""
184
- type: Literal["press"]
185
- key: str
186
-
187
- class ScrollAction(pydantic.BaseModel):
188
- """Scroll action to perform during scraping."""
189
- type: Literal["scroll"]
190
- direction: Literal["up", "down"]
191
- selector: Optional[str] = None
192
-
193
- class ScrapeAction(pydantic.BaseModel):
194
- """Scrape action to perform during scraping."""
195
- type: Literal["scrape"]
196
-
197
- class ExecuteJavascriptAction(pydantic.BaseModel):
198
- """Execute javascript action to perform during scraping."""
199
- type: Literal["executeJavascript"]
200
- script: str
201
-
202
-
203
- class ExtractAgent(pydantic.BaseModel):
204
- """Configuration for the agent in extract operations."""
205
- model: Literal["FIRE-1"] = "FIRE-1"
206
-
207
- class JsonConfig(pydantic.BaseModel):
208
- """Configuration for extraction."""
209
- prompt: Optional[str] = None
210
- schema: Optional[Any] = None
211
- systemPrompt: Optional[str] = None
212
- agent: Optional[ExtractAgent] = None
213
-
214
- class ScrapeParams(ScrapeOptions):
215
- """Parameters for scraping operations."""
216
- extract: Optional[JsonConfig] = None
217
- jsonOptions: Optional[JsonConfig] = None
218
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
219
- agent: Optional[AgentOptions] = None
220
- webhook: Optional[WebhookConfig] = None
221
-
222
- class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
223
- """Response from scraping operations."""
224
- success: bool = True
225
- warning: Optional[str] = None
226
- error: Optional[str] = None
227
-
228
- class BatchScrapeResponse(pydantic.BaseModel):
229
- """Response from batch scrape operations."""
230
- id: Optional[str] = None
231
- url: Optional[str] = None
232
- success: bool = True
233
- error: Optional[str] = None
234
- invalidURLs: Optional[List[str]] = None
235
-
236
- class BatchScrapeStatusResponse(pydantic.BaseModel):
237
- """Response from batch scrape status checks."""
238
- success: bool = True
239
- status: Literal["scraping", "completed", "failed", "cancelled"]
240
- completed: int
241
- total: int
242
- creditsUsed: int
243
- expiresAt: datetime
244
- next: Optional[str] = None
245
- data: List[FirecrawlDocument]
246
-
247
- class CrawlParams(pydantic.BaseModel):
248
- """Parameters for crawling operations."""
249
- includePaths: Optional[List[str]] = None
250
- excludePaths: Optional[List[str]] = None
251
- maxDepth: Optional[int] = None
252
- maxDiscoveryDepth: Optional[int] = None
253
- limit: Optional[int] = None
254
- allowBackwardLinks: Optional[bool] = None
255
- allowExternalLinks: Optional[bool] = None
256
- ignoreSitemap: Optional[bool] = None
257
- scrapeOptions: Optional[ScrapeOptions] = None
258
- webhook: Optional[Union[str, WebhookConfig]] = None
259
- deduplicateSimilarURLs: Optional[bool] = None
260
- ignoreQueryParameters: Optional[bool] = None
261
- regexOnFullURL: Optional[bool] = None
262
- delay: Optional[int] = None # Delay in seconds between scrapes
263
-
264
- class CrawlResponse(pydantic.BaseModel):
265
- """Response from crawling operations."""
266
- id: Optional[str] = None
267
- url: Optional[str] = None
268
- success: bool = True
269
- error: Optional[str] = None
270
-
271
- class CrawlStatusResponse(pydantic.BaseModel):
272
- """Response from crawl status checks."""
273
- success: bool = True
274
- status: Literal["scraping", "completed", "failed", "cancelled"]
275
- completed: int
276
- total: int
277
- creditsUsed: int
278
- expiresAt: datetime
279
- next: Optional[str] = None
280
- data: List[FirecrawlDocument]
281
-
282
- class CrawlErrorsResponse(pydantic.BaseModel):
283
- """Response from crawl/batch scrape error monitoring."""
284
- errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str}
285
- robotsBlocked: List[str]
286
-
287
- class MapParams(pydantic.BaseModel):
288
- """Parameters for mapping operations."""
289
- search: Optional[str] = None
290
- ignoreSitemap: Optional[bool] = None
291
- includeSubdomains: Optional[bool] = None
292
- sitemapOnly: Optional[bool] = None
293
- limit: Optional[int] = None
294
- timeout: Optional[int] = None
295
-
296
- class MapResponse(pydantic.BaseModel):
297
- """Response from mapping operations."""
298
- success: bool = True
299
- links: Optional[List[str]] = None
300
- error: Optional[str] = None
301
-
302
- class ExtractParams(pydantic.BaseModel):
303
- """Parameters for extracting information from URLs."""
304
- prompt: Optional[str] = None
305
- schema: Optional[Any] = None
306
- systemPrompt: Optional[str] = None
307
- allowExternalLinks: Optional[bool] = None
308
- enableWebSearch: Optional[bool] = None
309
- includeSubdomains: Optional[bool] = None
310
- origin: Optional[str] = None
311
- showSources: Optional[bool] = None
312
- scrapeOptions: Optional[ScrapeOptions] = None
313
-
314
- class ExtractResponse(pydantic.BaseModel, Generic[T]):
315
- """Response from extract operations."""
316
- id: Optional[str] = None
317
- status: Optional[Literal["processing", "completed", "failed"]] = None
318
- expiresAt: Optional[datetime] = None
319
- success: bool = True
320
- data: Optional[T] = None
321
- error: Optional[str] = None
322
- warning: Optional[str] = None
323
- sources: Optional[List[str]] = None
324
-
325
- class SearchParams(pydantic.BaseModel):
326
- query: str
327
- limit: Optional[int] = 5
328
- tbs: Optional[str] = None
329
- filter: Optional[str] = None
330
- lang: Optional[str] = "en"
331
- country: Optional[str] = "us"
332
- location: Optional[str] = None
333
- origin: Optional[str] = "api"
334
- timeout: Optional[int] = 60000
335
- scrapeOptions: Optional[ScrapeOptions] = None
336
-
337
- class SearchResponse(pydantic.BaseModel):
338
- """Response from search operations."""
339
- success: bool = True
340
- data: List[FirecrawlDocument]
341
- warning: Optional[str] = None
342
- error: Optional[str] = None
343
-
344
- class GenerateLLMsTextParams(pydantic.BaseModel):
345
- """
346
- Parameters for the LLMs.txt generation operation.
347
- """
348
- maxUrls: Optional[int] = 10
349
- showFullText: Optional[bool] = False
350
- __experimental_stream: Optional[bool] = None
351
-
352
- class DeepResearchParams(pydantic.BaseModel):
353
- """
354
- Parameters for the deep research operation.
355
- """
356
- maxDepth: Optional[int] = 7
357
- timeLimit: Optional[int] = 270
358
- maxUrls: Optional[int] = 20
359
- analysisPrompt: Optional[str] = None
360
- systemPrompt: Optional[str] = None
361
- __experimental_streamSteps: Optional[bool] = None
362
-
363
- class DeepResearchResponse(pydantic.BaseModel):
364
- """
365
- Response from the deep research operation.
366
- """
367
- success: bool
368
- id: str
369
- error: Optional[str] = None
370
-
371
- class DeepResearchStatusResponse(pydantic.BaseModel):
372
- """
373
- Status response from the deep research operation.
374
- """
375
- success: bool
376
- data: Optional[Dict[str, Any]] = None
377
- status: str
378
- error: Optional[str] = None
379
- expiresAt: str
380
- currentDepth: int
381
- maxDepth: int
382
- activities: List[Dict[str, Any]]
383
- sources: List[Dict[str, Any]]
384
- summaries: List[str]
385
-
386
- class GenerateLLMsTextResponse(pydantic.BaseModel):
387
- """Response from LLMs.txt generation operations."""
388
- success: bool = True
389
- id: str
390
- error: Optional[str] = None
391
-
392
- class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
393
- llmstxt: str
394
- llmsfulltxt: Optional[str] = None
395
-
396
- class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
397
- """Status response from LLMs.txt generation operations."""
398
- success: bool = True
399
- data: Optional[GenerateLLMsTextStatusResponseData] = None
400
- status: Literal["processing", "completed", "failed"]
401
- error: Optional[str] = None
402
- expiresAt: str
403
-
404
- class SearchResponse(pydantic.BaseModel):
405
- """
406
- Response from the search operation.
407
- """
408
- success: bool
409
- data: List[Dict[str, Any]]
410
- warning: Optional[str] = None
411
- error: Optional[str] = None
412
-
413
- class ExtractParams(pydantic.BaseModel):
414
- """
415
- Parameters for the extract operation.
416
- """
417
- prompt: Optional[str] = None
418
- schema: Optional[Any] = pydantic.Field(None, alias='schema')
419
- system_prompt: Optional[str] = None
420
- allow_external_links: Optional[bool] = False
421
- enable_web_search: Optional[bool] = False
422
- # Just for backwards compatibility
423
- enableWebSearch: Optional[bool] = False
424
- show_sources: Optional[bool] = False
425
- agent: Optional[Dict[str, Any]] = None
426
-
427
- class FirecrawlApp:
428
- def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
429
- """
430
- Initialize the FirecrawlApp instance with API key, API URL.
431
-
432
- Args:
433
- api_key (Optional[str]): API key for authenticating with the Firecrawl API.
434
- api_url (Optional[str]): Base URL for the Firecrawl API.
435
- """
436
- self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
437
- self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
438
-
439
- # Only require API key when using cloud service
440
- if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
441
- logger.warning("No API key provided for cloud service")
442
- raise ValueError('No API key provided')
443
-
444
- logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
445
-
446
- def scrape_url(
447
- self,
448
- url: str,
449
- *,
450
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
451
- include_tags: Optional[List[str]] = None,
452
- exclude_tags: Optional[List[str]] = None,
453
- only_main_content: Optional[bool] = None,
454
- wait_for: Optional[int] = None,
455
- timeout: Optional[int] = None,
456
- location: Optional[LocationConfig] = None,
457
- mobile: Optional[bool] = None,
458
- skip_tls_verification: Optional[bool] = None,
459
- remove_base64_images: Optional[bool] = None,
460
- block_ads: Optional[bool] = None,
461
- proxy: Optional[Literal["basic", "stealth"]] = None,
462
- extract: Optional[JsonConfig] = None,
463
- json_options: Optional[JsonConfig] = None,
464
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
465
- change_tracking_options: Optional[ChangeTrackingOptions] = None,
466
- **kwargs) -> ScrapeResponse[Any]:
467
- """
468
- Scrape and extract content from a URL.
469
-
470
- Args:
471
- url (str): Target URL to scrape
472
- formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
473
- include_tags (Optional[List[str]]): HTML tags to include
474
- exclude_tags (Optional[List[str]]): HTML tags to exclude
475
- only_main_content (Optional[bool]): Extract main content only
476
- wait_for (Optional[int]): Wait for a specific element to appear
477
- timeout (Optional[int]): Request timeout (ms)
478
- location (Optional[LocationConfig]): Location configuration
479
- mobile (Optional[bool]): Use mobile user agent
480
- skip_tls_verification (Optional[bool]): Skip TLS verification
481
- remove_base64_images (Optional[bool]): Remove base64 images
482
- block_ads (Optional[bool]): Block ads
483
- proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
484
- extract (Optional[JsonConfig]): Content extraction settings
485
- json_options (Optional[JsonConfig]): JSON extraction settings
486
- actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
487
- change_tracking_options (Optional[ChangeTrackingOptions]): Change tracking settings
488
-
489
-
490
- Returns:
491
- ScrapeResponse with:
492
- * Requested content formats
493
- * Page metadata
494
- * Extraction results
495
- * Success/error status
496
-
497
- Raises:
498
- Exception: If scraping fails
499
- """
500
- headers = self._prepare_headers()
501
-
502
- # Build scrape parameters
503
- scrape_params = {
504
- 'url': url,
505
- 'origin': f"python-sdk@{version}"
506
- }
507
-
508
- # Add optional parameters if provided
509
- if formats:
510
- scrape_params['formats'] = formats
511
- if include_tags:
512
- scrape_params['includeTags'] = include_tags
513
- if exclude_tags:
514
- scrape_params['excludeTags'] = exclude_tags
515
- if only_main_content is not None:
516
- scrape_params['onlyMainContent'] = only_main_content
517
- if wait_for:
518
- scrape_params['waitFor'] = wait_for
519
- if timeout:
520
- scrape_params['timeout'] = timeout
521
- if location:
522
- scrape_params['location'] = location.dict(exclude_none=True)
523
- if mobile is not None:
524
- scrape_params['mobile'] = mobile
525
- if skip_tls_verification is not None:
526
- scrape_params['skipTlsVerification'] = skip_tls_verification
527
- if remove_base64_images is not None:
528
- scrape_params['removeBase64Images'] = remove_base64_images
529
- if block_ads is not None:
530
- scrape_params['blockAds'] = block_ads
531
- if proxy:
532
- scrape_params['proxy'] = proxy
533
- if extract is not None:
534
- extract = self._ensure_schema_dict(extract)
535
- if isinstance(extract, dict) and "schema" in extract:
536
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
537
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
538
- if json_options is not None:
539
- json_options = self._ensure_schema_dict(json_options)
540
- if isinstance(json_options, dict) and "schema" in json_options:
541
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
542
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
543
- if actions:
544
- scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
545
- if change_tracking_options:
546
- scrape_params['changeTrackingOptions'] = change_tracking_options if isinstance(change_tracking_options, dict) else change_tracking_options.dict(exclude_none=True)
547
-
548
- scrape_params.update(kwargs)
549
-
550
- if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
551
- scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
552
- if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
553
- scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
554
-
555
- # Make request
556
- response = requests.post(
557
- f'{self.api_url}/v1/scrape',
558
- headers=headers,
559
- json=scrape_params,
560
- timeout=(timeout + 5000 if timeout else None)
561
- )
562
-
563
- if response.status_code == 200:
564
- try:
565
- response_json = response.json()
566
- if response_json.get('success') and 'data' in response_json:
567
- return ScrapeResponse(**response_json['data'])
568
- elif "error" in response_json:
569
- raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
570
- else:
571
- raise Exception(f'Failed to scrape URL. Error: {response_json}')
572
- except ValueError:
573
- raise Exception('Failed to parse Firecrawl response as JSON.')
574
- else:
575
- self._handle_error(response, 'scrape URL')
576
-
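For reference, a minimal usage sketch of the scrape_url signature shown above, assuming the public FirecrawlApp entry point and a valid key in the FIRECRAWL_API_KEY environment variable (the URL is illustrative):

    from firecrawl import FirecrawlApp

    app = FirecrawlApp()  # picks up FIRECRAWL_API_KEY from the environment
    doc = app.scrape_url(
        "https://example.com",          # illustrative target URL
        formats=["markdown", "links"],  # request markdown plus discovered links
        only_main_content=True,
    )
    print(doc.markdown)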
577
- def search(
578
- self,
579
- query: str,
580
- *,
581
- limit: Optional[int] = None,
582
- tbs: Optional[str] = None,
583
- filter: Optional[str] = None,
584
- lang: Optional[str] = None,
585
- country: Optional[str] = None,
586
- location: Optional[str] = None,
587
- timeout: Optional[int] = None,
588
- scrape_options: Optional[ScrapeOptions] = None,
589
- **kwargs) -> SearchResponse:
590
- """
591
- Search for content using Firecrawl.
592
-
593
- Args:
594
- query (str): Search query string
595
- limit (Optional[int]): Max results (default: 5)
596
- tbs (Optional[str]): Time filter (e.g. "qdr:d")
597
- filter (Optional[str]): Custom result filter
598
- lang (Optional[str]): Language code (default: "en")
599
- country (Optional[str]): Country code (default: "us")
600
- location (Optional[str]): Geo-targeting
601
- timeout (Optional[int]): Request timeout in milliseconds
602
- scrape_options (Optional[ScrapeOptions]): Result scraping configuration
603
- **kwargs: Additional keyword arguments for future compatibility
604
-
605
- Returns:
606
- SearchResponse: Response containing:
607
- * success (bool): Whether request succeeded
608
- * data (List[FirecrawlDocument]): Search results
609
- * warning (Optional[str]): Warning message if any
610
- * error (Optional[str]): Error message if any
611
-
612
- Raises:
613
- Exception: If search fails or response cannot be parsed
614
- """
615
- # Validate any additional kwargs
616
- self._validate_kwargs(kwargs, "search")
617
-
618
- # Build search parameters
619
- search_params = {}
620
-
621
- # Add individual parameters
622
- if limit is not None:
623
- search_params['limit'] = limit
624
- if tbs is not None:
625
- search_params['tbs'] = tbs
626
- if filter is not None:
627
- search_params['filter'] = filter
628
- if lang is not None:
629
- search_params['lang'] = lang
630
- if country is not None:
631
- search_params['country'] = country
632
- if location is not None:
633
- search_params['location'] = location
634
- if timeout is not None:
635
- search_params['timeout'] = timeout
636
- if scrape_options is not None:
637
- search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
638
-
639
- # Add any additional kwargs
640
- search_params.update(kwargs)
641
-
642
- # Create final params object
643
- final_params = SearchParams(query=query, **search_params)
644
- params_dict = final_params.dict(exclude_none=True)
645
- params_dict['origin'] = f"python-sdk@{version}"
646
-
647
- # Make request
648
- response = requests.post(
649
- f"{self.api_url}/v1/search",
650
- headers={"Authorization": f"Bearer {self.api_key}"},
651
- json=params_dict
652
- )
653
-
654
- if response.status_code == 200:
655
- try:
656
- response_json = response.json()
657
- if response_json.get('success') and 'data' in response_json:
658
- return SearchResponse(**response_json)
659
- elif "error" in response_json:
660
- raise Exception(f'Search failed. Error: {response_json["error"]}')
661
- else:
662
- raise Exception(f'Search failed. Error: {response_json}')
663
- except ValueError:
664
- raise Exception('Failed to parse Firecrawl response as JSON.')
665
- else:
666
- self._handle_error(response, 'search')
667
-
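A similarly minimal sketch for the search method declared above, using only the query and limit parameters and the same setup assumptions as the scrape_url sketch:

    from firecrawl import FirecrawlApp

    app = FirecrawlApp()
    results = app.search("firecrawl python sdk", limit=3)  # returns a SearchResponse
    for item in results.data:
        print(item)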
668
- def crawl_url(
669
- self,
670
- url: str,
671
- *,
672
- include_paths: Optional[List[str]] = None,
673
- exclude_paths: Optional[List[str]] = None,
674
- max_depth: Optional[int] = None,
675
- max_discovery_depth: Optional[int] = None,
676
- limit: Optional[int] = None,
677
- allow_backward_links: Optional[bool] = None,
678
- allow_external_links: Optional[bool] = None,
679
- ignore_sitemap: Optional[bool] = None,
680
- scrape_options: Optional[ScrapeOptions] = None,
681
- webhook: Optional[Union[str, WebhookConfig]] = None,
682
- deduplicate_similar_urls: Optional[bool] = None,
683
- ignore_query_parameters: Optional[bool] = None,
684
- regex_on_full_url: Optional[bool] = None,
685
- delay: Optional[int] = None,
686
- poll_interval: Optional[int] = 2,
687
- idempotency_key: Optional[str] = None,
688
- **kwargs
689
- ) -> CrawlStatusResponse:
690
- """
691
- Crawl a website starting from a URL.
692
-
693
- Args:
694
- url (str): Target URL to start crawling from
695
- include_paths (Optional[List[str]]): Patterns of URLs to include
696
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
697
- max_depth (Optional[int]): Maximum crawl depth
698
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
699
- limit (Optional[int]): Maximum pages to crawl
700
- allow_backward_links (Optional[bool]): Follow parent directory links
701
- allow_external_links (Optional[bool]): Follow external domain links
702
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
703
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
704
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
705
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
706
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
707
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
708
- delay (Optional[int]): Delay in seconds between scrapes
709
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
710
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
711
- **kwargs: Additional parameters to pass to the API
712
-
713
- Returns:
714
- CrawlStatusResponse with:
715
- * Crawling status and progress
716
- * Crawled page contents
717
- * Success/error information
718
-
719
- Raises:
720
- Exception: If crawl fails
721
- """
722
- # Validate any additional kwargs
723
- self._validate_kwargs(kwargs, "crawl_url")
724
-
725
- crawl_params = {}
726
-
727
- # Add individual parameters
728
- if include_paths is not None:
729
- crawl_params['includePaths'] = include_paths
730
- if exclude_paths is not None:
731
- crawl_params['excludePaths'] = exclude_paths
732
- if max_depth is not None:
733
- crawl_params['maxDepth'] = max_depth
734
- if max_discovery_depth is not None:
735
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
736
- if limit is not None:
737
- crawl_params['limit'] = limit
738
- if allow_backward_links is not None:
739
- crawl_params['allowBackwardLinks'] = allow_backward_links
740
- if allow_external_links is not None:
741
- crawl_params['allowExternalLinks'] = allow_external_links
742
- if ignore_sitemap is not None:
743
- crawl_params['ignoreSitemap'] = ignore_sitemap
744
- if scrape_options is not None:
745
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
746
- if webhook is not None:
747
- crawl_params['webhook'] = webhook
748
- if deduplicate_similar_urls is not None:
749
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
750
- if ignore_query_parameters is not None:
751
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
752
- if regex_on_full_url is not None:
753
- crawl_params['regexOnFullURL'] = regex_on_full_url
754
- if delay is not None:
755
- crawl_params['delay'] = delay
756
-
757
- # Add any additional kwargs
758
- crawl_params.update(kwargs)
759
-
760
- # Create final params object
761
- final_params = CrawlParams(**crawl_params)
762
- params_dict = final_params.dict(exclude_none=True)
763
- params_dict['url'] = url
764
- params_dict['origin'] = f"python-sdk@{version}"
765
-
766
- # Make request
767
- headers = self._prepare_headers(idempotency_key)
768
- response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
769
-
770
- if response.status_code == 200:
771
- try:
772
- id = response.json().get('id')
773
- except:
774
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
775
- return self._monitor_job_status(id, headers, poll_interval)
776
- else:
777
- self._handle_error(response, 'start crawl job')
778
-
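crawl_url starts a crawl and polls until the job finishes; a hedged sketch using only parameters documented above, under the same setup assumptions:

    from firecrawl import FirecrawlApp

    app = FirecrawlApp()
    status = app.crawl_url(
        "https://example.com",
        limit=10,         # maximum pages to crawl
        max_depth=2,      # maximum crawl depth
        poll_interval=5,  # seconds between status checks
    )
    print(status.status, f"{status.completed}/{status.total} pages")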
779
- def async_crawl_url(
780
- self,
781
- url: str,
782
- *,
783
- include_paths: Optional[List[str]] = None,
784
- exclude_paths: Optional[List[str]] = None,
785
- max_depth: Optional[int] = None,
786
- max_discovery_depth: Optional[int] = None,
787
- limit: Optional[int] = None,
788
- allow_backward_links: Optional[bool] = None,
789
- allow_external_links: Optional[bool] = None,
790
- ignore_sitemap: Optional[bool] = None,
791
- scrape_options: Optional[ScrapeOptions] = None,
792
- webhook: Optional[Union[str, WebhookConfig]] = None,
793
- deduplicate_similar_urls: Optional[bool] = None,
794
- ignore_query_parameters: Optional[bool] = None,
795
- regex_on_full_url: Optional[bool] = None,
796
- delay: Optional[int] = None,
797
- idempotency_key: Optional[str] = None,
798
- **kwargs
799
- ) -> CrawlResponse:
800
- """
801
- Start an asynchronous crawl job.
802
-
803
- Args:
804
- url (str): Target URL to start crawling from
805
- include_paths (Optional[List[str]]): Patterns of URLs to include
806
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
807
- max_depth (Optional[int]): Maximum crawl depth
808
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
809
- limit (Optional[int]): Maximum pages to crawl
810
- allow_backward_links (Optional[bool]): Follow parent directory links
811
- allow_external_links (Optional[bool]): Follow external domain links
812
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
813
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
814
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
815
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
816
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
817
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
818
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
819
- **kwargs: Additional parameters to pass to the API
820
-
821
- Returns:
822
- CrawlResponse with:
823
- * success - Whether crawl started successfully
824
- * id - Unique identifier for the crawl job
825
- * url - Status check URL for the crawl
826
- * error - Error message if start failed
827
-
828
- Raises:
829
- Exception: If crawl initiation fails
830
- """
831
- # Validate any additional kwargs
832
- self._validate_kwargs(kwargs, "async_crawl_url")
833
-
834
- crawl_params = {}
835
-
836
- # Add individual parameters
837
- if include_paths is not None:
838
- crawl_params['includePaths'] = include_paths
839
- if exclude_paths is not None:
840
- crawl_params['excludePaths'] = exclude_paths
841
- if max_depth is not None:
842
- crawl_params['maxDepth'] = max_depth
843
- if max_discovery_depth is not None:
844
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
845
- if limit is not None:
846
- crawl_params['limit'] = limit
847
- if allow_backward_links is not None:
848
- crawl_params['allowBackwardLinks'] = allow_backward_links
849
- if allow_external_links is not None:
850
- crawl_params['allowExternalLinks'] = allow_external_links
851
- if ignore_sitemap is not None:
852
- crawl_params['ignoreSitemap'] = ignore_sitemap
853
- if scrape_options is not None:
854
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
855
- if webhook is not None:
856
- crawl_params['webhook'] = webhook
857
- if deduplicate_similar_urls is not None:
858
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
859
- if ignore_query_parameters is not None:
860
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
861
- if regex_on_full_url is not None:
862
- crawl_params['regexOnFullURL'] = regex_on_full_url
863
- if delay is not None:
864
- crawl_params['delay'] = delay
865
-
866
- # Add any additional kwargs
867
- crawl_params.update(kwargs)
868
-
869
- # Create final params object
870
- final_params = CrawlParams(**crawl_params)
871
- params_dict = final_params.dict(exclude_none=True)
872
- params_dict['url'] = url
873
- params_dict['origin'] = f"python-sdk@{version}"
874
-
875
- # Make request
876
- headers = self._prepare_headers(idempotency_key)
877
- response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
878
-
879
- if response.status_code == 200:
880
- try:
881
- return CrawlResponse(**response.json())
882
- except:
883
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
884
- else:
885
- self._handle_error(response, 'start crawl job')
886
-
887
- def check_crawl_status(self, id: str) -> CrawlStatusResponse:
888
- """
889
- Check the status and results of a crawl job.
890
-
891
- Args:
892
- id: Unique identifier for the crawl job
893
-
894
- Returns:
895
- CrawlStatusResponse containing:
896
-
897
- Status Information:
898
- * status - Current state (scraping/completed/failed/cancelled)
899
- * completed - Number of pages crawled
900
- * total - Total pages to crawl
901
- * creditsUsed - API credits consumed
902
- * expiresAt - Data expiration timestamp
903
-
904
- Results:
905
- * data - List of crawled documents
906
- * next - URL for next page of results (if paginated)
907
- * success - Whether status check succeeded
908
- * error - Error message if failed
909
-
910
- Raises:
911
- Exception: If status check fails
912
- """
913
- endpoint = f'/v1/crawl/{id}'
914
-
915
- headers = self._prepare_headers()
916
- response = self._get_request(f'{self.api_url}{endpoint}', headers)
917
- if response.status_code == 200:
918
- try:
919
- status_data = response.json()
920
- except:
921
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
922
- if status_data['status'] == 'completed':
923
- if 'data' in status_data:
924
- data = status_data['data']
925
- while 'next' in status_data:
926
- if len(status_data['data']) == 0:
927
- break
928
- next_url = status_data.get('next')
929
- if not next_url:
930
- logger.warning("Expected 'next' URL is missing.")
931
- break
932
- try:
933
- status_response = self._get_request(next_url, headers)
934
- if status_response.status_code != 200:
935
- logger.error(f"Failed to fetch next page: {status_response.status_code}")
936
- break
937
- try:
938
- next_data = status_response.json()
939
- except:
940
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
941
- data.extend(next_data.get('data', []))
942
- status_data = next_data
943
- except Exception as e:
944
- logger.error(f"Error during pagination request: {e}")
945
- break
946
- status_data['data'] = data
947
-
948
- response = {
949
- 'status': status_data.get('status'),
950
- 'total': status_data.get('total'),
951
- 'completed': status_data.get('completed'),
952
- 'creditsUsed': status_data.get('creditsUsed'),
953
- 'expiresAt': status_data.get('expiresAt'),
954
- 'data': status_data.get('data')
955
- }
956
-
957
- if 'error' in status_data:
958
- response['error'] = status_data['error']
959
-
960
- if 'next' in status_data:
961
- response['next'] = status_data['next']
962
-
963
- return CrawlStatusResponse(
964
- success=False if 'error' in status_data else True,
965
- **response
966
- )
967
- else:
968
- self._handle_error(response, 'check crawl status')
969
-
970
- def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
971
- """
972
- Returns information about crawl errors.
973
-
974
- Args:
975
- id (str): The ID of the crawl job
976
-
977
- Returns:
978
- CrawlErrorsResponse containing:
979
- * errors (List[Dict[str, str]]): List of errors with fields:
980
- - id (str): Error ID
981
- - timestamp (str): When the error occurred
982
- - url (str): URL that caused the error
983
- - error (str): Error message
984
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
985
-
986
- Raises:
987
- Exception: If error check fails
988
- """
989
- headers = self._prepare_headers()
990
- response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
991
- if response.status_code == 200:
992
- try:
993
- return CrawlErrorsResponse(**response.json())
994
- except:
995
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
996
- else:
997
- self._handle_error(response, "check crawl errors")
998
-
999
- def cancel_crawl(self, id: str) -> Dict[str, Any]:
1000
- """
1001
- Cancel an asynchronous crawl job.
1002
-
1003
- Args:
1004
- id (str): The ID of the crawl job to cancel
1005
-
1006
- Returns:
1007
- Dict[str, Any] containing:
1008
- * success (bool): Whether cancellation was successful
1009
- * error (str, optional): Error message if cancellation failed
1010
-
1011
- Raises:
1012
- Exception: If cancellation fails
1013
- """
1014
- headers = self._prepare_headers()
1015
- response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
1016
- if response.status_code == 200:
1017
- try:
1018
- return response.json()
1019
- except:
1020
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1021
- else:
1022
- self._handle_error(response, "cancel crawl job")
1023
-
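The asynchronous variant pairs with check_crawl_status and cancel_crawl; a sketch of that workflow under the same assumptions:

    from firecrawl import FirecrawlApp

    app = FirecrawlApp()
    job = app.async_crawl_url("https://example.com", limit=5)  # returns immediately
    if job.success and job.id:
        status = app.check_crawl_status(job.id)
        if status.status != "completed":
            app.cancel_crawl(job.id)  # stop the job early if it is no longer needed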
1024
- def crawl_url_and_watch(
1025
- self,
1026
- url: str,
1027
- *,
1028
- include_paths: Optional[List[str]] = None,
1029
- exclude_paths: Optional[List[str]] = None,
1030
- max_depth: Optional[int] = None,
1031
- max_discovery_depth: Optional[int] = None,
1032
- limit: Optional[int] = None,
1033
- allow_backward_links: Optional[bool] = None,
1034
- allow_external_links: Optional[bool] = None,
1035
- ignore_sitemap: Optional[bool] = None,
1036
- scrape_options: Optional[ScrapeOptions] = None,
1037
- webhook: Optional[Union[str, WebhookConfig]] = None,
1038
- deduplicate_similar_urls: Optional[bool] = None,
1039
- ignore_query_parameters: Optional[bool] = None,
1040
- regex_on_full_url: Optional[bool] = None,
1041
- idempotency_key: Optional[str] = None,
1042
- **kwargs
1043
- ) -> 'CrawlWatcher':
1044
- """
1045
- Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
1046
-
1047
- Args:
1048
- url (str): Target URL to start crawling from
1049
- include_paths (Optional[List[str]]): Patterns of URLs to include
1050
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
1051
- max_depth (Optional[int]): Maximum crawl depth
1052
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
1053
- limit (Optional[int]): Maximum pages to crawl
1054
- allow_backward_links (Optional[bool]): Follow parent directory links
1055
- allow_external_links (Optional[bool]): Follow external domain links
1056
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1057
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
1058
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
1059
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
1060
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
1061
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
1062
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1063
- **kwargs: Additional parameters to pass to the API
1064
-
1065
- Returns:
1066
- CrawlWatcher: An instance to monitor the crawl job via WebSocket
1067
-
1068
- Raises:
1069
- Exception: If crawl job fails to start
1070
- """
1071
- crawl_response = self.async_crawl_url(
1072
- url,
1073
- include_paths=include_paths,
1074
- exclude_paths=exclude_paths,
1075
- max_depth=max_depth,
1076
- max_discovery_depth=max_discovery_depth,
1077
- limit=limit,
1078
- allow_backward_links=allow_backward_links,
1079
- allow_external_links=allow_external_links,
1080
- ignore_sitemap=ignore_sitemap,
1081
- scrape_options=scrape_options,
1082
- webhook=webhook,
1083
- deduplicate_similar_urls=deduplicate_similar_urls,
1084
- ignore_query_parameters=ignore_query_parameters,
1085
- regex_on_full_url=regex_on_full_url,
1086
- idempotency_key=idempotency_key,
1087
- **kwargs
1088
- )
1089
- if crawl_response.success and crawl_response.id:
1090
- return CrawlWatcher(crawl_response.id, self)
1091
- else:
1092
- raise Exception("Crawl job failed to start")
1093
-
1094
- def map_url(
1095
- self,
1096
- url: str,
1097
- *,
1098
- search: Optional[str] = None,
1099
- ignore_sitemap: Optional[bool] = None,
1100
- include_subdomains: Optional[bool] = None,
1101
- sitemap_only: Optional[bool] = None,
1102
- limit: Optional[int] = None,
1103
- timeout: Optional[int] = None,
1104
- **kwargs) -> MapResponse:
1105
- """
1106
- Map and discover links from a URL.
1107
-
1108
- Args:
1109
- url (str): Target URL to map
1110
- search (Optional[str]): Filter pattern for URLs
1111
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1112
- include_subdomains (Optional[bool]): Include subdomain links
1113
- sitemap_only (Optional[bool]): Only use sitemap.xml
1114
- limit (Optional[int]): Maximum URLs to return
1115
- timeout (Optional[int]): Request timeout in milliseconds
1116
- **kwargs: Additional parameters to pass to the API
1117
-
1118
- Returns:
1119
- MapResponse: Response containing:
1120
- * success (bool): Whether request succeeded
1121
- * links (List[str]): Discovered URLs
1122
- * error (Optional[str]): Error message if any
1123
-
1124
- Raises:
1125
- Exception: If mapping fails or response cannot be parsed
1126
- """
1127
- # Validate any additional kwargs
1128
- self._validate_kwargs(kwargs, "map_url")
1129
-
1130
- # Build map parameters
1131
- map_params = {}
1132
-
1133
- # Add individual parameters
1134
- if search is not None:
1135
- map_params['search'] = search
1136
- if ignore_sitemap is not None:
1137
- map_params['ignoreSitemap'] = ignore_sitemap
1138
- if include_subdomains is not None:
1139
- map_params['includeSubdomains'] = include_subdomains
1140
- if sitemap_only is not None:
1141
- map_params['sitemapOnly'] = sitemap_only
1142
- if limit is not None:
1143
- map_params['limit'] = limit
1144
- if timeout is not None:
1145
- map_params['timeout'] = timeout
1146
-
1147
- # Add any additional kwargs
1148
- map_params.update(kwargs)
1149
-
1150
- # Create final params object
1151
- final_params = MapParams(**map_params)
1152
- params_dict = final_params.dict(exclude_none=True)
1153
- params_dict['url'] = url
1154
- params_dict['origin'] = f"python-sdk@{version}"
1155
-
1156
- # Make request
1157
- response = requests.post(
1158
- f"{self.api_url}/v1/map",
1159
- headers={"Authorization": f"Bearer {self.api_key}"},
1160
- json=params_dict
1161
- )
1162
-
1163
- if response.status_code == 200:
1164
- try:
1165
- response_json = response.json()
1166
- if response_json.get('success') and 'links' in response_json:
1167
- return MapResponse(**response_json)
1168
- elif "error" in response_json:
1169
- raise Exception(f'Map failed. Error: {response_json["error"]}')
1170
- else:
1171
- raise Exception(f'Map failed. Error: {response_json}')
1172
- except ValueError:
1173
- raise Exception('Failed to parse Firecrawl response as JSON.')
1174
- else:
1175
- self._handle_error(response, 'map')
1176
-
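map_url only discovers links rather than scraping page content; a short sketch with the parameters listed above:

    from firecrawl import FirecrawlApp

    app = FirecrawlApp()
    mapped = app.map_url("https://example.com", search="blog", limit=100)
    print(mapped.links)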
1177
- def batch_scrape_urls(
1178
- self,
1179
- urls: List[str],
1180
- *,
1181
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1182
- headers: Optional[Dict[str, str]] = None,
1183
- include_tags: Optional[List[str]] = None,
1184
- exclude_tags: Optional[List[str]] = None,
1185
- only_main_content: Optional[bool] = None,
1186
- wait_for: Optional[int] = None,
1187
- timeout: Optional[int] = None,
1188
- location: Optional[LocationConfig] = None,
1189
- mobile: Optional[bool] = None,
1190
- skip_tls_verification: Optional[bool] = None,
1191
- remove_base64_images: Optional[bool] = None,
1192
- block_ads: Optional[bool] = None,
1193
- proxy: Optional[Literal["basic", "stealth"]] = None,
1194
- extract: Optional[JsonConfig] = None,
1195
- json_options: Optional[JsonConfig] = None,
1196
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1197
- agent: Optional[AgentOptions] = None,
1198
- poll_interval: Optional[int] = 2,
1199
- idempotency_key: Optional[str] = None,
1200
- **kwargs
1201
- ) -> BatchScrapeStatusResponse:
1202
- """
1203
- Batch scrape multiple URLs and monitor until completion.
1204
-
1205
- Args:
1206
- urls (List[str]): URLs to scrape
1207
- formats (Optional[List[Literal]]): Content formats to retrieve
1208
- headers (Optional[Dict[str, str]]): Custom HTTP headers
1209
- include_tags (Optional[List[str]]): HTML tags to include
1210
- exclude_tags (Optional[List[str]]): HTML tags to exclude
1211
- only_main_content (Optional[bool]): Extract main content only
1212
- wait_for (Optional[int]): Wait time in milliseconds
1213
- timeout (Optional[int]): Request timeout in milliseconds
1214
- location (Optional[LocationConfig]): Location configuration
1215
- mobile (Optional[bool]): Use mobile user agent
1216
- skip_tls_verification (Optional[bool]): Skip TLS verification
1217
- remove_base64_images (Optional[bool]): Remove base64 encoded images
1218
- block_ads (Optional[bool]): Block advertisements
1219
- proxy (Optional[Literal]): Proxy type to use
1220
- extract (Optional[JsonConfig]): Content extraction config
1221
- json_options (Optional[JsonConfig]): JSON extraction config
1222
- actions (Optional[List[Union]]): Actions to perform
1223
- agent (Optional[AgentOptions]): Agent configuration
1224
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
1225
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1226
- **kwargs: Additional parameters to pass to the API
1227
-
1228
- Returns:
1229
- BatchScrapeStatusResponse with:
1230
- * Scraping status and progress
1231
- * Scraped content for each URL
1232
- * Success/error information
1233
-
1234
- Raises:
1235
- Exception: If batch scrape fails
1236
- """
1237
- # Validate any additional kwargs
1238
- self._validate_kwargs(kwargs, "batch_scrape_urls")
1239
-
1240
- scrape_params = {}
1241
-
1242
- # Add individual parameters
1243
- if formats is not None:
1244
- scrape_params['formats'] = formats
1245
- if headers is not None:
1246
- scrape_params['headers'] = headers
1247
- if include_tags is not None:
1248
- scrape_params['includeTags'] = include_tags
1249
- if exclude_tags is not None:
1250
- scrape_params['excludeTags'] = exclude_tags
1251
- if only_main_content is not None:
1252
- scrape_params['onlyMainContent'] = only_main_content
1253
- if wait_for is not None:
1254
- scrape_params['waitFor'] = wait_for
1255
- if timeout is not None:
1256
- scrape_params['timeout'] = timeout
1257
- if location is not None:
1258
- scrape_params['location'] = location.dict(exclude_none=True)
1259
- if mobile is not None:
1260
- scrape_params['mobile'] = mobile
1261
- if skip_tls_verification is not None:
1262
- scrape_params['skipTlsVerification'] = skip_tls_verification
1263
- if remove_base64_images is not None:
1264
- scrape_params['removeBase64Images'] = remove_base64_images
1265
- if block_ads is not None:
1266
- scrape_params['blockAds'] = block_ads
1267
- if proxy is not None:
1268
- scrape_params['proxy'] = proxy
1269
- if extract is not None:
1270
- extract = self._ensure_schema_dict(extract)
1271
- if isinstance(extract, dict) and "schema" in extract:
1272
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
1273
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1274
- if json_options is not None:
1275
- json_options = self._ensure_schema_dict(json_options)
1276
- if isinstance(json_options, dict) and "schema" in json_options:
1277
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1278
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1279
- if actions is not None:
1280
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1281
- if agent is not None:
1282
- scrape_params['agent'] = agent.dict(exclude_none=True)
1283
-
1284
- # Add any additional kwargs
1285
- scrape_params.update(kwargs)
1286
-
1287
- # Create final params object
1288
- final_params = ScrapeParams(**scrape_params)
1289
- params_dict = final_params.dict(exclude_none=True)
1290
- params_dict['urls'] = urls
1291
- params_dict['origin'] = f"python-sdk@{version}"
1292
-
1293
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1294
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1295
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1296
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1297
-
1298
- # Make request
1299
- headers = self._prepare_headers(idempotency_key)
1300
- response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1301
-
1302
- if response.status_code == 200:
1303
- try:
1304
- id = response.json().get('id')
1305
- except:
1306
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1307
- return self._monitor_job_status(id, headers, poll_interval)
1308
- else:
1309
- self._handle_error(response, 'start batch scrape job')
1310
-
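batch_scrape_urls follows the same blocking pattern as crawl_url but over an explicit URL list; a sketch under the same assumptions:

    from firecrawl import FirecrawlApp

    app = FirecrawlApp()
    batch = app.batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
    )
    for doc in batch.data:
        print(doc.metadata)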
1311
- def async_batch_scrape_urls(
1312
- self,
1313
- urls: List[str],
1314
- *,
1315
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1316
- headers: Optional[Dict[str, str]] = None,
1317
- include_tags: Optional[List[str]] = None,
1318
- exclude_tags: Optional[List[str]] = None,
1319
- only_main_content: Optional[bool] = None,
1320
- wait_for: Optional[int] = None,
1321
- timeout: Optional[int] = None,
1322
- location: Optional[LocationConfig] = None,
1323
- mobile: Optional[bool] = None,
1324
- skip_tls_verification: Optional[bool] = None,
1325
- remove_base64_images: Optional[bool] = None,
1326
- block_ads: Optional[bool] = None,
1327
- proxy: Optional[Literal["basic", "stealth"]] = None,
1328
- extract: Optional[JsonConfig] = None,
1329
- json_options: Optional[JsonConfig] = None,
1330
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1331
- agent: Optional[AgentOptions] = None,
1332
- idempotency_key: Optional[str] = None,
1333
- **kwargs
1334
- ) -> BatchScrapeResponse:
1335
- """
1336
- Initiate a batch scrape job asynchronously.
1337
-
1338
- Args:
1339
- urls (List[str]): URLs to scrape
1340
- formats (Optional[List[Literal]]): Content formats to retrieve
1341
- headers (Optional[Dict[str, str]]): Custom HTTP headers
1342
- include_tags (Optional[List[str]]): HTML tags to include
1343
- exclude_tags (Optional[List[str]]): HTML tags to exclude
1344
- only_main_content (Optional[bool]): Extract main content only
1345
- wait_for (Optional[int]): Wait time in milliseconds
1346
- timeout (Optional[int]): Request timeout in milliseconds
1347
- location (Optional[LocationConfig]): Location configuration
1348
- mobile (Optional[bool]): Use mobile user agent
1349
- skip_tls_verification (Optional[bool]): Skip TLS verification
1350
- remove_base64_images (Optional[bool]): Remove base64 encoded images
1351
- block_ads (Optional[bool]): Block advertisements
1352
- proxy (Optional[Literal]): Proxy type to use
1353
- extract (Optional[JsonConfig]): Content extraction config
1354
- json_options (Optional[JsonConfig]): JSON extraction config
1355
- actions (Optional[List[Union]]): Actions to perform
1356
- agent (Optional[AgentOptions]): Agent configuration
1357
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1358
- **kwargs: Additional parameters to pass to the API
1359
-
1360
- Returns:
1361
- BatchScrapeResponse with:
1362
- * success - Whether job started successfully
1363
- * id - Unique identifier for the job
1364
- * url - Status check URL
1365
- * error - Error message if start failed
1366
-
1367
- Raises:
1368
- Exception: If job initiation fails
1369
- """
1370
- # Validate any additional kwargs
1371
- self._validate_kwargs(kwargs, "async_batch_scrape_urls")
1372
-
1373
- scrape_params = {}
1374
-
1375
- # Add individual parameters
1376
- if formats is not None:
1377
- scrape_params['formats'] = formats
1378
- if headers is not None:
1379
- scrape_params['headers'] = headers
1380
- if include_tags is not None:
1381
- scrape_params['includeTags'] = include_tags
1382
- if exclude_tags is not None:
1383
- scrape_params['excludeTags'] = exclude_tags
1384
- if only_main_content is not None:
1385
- scrape_params['onlyMainContent'] = only_main_content
1386
- if wait_for is not None:
1387
- scrape_params['waitFor'] = wait_for
1388
- if timeout is not None:
1389
- scrape_params['timeout'] = timeout
1390
- if location is not None:
1391
- scrape_params['location'] = location.dict(exclude_none=True)
1392
- if mobile is not None:
1393
- scrape_params['mobile'] = mobile
1394
- if skip_tls_verification is not None:
1395
- scrape_params['skipTlsVerification'] = skip_tls_verification
1396
- if remove_base64_images is not None:
1397
- scrape_params['removeBase64Images'] = remove_base64_images
1398
- if block_ads is not None:
1399
- scrape_params['blockAds'] = block_ads
1400
- if proxy is not None:
1401
- scrape_params['proxy'] = proxy
1402
- if extract is not None:
1403
- extract = self._ensure_schema_dict(extract)
1404
- if isinstance(extract, dict) and "schema" in extract:
1405
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
1406
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1407
- if json_options is not None:
1408
- json_options = self._ensure_schema_dict(json_options)
1409
- if isinstance(json_options, dict) and "schema" in json_options:
1410
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1411
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1412
- if actions is not None:
1413
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1414
- if agent is not None:
1415
- scrape_params['agent'] = agent.dict(exclude_none=True)
1416
-
1417
- # Add any additional kwargs
1418
- scrape_params.update(kwargs)
1419
-
1420
- # Create final params object
1421
- final_params = ScrapeParams(**scrape_params)
1422
- params_dict = final_params.dict(exclude_none=True)
1423
- params_dict['urls'] = urls
1424
- params_dict['origin'] = f"python-sdk@{version}"
1425
-
1426
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1427
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1428
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1429
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1430
-
1431
- # Make request
1432
- headers = self._prepare_headers(idempotency_key)
1433
- response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1434
-
1435
- if response.status_code == 200:
1436
- try:
1437
- return BatchScrapeResponse(**response.json())
1438
- except:
1439
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1440
- else:
1441
- self._handle_error(response, 'start batch scrape job')
1442
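
# The asynchronous variant only starts the job; pairing it with
# check_batch_scrape_status gives a manual polling loop. A sketch with
# placeholder key and URLs:
import time
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
started = app.async_batch_scrape_urls(
    ["https://example.com", "https://example.com/pricing"],
    formats=["markdown"],
)
if started.success:
    while True:
        status = app.check_batch_scrape_status(started.id)
        if status.status == "completed" or not status.success:
            break
        time.sleep(5)
    print(status.status, len(status.data or []), "documents scraped")
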
-
1443
- def batch_scrape_urls_and_watch(
1444
- self,
1445
- urls: List[str],
1446
- *,
1447
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1448
- headers: Optional[Dict[str, str]] = None,
1449
- include_tags: Optional[List[str]] = None,
1450
- exclude_tags: Optional[List[str]] = None,
1451
- only_main_content: Optional[bool] = None,
1452
- wait_for: Optional[int] = None,
1453
- timeout: Optional[int] = None,
1454
- location: Optional[LocationConfig] = None,
1455
- mobile: Optional[bool] = None,
1456
- skip_tls_verification: Optional[bool] = None,
1457
- remove_base64_images: Optional[bool] = None,
1458
- block_ads: Optional[bool] = None,
1459
- proxy: Optional[Literal["basic", "stealth"]] = None,
1460
- extract: Optional[JsonConfig] = None,
1461
- json_options: Optional[JsonConfig] = None,
1462
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1463
- agent: Optional[AgentOptions] = None,
1464
- idempotency_key: Optional[str] = None,
1465
- **kwargs
1466
- ) -> 'CrawlWatcher':
1467
- """
1468
- Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
1469
-
1470
- Args:
1471
- urls (List[str]): URLs to scrape
1472
- formats (Optional[List[Literal]]): Content formats to retrieve
1473
- headers (Optional[Dict[str, str]]): Custom HTTP headers
1474
- include_tags (Optional[List[str]]): HTML tags to include
1475
- exclude_tags (Optional[List[str]]): HTML tags to exclude
1476
- only_main_content (Optional[bool]): Extract main content only
1477
- wait_for (Optional[int]): Wait time in milliseconds
1478
- timeout (Optional[int]): Request timeout in milliseconds
1479
- location (Optional[LocationConfig]): Location configuration
1480
- mobile (Optional[bool]): Use mobile user agent
1481
- skip_tls_verification (Optional[bool]): Skip TLS verification
1482
- remove_base64_images (Optional[bool]): Remove base64 encoded images
1483
- block_ads (Optional[bool]): Block advertisements
1484
- proxy (Optional[Literal]): Proxy type to use
1485
- extract (Optional[JsonConfig]): Content extraction config
1486
- json_options (Optional[JsonConfig]): JSON extraction config
1487
- actions (Optional[List[Union]]): Actions to perform
1488
- agent (Optional[AgentOptions]): Agent configuration
1489
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1490
- **kwargs: Additional parameters to pass to the API
1491
-
1492
- Returns:
1493
- CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
1494
-
1495
- Raises:
1496
- Exception: If batch scrape job fails to start
1497
- """
1498
- # Validate any additional kwargs
1499
- self._validate_kwargs(kwargs, "batch_scrape_urls_and_watch")
1500
-
1501
- scrape_params = {}
1502
-
1503
- # Add individual parameters
1504
- if formats is not None:
1505
- scrape_params['formats'] = formats
1506
- if headers is not None:
1507
- scrape_params['headers'] = headers
1508
- if include_tags is not None:
1509
- scrape_params['includeTags'] = include_tags
1510
- if exclude_tags is not None:
1511
- scrape_params['excludeTags'] = exclude_tags
1512
- if only_main_content is not None:
1513
- scrape_params['onlyMainContent'] = only_main_content
1514
- if wait_for is not None:
1515
- scrape_params['waitFor'] = wait_for
1516
- if timeout is not None:
1517
- scrape_params['timeout'] = timeout
1518
- if location is not None:
1519
- scrape_params['location'] = location.dict(exclude_none=True)
1520
- if mobile is not None:
1521
- scrape_params['mobile'] = mobile
1522
- if skip_tls_verification is not None:
1523
- scrape_params['skipTlsVerification'] = skip_tls_verification
1524
- if remove_base64_images is not None:
1525
- scrape_params['removeBase64Images'] = remove_base64_images
1526
- if block_ads is not None:
1527
- scrape_params['blockAds'] = block_ads
1528
- if proxy is not None:
1529
- scrape_params['proxy'] = proxy
1530
- if extract is not None:
1531
- extract = self._ensure_schema_dict(extract)
1532
- if isinstance(extract, dict) and "schema" in extract:
1533
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
1534
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1535
- if json_options is not None:
1536
- json_options = self._ensure_schema_dict(json_options)
1537
- if isinstance(json_options, dict) and "schema" in json_options:
1538
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1539
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1540
- if actions is not None:
1541
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1542
- if agent is not None:
1543
- scrape_params['agent'] = agent.dict(exclude_none=True)
1544
-
1545
- # Add any additional kwargs
1546
- scrape_params.update(kwargs)
1547
-
1548
- # Create final params object
1549
- final_params = ScrapeParams(**scrape_params)
1550
- params_dict = final_params.dict(exclude_none=True)
1551
- params_dict['urls'] = urls
1552
- params_dict['origin'] = f"python-sdk@{version}"
1553
-
1554
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1555
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1556
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1557
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1558
-
1559
- # Make request
1560
- headers = self._prepare_headers(idempotency_key)
1561
- response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1562
-
1563
- if response.status_code == 200:
1564
- try:
1565
- crawl_response = BatchScrapeResponse(**response.json())
1566
- if crawl_response.success and crawl_response.id:
1567
- return CrawlWatcher(crawl_response.id, self)
1568
- else:
1569
- raise Exception("Batch scrape job failed to start")
1570
- except:
1571
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1572
- else:
1573
- self._handle_error(response, 'start batch scrape job')
1574
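
# The watch variant returns the CrawlWatcher defined near the bottom of this
# module, which streams events over WebSocket. A sketch with placeholder key
# and URLs; CrawlWatcher.connect() is a coroutine, hence asyncio.run:
import asyncio
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
watcher = app.batch_scrape_urls_and_watch(
    ["https://example.com", "https://example.com/docs"],
    formats=["markdown"],
)
watcher.add_event_listener("document", lambda event: print("scraped one page of job", event["id"]))
watcher.add_event_listener("done", lambda event: print("finished with", len(event["data"]), "documents"))
asyncio.run(watcher.connect())  # listens until a 'done' or 'error' message arrives
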
-
1575
- def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
1576
- """
1577
- Check the status of a batch scrape job using the Firecrawl API.
1578
-
1579
- Args:
1580
- id (str): The ID of the batch scrape job.
1581
-
1582
- Returns:
1583
- BatchScrapeStatusResponse: The status of the batch scrape job.
1584
-
1585
- Raises:
1586
- Exception: If the status check request fails.
1587
- """
1588
- endpoint = f'/v1/batch/scrape/{id}'
1589
-
1590
- headers = self._prepare_headers()
1591
- response = self._get_request(f'{self.api_url}{endpoint}', headers)
1592
- if response.status_code == 200:
1593
- try:
1594
- status_data = response.json()
1595
- except:
1596
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1597
- if status_data['status'] == 'completed':
1598
- if 'data' in status_data:
1599
- data = status_data['data']
1600
- while 'next' in status_data:
1601
- if len(status_data['data']) == 0:
1602
- break
1603
- next_url = status_data.get('next')
1604
- if not next_url:
1605
- logger.warning("Expected 'next' URL is missing.")
1606
- break
1607
- try:
1608
- status_response = self._get_request(next_url, headers)
1609
- if status_response.status_code != 200:
1610
- logger.error(f"Failed to fetch next page: {status_response.status_code}")
1611
- break
1612
- try:
1613
- next_data = status_response.json()
1614
- except:
1615
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1616
- data.extend(next_data.get('data', []))
1617
- status_data = next_data
1618
- except Exception as e:
1619
- logger.error(f"Error during pagination request: {e}")
1620
- break
1621
- status_data['data'] = data
1622
-
1623
- return BatchScrapeStatusResponse(**{
1624
- 'success': False if 'error' in status_data else True,
1625
- 'status': status_data.get('status'),
1626
- 'total': status_data.get('total'),
1627
- 'completed': status_data.get('completed'),
1628
- 'creditsUsed': status_data.get('creditsUsed'),
1629
- 'expiresAt': status_data.get('expiresAt'),
1630
- 'data': status_data.get('data'),
1631
- 'next': status_data.get('next'),
1632
- 'error': status_data.get('error')
1633
- })
1634
- else:
1635
- self._handle_error(response, 'check batch scrape status')
1636

-
1637
- def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
1638
- """
1639
- Returns information about batch scrape errors.
1640
-
1641
- Args:
1642
- id (str): The ID of the batch scrape job.
1643
-
1644
- Returns:
1645
- CrawlErrorsResponse containing:
1646
- * errors (List[Dict[str, str]]): List of errors with fields:
1647
- * id (str): Error ID
1648
- * timestamp (str): When the error occurred
1649
- * url (str): URL that caused the error
1650
- * error (str): Error message
1651
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1652
-
1653
- Raises:
1654
- Exception: If the error check request fails
1655
- """
1656
- headers = self._prepare_headers()
1657
- response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
1658
- if response.status_code == 200:
1659
- try:
1660
- return CrawlErrorsResponse(**response.json())
1661
- except:
1662
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1663
- else:
1664
- self._handle_error(response, "check batch scrape errors")
1665
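
# A small sketch of reading the error report for a finished or failing batch
# job; the job ID is a placeholder and field names follow the docstring above.
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
report = app.check_batch_scrape_errors("batch-job-id")
for err in report.errors:
    print(err.get("url"), "->", err.get("error"))
print("Blocked by robots.txt:", report.robotsBlocked)
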
-
1666
- def extract(
1667
- self,
1668
- urls: Optional[List[str]] = None,
1669
- *,
1670
- prompt: Optional[str] = None,
1671
- schema: Optional[Any] = None,
1672
- system_prompt: Optional[str] = None,
1673
- allow_external_links: Optional[bool] = False,
1674
- enable_web_search: Optional[bool] = False,
1675
- show_sources: Optional[bool] = False,
1676
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1677
- """
1678
- Extract structured information from URLs.
1679
-
1680
- Args:
1681
- urls (Optional[List[str]]): URLs to extract from
1682
- prompt (Optional[str]): Custom extraction prompt
1683
- schema (Optional[Any]): JSON schema/Pydantic model
1684
- system_prompt (Optional[str]): System context
1685
- allow_external_links (Optional[bool]): Follow external links
1686
- enable_web_search (Optional[bool]): Enable web search
1687
- show_sources (Optional[bool]): Include source URLs
1688
- agent (Optional[Dict[str, Any]]): Agent configuration
1689
-
1690
- Returns:
1691
- ExtractResponse[Any] with:
1692
- * success (bool): Whether request succeeded
1693
- * data (Optional[Any]): Extracted data matching schema
1694
- * error (Optional[str]): Error message if any
1695
-
1696
- Raises:
1697
- ValueError: If prompt/schema missing or extraction fails
1698
- """
1699
- headers = self._prepare_headers()
1700
-
1701
- if not prompt and not schema:
1702
- raise ValueError("Either prompt or schema is required")
1703
-
1704
- if not urls and not prompt:
1705
- raise ValueError("Either urls or prompt is required")
1706
-
1707
- if schema:
1708
- schema = self._ensure_schema_dict(schema)
1709
-
1710
- request_data = {
1711
- 'urls': urls or [],
1712
- 'allowExternalLinks': allow_external_links,
1713
- 'enableWebSearch': enable_web_search,
1714
- 'showSources': show_sources,
1715
- 'schema': schema,
1716
- 'origin': f'python-sdk@{get_version()}'
1717
- }
1718
-
1719
- # Only add prompt and systemPrompt if they exist
1720
- if prompt:
1721
- request_data['prompt'] = prompt
1722
- if system_prompt:
1723
- request_data['systemPrompt'] = system_prompt
1724
-
1725
- if agent:
1726
- request_data['agent'] = agent
1727
-
1728
- try:
1729
- # Send the initial extract request
1730
- response = self._post_request(
1731
- f'{self.api_url}/v1/extract',
1732
- request_data,
1733
- headers
1734
- )
1735
- if response.status_code == 200:
1736
- try:
1737
- data = response.json()
1738
- except:
1739
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1740
- if data['success']:
1741
- job_id = data.get('id')
1742
- if not job_id:
1743
- raise Exception('Job ID not returned from extract request.')
1744
-
1745
- # Poll for the extract status
1746
- while True:
1747
- status_response = self._get_request(
1748
- f'{self.api_url}/v1/extract/{job_id}',
1749
- headers
1750
- )
1751
- if status_response.status_code == 200:
1752
- try:
1753
- status_data = status_response.json()
1754
- except:
1755
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1756
- if status_data['status'] == 'completed':
1757
- return ExtractResponse(**status_data)
1758
- elif status_data['status'] in ['failed', 'cancelled']:
1759
- raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
1760
- else:
1761
- self._handle_error(status_response, "extract-status")
1762
-
1763
- time.sleep(2) # Polling interval
1764
- else:
1765
- raise Exception(f'Failed to extract. Error: {data["error"]}')
1766
- else:
1767
- self._handle_error(response, "extract")
1768
- except Exception as e:
1769
- raise ValueError(str(e), 500)
1770
-
1771
- return ExtractResponse(success=False, error="Internal server error.")
1772
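
# extract() accepts either a plain JSON schema dict or a Pydantic model class;
# _ensure_schema_dict (defined further below) converts the class into a schema
# dict before the request is sent. A sketch with placeholder URL, key, and model:
from pydantic import BaseModel
from firecrawl import FirecrawlApp

class ArticleInfo(BaseModel):
    title: str
    author: str

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
result = app.extract(
    ["https://example.com/blog/post"],
    prompt="Extract the article title and author.",
    schema=ArticleInfo,
)
if result.success:
    print(result.data)
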
-
1773
- def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
1774
- """
1775
- Retrieve the status of an extract job.
1776
-
1777
- Args:
1778
- job_id (str): The ID of the extract job.
1779
-
1780
- Returns:
1781
- ExtractResponse[Any]: The status of the extract job.
1782
-
1783
- Raises:
1784
- ValueError: If there is an error retrieving the status.
1785
- """
1786
- headers = self._prepare_headers()
1787
- try:
1788
- response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
1789
- if response.status_code == 200:
1790
- try:
1791
- return ExtractResponse(**response.json())
1792
- except:
1793
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1794
- else:
1795
- self._handle_error(response, "get extract status")
1796
- except Exception as e:
1797
- raise ValueError(str(e), 500)
1798
-
1799
- def async_extract(
1800
- self,
1801
- urls: Optional[List[str]] = None,
1802
- *,
1803
- prompt: Optional[str] = None,
1804
- schema: Optional[Any] = None,
1805
- system_prompt: Optional[str] = None,
1806
- allow_external_links: Optional[bool] = False,
1807
- enable_web_search: Optional[bool] = False,
1808
- show_sources: Optional[bool] = False,
1809
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1810
- """
1811
- Initiate an asynchronous extract job.
1812
-
1813
- Args:
1814
- urls (Optional[List[str]]): URLs to extract information from
1815
- prompt (Optional[str]): Custom extraction prompt
1816
- schema (Optional[Any]): JSON schema/Pydantic model
1817
- system_prompt (Optional[str]): System context
1818
- allow_external_links (Optional[bool]): Follow external links
1819
- enable_web_search (Optional[bool]): Enable web search
1820
- show_sources (Optional[bool]): Include source URLs
1821
- agent (Optional[Dict[str, Any]]): Agent configuration
1822
-
1824
- Returns:
1825
- ExtractResponse[Any] with:
1826
- * success (bool): Whether request succeeded
1827
- * data (Optional[Any]): Extracted data matching schema
1828
- * error (Optional[str]): Error message if any
1829
-
1830
- Raises:
1831
- ValueError: If job initiation fails
1832
- """
1833
- headers = self._prepare_headers()
1834
-
1835
- if schema:
1837
- schema = self._ensure_schema_dict(schema)
1838
-
1839
- request_data = {
1840
- 'urls': urls,
1841
- 'allowExternalLinks': allow_external_links,
1842
- 'enableWebSearch': enable_web_search,
1843
- 'showSources': show_sources,
1844
- 'schema': schema,
1845
- 'origin': f'python-sdk@{version}'
1846
- }
1847
-
1848
- if prompt:
1849
- request_data['prompt'] = prompt
1850
- if system_prompt:
1851
- request_data['systemPrompt'] = system_prompt
1852
- if agent:
1853
- request_data['agent'] = agent
1854
-
1855
- try:
1856
- response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
1857
- if response.status_code == 200:
1858
- try:
1859
- return ExtractResponse(**response.json())
1860
- except:
1861
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1862
- else:
1863
- self._handle_error(response, "async extract")
1864
- except Exception as e:
1865
- raise ValueError(str(e), 500)
1866
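
# async_extract only submits the job; get_extract_status retrieves it later.
# A polling sketch with placeholder values, assuming ExtractResponse exposes
# the id and status fields referenced in the code above:
import time
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
job = app.async_extract(
    ["https://example.com/team"],
    prompt="List the people on this page with their roles.",
)
while True:
    status = app.get_extract_status(job.id)
    if status.status in ("completed", "failed", "cancelled"):
        break
    time.sleep(2)
print(status.status, status.data)
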
-
1867
- def generate_llms_text(
1868
- self,
1869
- url: str,
1870
- *,
1871
- max_urls: Optional[int] = None,
1872
- show_full_text: Optional[bool] = None,
1873
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
1874
- """
1875
- Generate LLMs.txt for a given URL and poll until completion.
1876
-
1877
- Args:
1878
- url (str): Target URL to generate LLMs.txt from
1879
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
1880
- show_full_text (Optional[bool]): Include full text in output (default: False)
1881
- experimental_stream (Optional[bool]): Enable experimental streaming
1882
-
1883
- Returns:
1884
- GenerateLLMsTextStatusResponse with:
1885
- * Generated LLMs.txt content
1886
- * Full version if requested
1887
- * Generation status
1888
- * Success/error information
1889
-
1890
- Raises:
1891
- Exception: If generation fails
1892
- """
1893
- params = GenerateLLMsTextParams(
1894
- maxUrls=max_urls,
1895
- showFullText=show_full_text,
1896
- __experimental_stream=experimental_stream
1897
- )
1898
-
1899
- response = self.async_generate_llms_text(
1900
- url,
1901
- max_urls=max_urls,
1902
- show_full_text=show_full_text,
1903
- experimental_stream=experimental_stream
1904
- )
1905
-
1906
- if not response.success or not response.id:
1907
- return GenerateLLMsTextStatusResponse(
1908
- success=False,
1909
- error='Failed to start LLMs.txt generation',
1910
- status='failed',
1911
- expiresAt=''
1912
- )
1913
-
1914
- job_id = response.id
1915
- while True:
1916
- status = self.check_generate_llms_text_status(job_id)
1917
-
1918
- if status.status == 'completed':
1919
- return status
1920
- elif status.status == 'failed':
1921
- return status
1922
- elif status.status != 'processing':
1923
- return GenerateLLMsTextStatusResponse(
1924
- success=False,
1925
- error='LLMs.txt generation job terminated unexpectedly',
1926
- status='failed',
1927
- expiresAt=''
1928
- )
1929
-
1930
- time.sleep(2) # Polling interval
1931
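
# A sketch of the blocking LLMs.txt generation helper with placeholder values;
# per the docstring above, the generated content is returned on the data field.
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
result = app.generate_llms_text("https://example.com", max_urls=5)
if result.success and result.status == "completed":
    print(result.data)
else:
    print("generation failed:", result.error)
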
-
1932
- def async_generate_llms_text(
1933
- self,
1934
- url: str,
1935
- *,
1936
- max_urls: Optional[int] = None,
1937
- show_full_text: Optional[bool] = None,
1938
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
1939
- """
1940
- Initiate an asynchronous LLMs.txt generation operation.
1941
-
1942
- Args:
1943
- url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
1944
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
1945
- show_full_text (Optional[bool]): Include full text in output (default: False)
1946
- experimental_stream (Optional[bool]): Enable experimental streaming
1947
-
1948
- Returns:
1949
- GenerateLLMsTextResponse: A response containing:
1950
- * success (bool): Whether the generation initiation was successful
1951
- * id (str): The unique identifier for the generation job
1952
- * error (str, optional): Error message if initiation failed
1953
-
1954
- Raises:
1955
- Exception: If the generation job initiation fails.
1956
- """
1957
- params = GenerateLLMsTextParams(
1958
- maxUrls=max_urls,
1959
- showFullText=show_full_text,
1960
- __experimental_stream=experimental_stream
1961
- )
1962
-
1963
- headers = self._prepare_headers()
1964
- json_data = {'url': url, **params.dict(exclude_none=True)}
1965
- json_data['origin'] = f"python-sdk@{version}"
1966
-
1967
- try:
1968
- req = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
1969
- response = req.json()
1970
- print("json_data", json_data)
1971
- print("response", response)
1972
- if response.get('success'):
1973
- try:
1974
- return GenerateLLMsTextResponse(**response)
1975
- except:
1976
- raise Exception('Failed to parse Firecrawl response as JSON.')
1977
- else:
1978
- self._handle_error(response, 'start LLMs.txt generation')
1979
- except Exception as e:
1980
- raise ValueError(str(e))
1981
-
1982
- return GenerateLLMsTextResponse(
1983
- success=False,
1984
- error='Internal server error'
1985
- )
1986
-
1987
- def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
1988
- """
1989
- Check the status of a LLMs.txt generation operation.
1990
-
1991
- Args:
1992
- id (str): The unique identifier of the LLMs.txt generation job to check status for.
1993
-
1994
- Returns:
1995
- GenerateLLMsTextStatusResponse: A response containing:
1996
- * success (bool): Whether the generation was successful
1997
- * status (str): Status of generation ("processing", "completed", "failed")
1998
- * data (Dict[str, str], optional): Generated text with fields:
1999
- * llmstxt (str): Generated LLMs.txt content
2000
- * llmsfulltxt (str, optional): Full version if requested
2001
- * error (str, optional): Error message if generation failed
2002
- * expiresAt (str): When the generated data expires
2003
-
2004
- Raises:
2005
- Exception: If the status check fails.
2006
- """
2007
- headers = self._prepare_headers()
2008
- try:
2009
- response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers)
2010
- if response.status_code == 200:
2011
- try:
2012
- json_data = response.json()
2013
- return GenerateLLMsTextStatusResponse(**json_data)
2014
- except Exception as e:
2015
- raise Exception(f'Failed to parse Firecrawl response as GenerateLLMsTextStatusResponse: {str(e)}')
2016
- elif response.status_code == 404:
2017
- raise Exception('LLMs.txt generation job not found')
2018
- else:
2019
- self._handle_error(response, 'check LLMs.txt generation status')
2020
- except Exception as e:
2021
- raise ValueError(str(e))
2022
-
2023
- return GenerateLLMsTextStatusResponse(success=False, error='Internal server error', status='failed', expiresAt='')
2024
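
# The async counterpart returns only a job id; status is fetched separately.
# A polling sketch with placeholder values:
import time
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
started = app.async_generate_llms_text("https://example.com", max_urls=5)
if started.success:
    while True:
        status = app.check_generate_llms_text_status(started.id)
        if status.status in ("completed", "failed"):
            break
        time.sleep(2)
    print(status.status)
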
-
2025
- def _prepare_headers(
2026
- self,
2027
- idempotency_key: Optional[str] = None) -> Dict[str, str]:
2028
- """
2029
- Prepare the headers for API requests.
2030
-
2031
- Args:
2032
- idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
2033
-
2034
- Returns:
2035
- Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
2036
- """
2037
- if idempotency_key:
2038
- return {
2039
- 'Content-Type': 'application/json',
2040
- 'Authorization': f'Bearer {self.api_key}',
2041
- 'x-idempotency-key': idempotency_key
2042
- }
2043
-
2044
- return {
2045
- 'Content-Type': 'application/json',
2046
- 'Authorization': f'Bearer {self.api_key}',
2047
- }
2048
-
2049
- def _post_request(
2050
- self,
2051
- url: str,
2052
- data: Dict[str, Any],
2053
- headers: Dict[str, str],
2054
- retries: int = 3,
2055
- backoff_factor: float = 0.5) -> requests.Response:
2056
- """
2057
- Make a POST request with retries.
2058
-
2059
- Args:
2060
- url (str): The URL to send the POST request to.
2061
- data (Dict[str, Any]): The JSON data to include in the POST request.
2062
- headers (Dict[str, str]): The headers to include in the POST request.
2063
- retries (int): Number of retries for the request.
2064
- backoff_factor (float): Backoff factor for retries.
2065
-
2066
- Returns:
2067
- requests.Response: The response from the POST request.
2068
-
2069
- Raises:
2070
- requests.RequestException: If the request fails after the specified retries.
2071
- """
2072
- for attempt in range(retries):
2073
- response = requests.post(url, headers=headers, json=data, timeout=((data["timeout"] + 5000) if "timeout" in data else None))
2074
- if response.status_code == 502:
2075
- time.sleep(backoff_factor * (2 ** attempt))
2076
- else:
2077
- return response
2078
- return response
2079
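
# The request helpers above retry only on HTTP 502, sleeping
# backoff_factor * (2 ** attempt) between attempts. With the defaults
# (retries=3, backoff_factor=0.5) the waits work out to:
for attempt in range(3):
    print(f"attempt {attempt}: wait {0.5 * (2 ** attempt):.1f}s before retrying")
# attempt 0: wait 0.5s / attempt 1: wait 1.0s / attempt 2: wait 2.0s
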
-
2080
- def _get_request(
2081
- self,
2082
- url: str,
2083
- headers: Dict[str, str],
2084
- retries: int = 3,
2085
- backoff_factor: float = 0.5) -> requests.Response:
2086
- """
2087
- Make a GET request with retries.
2088
-
2089
- Args:
2090
- url (str): The URL to send the GET request to.
2091
- headers (Dict[str, str]): The headers to include in the GET request.
2092
- retries (int): Number of retries for the request.
2093
- backoff_factor (float): Backoff factor for retries.
2094
-
2095
- Returns:
2096
- requests.Response: The response from the GET request.
2097
-
2098
- Raises:
2099
- requests.RequestException: If the request fails after the specified retries.
2100
- """
2101
- for attempt in range(retries):
2102
- response = requests.get(url, headers=headers)
2103
- if response.status_code == 502:
2104
- time.sleep(backoff_factor * (2 ** attempt))
2105
- else:
2106
- return response
2107
- return response
2108
-
2109
- def _delete_request(
2110
- self,
2111
- url: str,
2112
- headers: Dict[str, str],
2113
- retries: int = 3,
2114
- backoff_factor: float = 0.5) -> requests.Response:
2115
- """
2116
- Make a DELETE request with retries.
2117
-
2118
- Args:
2119
- url (str): The URL to send the DELETE request to.
2120
- headers (Dict[str, str]): The headers to include in the DELETE request.
2121
- retries (int): Number of retries for the request.
2122
- backoff_factor (float): Backoff factor for retries.
2123
-
2124
- Returns:
2125
- requests.Response: The response from the DELETE request.
2126
-
2127
- Raises:
2128
- requests.RequestException: If the request fails after the specified retries.
2129
- """
2130
- for attempt in range(retries):
2131
- response = requests.delete(url, headers=headers)
2132
- if response.status_code == 502:
2133
- time.sleep(backoff_factor * (2 ** attempt))
2134
- else:
2135
- return response
2136
- return response
2137
-
2138
- def _monitor_job_status(
2139
- self,
2140
- id: str,
2141
- headers: Dict[str, str],
2142
- poll_interval: int) -> CrawlStatusResponse:
2143
- """
2144
- Monitor the status of a crawl job until completion.
2145
-
2146
- Args:
2147
- id (str): The ID of the crawl job.
2148
- headers (Dict[str, str]): The headers to include in the status check requests.
2149
- poll_interval (int): Seconds between status checks.
2150
-
2151
- Returns:
2152
- CrawlStatusResponse: The crawl results if the job is completed successfully.
2153
-
2154
- Raises:
2155
- Exception: If the job fails or an error occurs during status checks.
2156
- """
2157
- while True:
2158
- api_url = f'{self.api_url}/v1/crawl/{id}'
2159
-
2160
- status_response = self._get_request(api_url, headers)
2161
- if status_response.status_code == 200:
2162
- try:
2163
- status_data = status_response.json()
2164
- except:
2165
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
2166
- if status_data['status'] == 'completed':
2167
- if 'data' in status_data:
2168
- data = status_data['data']
2169
- while 'next' in status_data:
2170
- if len(status_data['data']) == 0:
2171
- break
2172
- status_response = self._get_request(status_data['next'], headers)
2173
- try:
2174
- status_data = status_response.json()
2175
- except:
2176
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
2177
- data.extend(status_data.get('data', []))
2178
- status_data['data'] = data
2179
- return CrawlStatusResponse(**status_data)
2180
- else:
2181
- raise Exception('Crawl job completed but no data was returned')
2182
- elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
2183
- poll_interval = max(poll_interval, 2)
2184
- time.sleep(poll_interval) # Wait for the specified interval before checking again
2185
- else:
2186
- raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
2187
- else:
2188
- self._handle_error(status_response, 'check crawl status')
2189
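
# The completed-job branch above follows the 'next' cursor and concatenates each
# page's 'data'. The same pattern in isolation, with fetch_page standing in for
# the authenticated GET request (a hypothetical helper, not part of this SDK):
from typing import Any, Callable, Dict, List

def collect_pages(first_page: Dict[str, Any], fetch_page: Callable[[str], Dict[str, Any]]) -> List[Any]:
    data = list(first_page.get("data", []))
    page = first_page
    while page.get("next") and page.get("data"):
        page = fetch_page(page["next"])
        data.extend(page.get("data", []))
    return data
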
-
2190
- def _handle_error(
2191
- self,
2192
- response: requests.Response,
2193
- action: str) -> None:
2194
- """
2195
- Handle errors from API responses.
2196
-
2197
- Args:
2198
- response (requests.Response): The response object from the API request.
2199
- action (str): Description of the action that was being performed.
2200
-
2201
- Raises:
2202
- Exception: An exception with a message containing the status code and error details from the response.
2203
- """
2204
- try:
2205
- error_message = response.json().get('error', 'No error message provided.')
2206
- error_details = response.json().get('details', 'No additional error details provided.')
2207
- except:
2208
- raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
2209
-
2210
- message = self._get_error_message(response.status_code, action, error_message, error_details)
2211
-
2212
- # Raise an HTTPError with the custom message and attach the response
2213
- raise requests.exceptions.HTTPError(message, response=response)
2214
-
2215
- def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2216
- """
2217
- Generate a standardized error message based on HTTP status code.
2218
-
2219
- Args:
2220
- status_code (int): The HTTP status code from the response
2221
- action (str): Description of the action that was being performed
2222
- error_message (str): The error message from the API response
2223
- error_details (str): Additional error details from the API response
2224
-
2225
- Returns:
2226
- str: A formatted error message
2227
- """
2228
- if status_code == 402:
2229
- return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
2230
- elif status_code == 403:
2231
- message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
2232
- elif status_code == 408:
2233
- return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
2234
- elif status_code == 409:
2235
- return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
2236
- elif status_code == 500:
2237
- return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
2238
- else:
2239
- return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
2240
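
# Non-2xx responses surface as requests.exceptions.HTTPError carrying the
# formatted message built above plus the original response object. A sketch,
# assuming the scrape_url method listed in _validate_kwargs further below:
import requests
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
try:
    app.scrape_url("https://example.com/protected")
except requests.exceptions.HTTPError as err:
    print(err)                        # formatted message, e.g. "Payment Required: Failed to ..."
    print(err.response.status_code)   # original status code preserved on the response
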
-
2241
- def deep_research(
2242
- self,
2243
- query: str,
2244
- *,
2245
- max_depth: Optional[int] = None,
2246
- time_limit: Optional[int] = None,
2247
- max_urls: Optional[int] = None,
2248
- analysis_prompt: Optional[str] = None,
2249
- system_prompt: Optional[str] = None,
2250
- __experimental_stream_steps: Optional[bool] = None,
2251
- on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
2252
- on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
2253
- """
2254
- Initiates a deep research operation on a given query and polls until completion.
2255
-
2256
- Args:
2257
- query (str): Research query or topic to investigate
2258
- max_depth (Optional[int]): Maximum depth of research exploration
2259
- time_limit (Optional[int]): Time limit in seconds for research
2260
- max_urls (Optional[int]): Maximum number of URLs to process
2261
- analysis_prompt (Optional[str]): Custom prompt for analysis
2262
- system_prompt (Optional[str]): Custom system prompt
2263
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2264
- on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
2265
- on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
2266
-
2267
- Returns:
2268
- DeepResearchStatusResponse containing:
2269
- * success (bool): Whether research completed successfully
2270
- * status (str): Current state (processing/completed/failed)
2271
- * error (Optional[str]): Error message if failed
2272
- * id (str): Unique identifier for the research job
2273
- * data (Any): Research findings and analysis
2274
- * sources (List[Dict]): List of discovered sources
2275
- * activities (List[Dict]): Research progress log
2276
- * summaries (List[str]): Generated research summaries
2277
-
2278
- Raises:
2279
- Exception: If research fails
2280
- """
2281
- research_params = {}
2282
- if max_depth is not None:
2283
- research_params['maxDepth'] = max_depth
2284
- if time_limit is not None:
2285
- research_params['timeLimit'] = time_limit
2286
- if max_urls is not None:
2287
- research_params['maxUrls'] = max_urls
2288
- if analysis_prompt is not None:
2289
- research_params['analysisPrompt'] = analysis_prompt
2290
- if system_prompt is not None:
2291
- research_params['systemPrompt'] = system_prompt
2292
- if __experimental_stream_steps is not None:
2293
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
2294
- research_params = DeepResearchParams(**research_params)
2295
-
2296
- response = self.async_deep_research(
2297
- query,
2298
- max_depth=max_depth,
2299
- time_limit=time_limit,
2300
- max_urls=max_urls,
2301
- analysis_prompt=analysis_prompt,
2302
- system_prompt=system_prompt
2303
- )
2304
- if not response.get('success') or 'id' not in response:
2305
- return response
2306
-
2307
- job_id = response['id']
2308
- last_activity_count = 0
2309
- last_source_count = 0
2310
-
2311
- while True:
2312
- status = self.check_deep_research_status(job_id)
2313
-
2314
- if on_activity and 'activities' in status:
2315
- new_activities = status['activities'][last_activity_count:]
2316
- for activity in new_activities:
2317
- on_activity(activity)
2318
- last_activity_count = len(status['activities'])
2319
-
2320
- if on_source and 'sources' in status:
2321
- new_sources = status['sources'][last_source_count:]
2322
- for source in new_sources:
2323
- on_source(source)
2324
- last_source_count = len(status['sources'])
2325
-
2326
- if status['status'] == 'completed':
2327
- return status
2328
- elif status['status'] == 'failed':
2329
- raise Exception(f'Deep research failed. Error: {status.get("error")}')
2330
- elif status['status'] != 'processing':
2331
- break
2332
-
2333
- time.sleep(2) # Polling interval
2334
-
2335
- return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
2336
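
# deep_research blocks until the job finishes and can stream progress through
# callbacks. A sketch with a placeholder query and key; the returned object is
# the raw status payload, so dictionary access is used here:
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

def on_activity(activity):
    print(f"[{activity.get('type')}] {activity.get('message')}")

research = app.deep_research(
    "How do open-source crawlers handle JavaScript-heavy sites?",
    max_depth=3,
    time_limit=120,
    max_urls=10,
    on_activity=on_activity,
)
if research.get("status") == "completed":
    print(research.get("data"))
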
-
2337
- def async_deep_research(
2338
- self,
2339
- query: str,
2340
- *,
2341
- max_depth: Optional[int] = None,
2342
- time_limit: Optional[int] = None,
2343
- max_urls: Optional[int] = None,
2344
- analysis_prompt: Optional[str] = None,
2345
- system_prompt: Optional[str] = None,
2346
- __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
2347
- """
2348
- Initiates an asynchronous deep research operation.
2349
-
2350
- Args:
2351
- query (str): Research query or topic to investigate
2352
- max_depth (Optional[int]): Maximum depth of research exploration
2353
- time_limit (Optional[int]): Time limit in seconds for research
2354
- max_urls (Optional[int]): Maximum number of URLs to process
2355
- analysis_prompt (Optional[str]): Custom prompt for analysis
2356
- system_prompt (Optional[str]): Custom system prompt
2357
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2358
-
2359
- Returns:
2360
- Dict[str, Any]: A response containing:
2361
- * success (bool): Whether the research initiation was successful
2362
- * id (str): The unique identifier for the research job
2363
- * error (str, optional): Error message if initiation failed
2364
-
2365
- Raises:
2366
- Exception: If the research initiation fails.
2367
- """
2368
- research_params = {}
2369
- if max_depth is not None:
2370
- research_params['maxDepth'] = max_depth
2371
- if time_limit is not None:
2372
- research_params['timeLimit'] = time_limit
2373
- if max_urls is not None:
2374
- research_params['maxUrls'] = max_urls
2375
- if analysis_prompt is not None:
2376
- research_params['analysisPrompt'] = analysis_prompt
2377
- if system_prompt is not None:
2378
- research_params['systemPrompt'] = system_prompt
2379
- if __experimental_stream_steps is not None:
2380
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
2381
- research_params = DeepResearchParams(**research_params)
2382
-
2383
- headers = self._prepare_headers()
2384
-
2385
- json_data = {'query': query, **research_params.dict(exclude_none=True)}
2386
- json_data['origin'] = f"python-sdk@{version}"
2387
-
2388
- # Handle json options schema if present
2389
- if 'jsonOptions' in json_data:
2390
- json_opts = json_data['jsonOptions']
2391
- if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
2392
- json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
2393
-
2394
- try:
2395
- response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
2396
- if response.status_code == 200:
2397
- try:
2398
- return response.json()
2399
- except:
2400
- raise Exception('Failed to parse Firecrawl response as JSON.')
2401
- else:
2402
- self._handle_error(response, 'start deep research')
2403
- except Exception as e:
2404
- raise ValueError(str(e))
2405
-
2406
- return {'success': False, 'error': 'Internal server error'}
2407
-
2408
- def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
2409
- """
2410
- Check the status of a deep research operation.
2411
-
2412
- Args:
2413
- id (str): The ID of the deep research operation.
2414
-
2415
- Returns:
2416
- DeepResearchStatusResponse containing:
2417
-
2418
- Status:
2419
- * success - Whether research completed successfully
2420
- * status - Current state (processing/completed/failed)
2421
- * error - Error message if failed
2422
-
2423
- Results:
2424
- * id - Unique identifier for the research job
2425
- * data - Research findings and analysis
2426
- * sources - List of discovered sources
2427
- * activities - Research progress log
2428
- * summaries - Generated research summaries
2429
-
2430
- Raises:
2431
- Exception: If the status check fails.
2432
- """
2433
- headers = self._prepare_headers()
2434
- try:
2435
- response = self._get_request(f'{self.api_url}/v1/deep-research/{id}', headers)
2436
- if response.status_code == 200:
2437
- try:
2438
- return response.json()
2439
- except:
2440
- raise Exception('Failed to parse Firecrawl response as JSON.')
2441
- elif response.status_code == 404:
2442
- raise Exception('Deep research job not found')
2443
- else:
2444
- self._handle_error(response, 'check deep research status')
2445
- except Exception as e:
2446
- raise ValueError(str(e))
2447
-
2448
- return {'success': False, 'error': 'Internal server error'}
2449
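
# Fire-and-poll variant of the same flow, with placeholder values:
import time
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
started = app.async_deep_research("How are vector databases benchmarked?", max_urls=5)
if started.get("success"):
    while True:
        status = app.check_deep_research_status(started["id"])
        if status.get("status") in ("completed", "failed"):
            break
        time.sleep(2)
    print(status.get("status"), status.get("error"))
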
-
2450
- def _validate_kwargs(self, kwargs: Dict[str, Any], method_name: str) -> None:
2451
- """
2452
- Validate additional keyword arguments before they are passed to the API.
2453
- This provides early validation before the Pydantic model validation.
2454
-
2455
- Args:
2456
- kwargs (Dict[str, Any]): Additional keyword arguments to validate
2457
- method_name (str): Name of the method these kwargs are for
2458
-
2459
- Raises:
2460
- ValueError: If kwargs contain invalid or unsupported parameters
2461
- """
2462
- if not kwargs:
2463
- return
2464
-
2465
- # Known parameter mappings for each method
2466
- method_params = {
2467
- "scrape_url": {"formats", "include_tags", "exclude_tags", "only_main_content", "wait_for",
2468
- "timeout", "location", "mobile", "skip_tls_verification", "remove_base64_images",
2469
- "block_ads", "proxy", "extract", "json_options", "actions", "change_tracking_options"},
2470
- "search": {"limit", "tbs", "filter", "lang", "country", "location", "timeout", "scrape_options"},
2471
- "crawl_url": {"include_paths", "exclude_paths", "max_depth", "max_discovery_depth", "limit",
2472
- "allow_backward_links", "allow_external_links", "ignore_sitemap", "scrape_options",
2473
- "webhook", "deduplicate_similar_urls", "ignore_query_parameters", "regex_on_full_url"},
2474
- "map_url": {"search", "ignore_sitemap", "include_subdomains", "sitemap_only", "limit", "timeout"},
2475
- "batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2476
- "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2477
- "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2478
- "actions", "agent", "webhook"},
2479
- "async_batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2480
- "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2481
- "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2482
- "actions", "agent", "webhook"},
2483
- "batch_scrape_urls_and_watch": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2484
- "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2485
- "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2486
- "actions", "agent", "webhook"}
2487
- }
2488
-
2489
- # Get allowed parameters for this method
2490
- allowed_params = method_params.get(method_name, set())
2491
-
2492
- # Check for unknown parameters
2493
- unknown_params = set(kwargs.keys()) - allowed_params
2494
- if unknown_params:
2495
- raise ValueError(f"Unsupported parameter(s) for {method_name}: {', '.join(unknown_params)}. Please refer to the API documentation for the correct parameters.")
2496
-
2497
- # Additional type validation can be added here if needed
2498
- # For now, we rely on Pydantic models for detailed type validation
2499
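
# Unknown keyword arguments are rejected locally, before any request is sent.
# A sketch using batch_scrape_urls, whose allowed parameters are listed above:
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
try:
    app.batch_scrape_urls(["https://example.com"], unknown_option=True)
except ValueError as err:
    print(err)  # "Unsupported parameter(s) for batch_scrape_urls: unknown_option. ..."
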
-
2500
- def _ensure_schema_dict(self, schema):
2501
- """
2502
- Utility to ensure a schema is a dict, not a Pydantic model class. Recursively checks dicts and lists.
2503
- """
2504
- if schema is None:
2505
- return schema
2506
- if isinstance(schema, type):
2507
- # Pydantic v1/v2 model class
2508
- if hasattr(schema, 'model_json_schema'):
2509
- return schema.model_json_schema()
2510
- elif hasattr(schema, 'schema'):
2511
- return schema.schema()
2512
- if isinstance(schema, dict):
2513
- return {k: self._ensure_schema_dict(v) for k, v in schema.items()}
2514
- if isinstance(schema, (list, tuple)):
2515
- return [self._ensure_schema_dict(v) for v in schema]
2516
- return schema
2517
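
# What _ensure_schema_dict normalizes: a Pydantic model class becomes a plain
# JSON-schema dict via model_json_schema() (Pydantic v2) or schema() (v1).
from pydantic import BaseModel

class Product(BaseModel):
    name: str
    price: float

schema_dict = Product.model_json_schema()  # on Pydantic v1 this would be Product.schema()
print(sorted(schema_dict["properties"]))   # ['name', 'price']
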
-
2518
- class CrawlWatcher:
2519
- """
2520
- A class to watch and handle crawl job events via WebSocket connection.
2521
-
2522
- Attributes:
2523
- id (str): The ID of the crawl job to watch
2524
- app (FirecrawlApp): The FirecrawlApp instance
2525
- data (List[Dict[str, Any]]): List of crawled documents/data
2526
- status (str): Current status of the crawl job
2527
- ws_url (str): WebSocket URL for the crawl job
2528
- event_handlers (dict): Dictionary of event type to list of handler functions
2529
- """
2530
- def __init__(self, id: str, app: FirecrawlApp):
2531
- self.id = id
2532
- self.app = app
2533
- self.data: List[Dict[str, Any]] = []
2534
- self.status = "scraping"
2535
- self.ws_url = f"{app.api_url.replace('http', 'ws')}/v1/crawl/{id}"
2536
- self.event_handlers = {
2537
- 'done': [],
2538
- 'error': [],
2539
- 'document': []
2540
- }
2541
-
2542
- async def connect(self) -> None:
2543
- """
2544
- Establishes WebSocket connection and starts listening for messages.
2545
- """
2546
- async with websockets.connect(
2547
- self.ws_url,
2548
- additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
2549
- ) as websocket:
2550
- await self._listen(websocket)
2551
-
2552
- async def _listen(self, websocket) -> None:
2553
- """
2554
- Listens for incoming WebSocket messages and handles them.
2555
-
2556
- Args:
2557
- websocket: The WebSocket connection object
2558
- """
2559
- async for message in websocket:
2560
- msg = json.loads(message)
2561
- await self._handle_message(msg)
2562
-
2563
- def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
2564
- """
2565
- Adds an event handler function for a specific event type.
2566
-
2567
- Args:
2568
- event_type (str): Type of event to listen for ('done', 'error', or 'document')
2569
- handler (Callable): Function to handle the event
2570
- """
2571
- if event_type in self.event_handlers:
2572
- self.event_handlers[event_type].append(handler)
2573
-
2574
- def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
2575
- """
2576
- Dispatches an event to all registered handlers for that event type.
2577
-
2578
- Args:
2579
- event_type (str): Type of event to dispatch
2580
- detail (Dict[str, Any]): Event details/data to pass to handlers
2581
- """
2582
- if event_type in self.event_handlers:
2583
- for handler in self.event_handlers[event_type]:
2584
- handler(detail)
2585
-
2586
- async def _handle_message(self, msg: Dict[str, Any]) -> None:
2587
- """
2588
- Handles incoming WebSocket messages based on their type.
2589
-
2590
- Args:
2591
- msg (Dict[str, Any]): The message to handle
2592
- """
2593
- if msg['type'] == 'done':
2594
- self.status = 'completed'
2595
- self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
2596
- elif msg['type'] == 'error':
2597
- self.status = 'failed'
2598
- self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
2599
- elif msg['type'] == 'catchup':
2600
- self.status = msg['data']['status']
2601
- self.data.extend(msg['data'].get('data', []))
2602
- for doc in self.data:
2603
- self.dispatch_event('document', {'data': doc, 'id': self.id})
2604
- elif msg['type'] == 'document':
2605
- self.data.append(msg['data'])
2606
- self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
2607
-
2608
- class AsyncFirecrawlApp(FirecrawlApp):
2609
- """
2610
- Asynchronous version of FirecrawlApp that implements async methods using aiohttp.
2611
- Provides non-blocking alternatives to all FirecrawlApp operations.
2612
- """
2613
-
2614
- async def _async_request(
2615
- self,
2616
- method: str,
2617
- url: str,
2618
- headers: Dict[str, str],
2619
- data: Optional[Dict[str, Any]] = None,
2620
- retries: int = 3,
2621
- backoff_factor: float = 0.5) -> Dict[str, Any]:
2622
- """
2623
- Generic async request method with exponential backoff retry logic.
2624
-
2625
- Args:
2626
- method (str): The HTTP method to use (e.g., "GET" or "POST").
2627
- url (str): The URL to send the request to.
2628
- headers (Dict[str, str]): Headers to include in the request.
2629
- data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
2630
- retries (int): Maximum number of retry attempts (default: 3).
2631
- backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2632
- Delay will be backoff_factor * (2 ** retry_count).
2633
-
2634
- Returns:
2635
- Dict[str, Any]: The parsed JSON response from the server.
2636
-
2637
- Raises:
2638
- aiohttp.ClientError: If the request fails after all retries.
2639
- Exception: If max retries are exceeded or other errors occur.
2640
- """
2641
- async with aiohttp.ClientSession() as session:
2642
- for attempt in range(retries):
2643
- try:
2644
- async with session.request(
2645
- method=method, url=url, headers=headers, json=data
2646
- ) as response:
2647
- if response.status == 502:
2648
- await asyncio.sleep(backoff_factor * (2 ** attempt))
2649
- continue
2650
- if response.status >= 300:
2651
- await self._handle_error(response, f"make {method} request")
2652
- return await response.json()
2653
- except aiohttp.ClientError as e:
2654
- if attempt == retries - 1:
2655
- raise e
2656
- await asyncio.sleep(backoff_factor * (2 ** attempt))
2657
- raise Exception("Max retries exceeded")
2658
-
2659
- async def _async_post_request(
2660
- self, url: str, data: Dict[str, Any], headers: Dict[str, str],
2661
- retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2662
- """
2663
- Make an async POST request with exponential backoff retry logic.
2664
-
2665
- Args:
2666
- url (str): The URL to send the POST request to.
2667
- data (Dict[str, Any]): The JSON data to include in the request body.
2668
- headers (Dict[str, str]): Headers to include in the request.
2669
- retries (int): Maximum number of retry attempts (default: 3).
2670
- backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2671
- Delay will be backoff_factor * (2 ** retry_count).
2672
-
2673
- Returns:
2674
- Dict[str, Any]: The parsed JSON response from the server.
2675
-
2676
- Raises:
2677
- aiohttp.ClientError: If the request fails after all retries.
2678
- Exception: If max retries are exceeded or other errors occur.
2679
- """
2680
- return await self._async_request("POST", url, headers, data, retries, backoff_factor)
2681
-
2682
- async def _async_get_request(
2683
- self, url: str, headers: Dict[str, str],
2684
- retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2685
- """
2686
- Make an async GET request with exponential backoff retry logic.
2687
-
2688
- Args:
2689
- url (str): The URL to send the GET request to.
2690
- headers (Dict[str, str]): Headers to include in the request.
2691
- retries (int): Maximum number of retry attempts (default: 3).
2692
- backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2693
- Delay will be backoff_factor * (2 ** retry_count).
2694
-
2695
- Returns:
2696
- Dict[str, Any]: The parsed JSON response from the server.
2697
-
2698
- Raises:
2699
- aiohttp.ClientError: If the request fails after all retries.
2700
- Exception: If max retries are exceeded or other errors occur.
2701
- """
2702
- return await self._async_request("GET", url, headers, None, retries, backoff_factor)
2703
-
2704
- async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
2705
- """
2706
- Handle errors from async API responses with detailed error messages.
2707
-
2708
- Args:
2709
- response (aiohttp.ClientResponse): The response object from the failed request
2710
- action (str): Description of the action that was being attempted
2711
-
2712
- Raises:
2713
- aiohttp.ClientError: With a detailed error message based on the response status:
2714
- - 402: Payment Required
2715
- - 408: Request Timeout
2716
- - 409: Conflict
2717
- - 500: Internal Server Error
2718
- - Other: Unexpected error with status code
2719
- """
2720
- try:
2721
- error_data = await response.json()
2722
- error_message = error_data.get('error', 'No error message provided.')
2723
- error_details = error_data.get('details', 'No additional error details provided.')
2724
- except:
2725
- raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
2726
-
2727
- message = await self._get_async_error_message(response.status, action, error_message, error_details)
2728
-
2729
- raise aiohttp.ClientError(message)
2730
-
2731
- async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2732
- """
2733
- Generate a standardized error message based on HTTP status code for async operations.
2734
-
2735
- Args:
2736
- status_code (int): The HTTP status code from the response
2737
- action (str): Description of the action that was being performed
2738
- error_message (str): The error message from the API response
2739
- error_details (str): Additional error details from the API response
2740
-
2741
- Returns:
2742
- str: A formatted error message
2743
- """
2744
- return self._get_error_message(status_code, action, error_message, error_details)
2745
-
2746
- async def crawl_url_and_watch(
2747
- self,
2748
- url: str,
2749
- params: Optional[CrawlParams] = None,
2750
- idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2751
- """
2752
- Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket.
2753
-
2754
- Args:
2755
- url (str): Target URL to start crawling from
2756
- params (Optional[CrawlParams]): See CrawlParams model for configuration:
2757
- URL Discovery:
2758
- * includePaths - Patterns of URLs to include
2759
- * excludePaths - Patterns of URLs to exclude
2760
- * maxDepth - Maximum crawl depth
2761
- * maxDiscoveryDepth - Maximum depth for finding new URLs
2762
- * limit - Maximum pages to crawl
2763
-
2764
- Link Following:
2765
- * allowBackwardLinks - Follow parent directory links
2766
- * allowExternalLinks - Follow external domain links
2767
- * ignoreSitemap - Skip sitemap.xml processing
2768
-
2769
- Advanced:
2770
- * scrapeOptions - Page scraping configuration
2771
- * webhook - Notification webhook settings
2772
- * deduplicateSimilarURLs - Remove similar URLs
2773
- * ignoreQueryParameters - Ignore URL parameters
2774
- * regexOnFullURL - Apply regex to full URLs
2775
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2776
-
2777
- Returns:
2778
- AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket
2779
-
2780
- Raises:
2781
- Exception: If crawl job fails to start
2782
- """
2783
- # async_crawl_url only accepts keyword arguments, so unpack the params model into kwargs.
- crawl_response = await self.async_crawl_url(url, idempotency_key=idempotency_key, **(params.dict(exclude_none=True) if params else {}))
2784
- if crawl_response.get('success') and 'id' in crawl_response:
2785
- return AsyncCrawlWatcher(crawl_response['id'], self)
2786
- else:
2787
- raise Exception("Crawl job failed to start")
2788
-
2789
- async def batch_scrape_urls_and_watch(
2790
- self,
2791
- urls: List[str],
2792
- params: Optional[ScrapeParams] = None,
2793
- idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2794
- """
2795
- Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress.
2796
-
2797
- Args:
2798
- urls (List[str]): List of URLs to scrape
2799
- params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
2800
-
2801
- Content Options:
2802
- * formats - Content formats to retrieve
2803
- * includeTags - HTML tags to include
2804
- * excludeTags - HTML tags to exclude
2805
- * onlyMainContent - Extract main content only
2806
-
2807
- Request Options:
2808
- * headers - Custom HTTP headers
2809
- * timeout - Request timeout (ms)
2810
- * mobile - Use mobile user agent
2811
- * proxy - Proxy type
2812
-
2813
- Extraction Options:
2814
- * extract - Content extraction config
2815
- * jsonOptions - JSON extraction config
2816
- * actions - Actions to perform
2817
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2818
-
2819
- Returns:
2820
- AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket
2821
-
2822
- Raises:
2823
- Exception: If batch scrape job fails to start
2824
- """
2825
- # async_batch_scrape_urls only accepts keyword arguments, so unpack the params model into kwargs.
- batch_response = await self.async_batch_scrape_urls(urls, idempotency_key=idempotency_key, **(params.dict(exclude_none=True) if params else {}))
2826
- if batch_response.get('success') and 'id' in batch_response:
2827
- return AsyncCrawlWatcher(batch_response['id'], self)
2828
- else:
2829
- raise Exception("Batch scrape job failed to start")
2830
-
2831
- async def scrape_url(
2832
- self,
2833
- url: str,
2834
- *,
2835
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
2836
- include_tags: Optional[List[str]] = None,
2837
- exclude_tags: Optional[List[str]] = None,
2838
- only_main_content: Optional[bool] = None,
2839
- wait_for: Optional[int] = None,
2840
- timeout: Optional[int] = None,
2841
- location: Optional[LocationConfig] = None,
2842
- mobile: Optional[bool] = None,
2843
- skip_tls_verification: Optional[bool] = None,
2844
- remove_base64_images: Optional[bool] = None,
2845
- block_ads: Optional[bool] = None,
2846
- proxy: Optional[Literal["basic", "stealth"]] = None,
2847
- extract: Optional[JsonConfig] = None,
2848
- json_options: Optional[JsonConfig] = None,
2849
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2850
- **kwargs) -> ScrapeResponse[Any]:
2851
- """
2852
- Scrape a single URL asynchronously.
2853
-
2854
- Args:
2855
- url (str): Target URL to scrape
2856
- formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
2857
- include_tags (Optional[List[str]]): HTML tags to include
2858
- exclude_tags (Optional[List[str]]): HTML tags to exclude
2859
- only_main_content (Optional[bool]): Extract main content only
2860
- wait_for (Optional[int]): Wait time in milliseconds before capturing the page
2861
- timeout (Optional[int]): Request timeout (ms)
2862
- location (Optional[LocationConfig]): Location configuration
2863
- mobile (Optional[bool]): Use mobile user agent
2864
- skip_tls_verification (Optional[bool]): Skip TLS verification
2865
- remove_base64_images (Optional[bool]): Remove base64 images
2866
- block_ads (Optional[bool]): Block ads
2867
- proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
2868
- extract (Optional[JsonConfig]): Content extraction settings
2869
- json_options (Optional[JsonConfig]): JSON extraction settings
2870
- actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
2871
- **kwargs: Additional parameters to pass to the API
2872
-
2873
- Returns:
2874
- ScrapeResponse with:
2875
- * success - Whether scrape was successful
2876
- * markdown - Markdown content if requested
2877
- * html - HTML content if requested
2878
- * rawHtml - Raw HTML content if requested
2879
- * links - Extracted links if requested
2880
- * screenshot - Screenshot if requested
2881
- * extract - Extracted data if requested
2882
- * json - JSON data if requested
2883
- * error - Error message if scrape failed
2884
-
2885
- Raises:
2886
- Exception: If scraping fails
2887
- """
2888
- # Validate any additional kwargs
2889
- self._validate_kwargs(kwargs, "scrape_url")
2890
-
2891
- headers = self._prepare_headers()
2892
-
2893
- # Build scrape parameters
2894
- scrape_params = {
2895
- 'url': url,
2896
- 'origin': f"python-sdk@{version}"
2897
- }
2898
-
2899
- # Add optional parameters if provided and not None
2900
- if formats:
2901
- scrape_params['formats'] = formats
2902
- if include_tags:
2903
- scrape_params['includeTags'] = include_tags
2904
- if exclude_tags:
2905
- scrape_params['excludeTags'] = exclude_tags
2906
- if only_main_content is not None:
2907
- scrape_params['onlyMainContent'] = only_main_content
2908
- if wait_for:
2909
- scrape_params['waitFor'] = wait_for
2910
- if timeout:
2911
- scrape_params['timeout'] = timeout
2912
- if location:
2913
- scrape_params['location'] = location.dict(exclude_none=True)
2914
- if mobile is not None:
2915
- scrape_params['mobile'] = mobile
2916
- if skip_tls_verification is not None:
2917
- scrape_params['skipTlsVerification'] = skip_tls_verification
2918
- if remove_base64_images is not None:
2919
- scrape_params['removeBase64Images'] = remove_base64_images
2920
- if block_ads is not None:
2921
- scrape_params['blockAds'] = block_ads
2922
- if proxy:
2923
- scrape_params['proxy'] = proxy
2924
- if extract is not None:
2925
- extract = self._ensure_schema_dict(extract)
2926
- if isinstance(extract, dict) and "schema" in extract:
2927
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
2928
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
2929
- if json_options is not None:
2930
- json_options = self._ensure_schema_dict(json_options)
2931
- if isinstance(json_options, dict) and "schema" in json_options:
2932
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
2933
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
2934
- if actions:
2935
- scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
2936
-
2937
- if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
2938
- scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
2939
- if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
2940
- scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
2941
-
2942
- # Make async request
2943
- endpoint = f'/v1/scrape'
2944
- response = await self._async_post_request(
2945
- f'{self.api_url}{endpoint}',
2946
- scrape_params,
2947
- headers
2948
- )
2949
-
2950
- if response.get('success') and 'data' in response:
2951
- return ScrapeResponse(**response['data'])
2952
- elif "error" in response:
2953
- raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
2954
- else:
2955
- # Use the response content directly if possible, otherwise a generic message
2956
- error_content = response.get('error', str(response))
2957
- raise Exception(f'Failed to scrape URL. Error: {error_content}')
2958
-
2959
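A minimal usage sketch for the scrape_url coroutine above. Assumptions, not taken from this diff: the enclosing async client is exported as AsyncFirecrawlApp (as in recent firecrawl-py releases), the API key is read from a FIRECRAWL_API_KEY environment variable, and the URL is illustrative.

    import asyncio
    import os
    from firecrawl import AsyncFirecrawlApp  # assumed export name

    async def main():
        app = AsyncFirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
        # Request markdown plus the page links; keyword names mirror the signature above.
        result = await app.scrape_url("https://example.com", formats=["markdown", "links"], only_main_content=True)
        print(result.markdown)

    asyncio.run(main())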
- async def batch_scrape_urls(
2960
- self,
2961
- urls: List[str],
2962
- *,
2963
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
2964
- headers: Optional[Dict[str, str]] = None,
2965
- include_tags: Optional[List[str]] = None,
2966
- exclude_tags: Optional[List[str]] = None,
2967
- only_main_content: Optional[bool] = None,
2968
- wait_for: Optional[int] = None,
2969
- timeout: Optional[int] = None,
2970
- location: Optional[LocationConfig] = None,
2971
- mobile: Optional[bool] = None,
2972
- skip_tls_verification: Optional[bool] = None,
2973
- remove_base64_images: Optional[bool] = None,
2974
- block_ads: Optional[bool] = None,
2975
- proxy: Optional[Literal["basic", "stealth"]] = None,
2976
- extract: Optional[JsonConfig] = None,
2977
- json_options: Optional[JsonConfig] = None,
2978
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2979
- agent: Optional[AgentOptions] = None,
2980
- poll_interval: Optional[int] = 2,
2981
- idempotency_key: Optional[str] = None,
2982
- **kwargs
2983
- ) -> BatchScrapeStatusResponse:
2984
- """
2985
- Asynchronously scrape multiple URLs and monitor until completion.
2986
-
2987
- Args:
2988
- urls (List[str]): URLs to scrape
2989
- formats (Optional[List[Literal]]): Content formats to retrieve
2990
- headers (Optional[Dict[str, str]]): Custom HTTP headers
2991
- include_tags (Optional[List[str]]): HTML tags to include
2992
- exclude_tags (Optional[List[str]]): HTML tags to exclude
2993
- only_main_content (Optional[bool]): Extract main content only
2994
- wait_for (Optional[int]): Wait time in milliseconds
2995
- timeout (Optional[int]): Request timeout in milliseconds
2996
- location (Optional[LocationConfig]): Location configuration
2997
- mobile (Optional[bool]): Use mobile user agent
2998
- skip_tls_verification (Optional[bool]): Skip TLS verification
2999
- remove_base64_images (Optional[bool]): Remove base64 encoded images
3000
- block_ads (Optional[bool]): Block advertisements
3001
- proxy (Optional[Literal]): Proxy type to use
3002
- extract (Optional[JsonConfig]): Content extraction config
3003
- json_options (Optional[JsonConfig]): JSON extraction config
3004
- actions (Optional[List[Union]]): Actions to perform
3005
- agent (Optional[AgentOptions]): Agent configuration
3006
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
3007
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3008
- **kwargs: Additional parameters to pass to the API
3009
-
3010
- Returns:
3011
- BatchScrapeStatusResponse with:
3012
- * Scraping status and progress
3013
- * Scraped content for each URL
3014
- * Success/error information
3015
-
3016
- Raises:
3017
- Exception: If batch scrape fails
3018
- """
3019
- # Validate any additional kwargs
3020
- self._validate_kwargs(kwargs, "batch_scrape_urls")
3021
-
3022
- scrape_params = {}
3023
-
3024
- # Add individual parameters
3025
- if formats is not None:
3026
- scrape_params['formats'] = formats
3027
- if headers is not None:
3028
- scrape_params['headers'] = headers
3029
- if include_tags is not None:
3030
- scrape_params['includeTags'] = include_tags
3031
- if exclude_tags is not None:
3032
- scrape_params['excludeTags'] = exclude_tags
3033
- if only_main_content is not None:
3034
- scrape_params['onlyMainContent'] = only_main_content
3035
- if wait_for is not None:
3036
- scrape_params['waitFor'] = wait_for
3037
- if timeout is not None:
3038
- scrape_params['timeout'] = timeout
3039
- if location is not None:
3040
- scrape_params['location'] = location.dict(exclude_none=True)
3041
- if mobile is not None:
3042
- scrape_params['mobile'] = mobile
3043
- if skip_tls_verification is not None:
3044
- scrape_params['skipTlsVerification'] = skip_tls_verification
3045
- if remove_base64_images is not None:
3046
- scrape_params['removeBase64Images'] = remove_base64_images
3047
- if block_ads is not None:
3048
- scrape_params['blockAds'] = block_ads
3049
- if proxy is not None:
3050
- scrape_params['proxy'] = proxy
3051
- if extract is not None:
3052
- extract = self._ensure_schema_dict(extract)
3053
- if isinstance(extract, dict) and "schema" in extract:
3054
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
3055
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3056
- if json_options is not None:
3057
- json_options = self._ensure_schema_dict(json_options)
3058
- if isinstance(json_options, dict) and "schema" in json_options:
3059
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3060
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3061
- if actions is not None:
3062
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3063
- if agent is not None:
3064
- scrape_params['agent'] = agent.dict(exclude_none=True)
3065
-
3066
- # Add any additional kwargs
3067
- scrape_params.update(kwargs)
3068
-
3069
- # Create final params object
3070
- final_params = ScrapeParams(**scrape_params)
3071
- params_dict = final_params.dict(exclude_none=True)
3072
- params_dict['urls'] = urls
3073
- params_dict['origin'] = f"python-sdk@{version}"
3074
-
3075
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3076
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3077
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3078
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3079
-
3080
- # Make request
3081
- headers = self._prepare_headers(idempotency_key)
3082
- response = await self._async_post_request(
3083
- f'{self.api_url}/v1/batch/scrape',
3084
- params_dict,
3085
- headers
3086
- )
3087
-
3088
- if response.get('success') and 'id' in response:
- job_id = response['id']
- return await self._async_monitor_job_status(job_id, headers, poll_interval)
- else:
- # The request helper has already parsed the JSON body, so surface the API error directly.
- raise Exception(f"Failed to start batch scrape job. Error: {response.get('error', response)}")
3096
-
3097
-
3098
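A hedged sketch of the waiting variant, batch_scrape_urls, under the same assumptions as the scrape_url example (AsyncFirecrawlApp export, FIRECRAWL_API_KEY env var, illustrative URLs). The call polls server-side status until the job finishes.

    import asyncio
    import os
    from firecrawl import AsyncFirecrawlApp  # assumed export name

    async def main():
        app = AsyncFirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
        # Resolves only once every URL has been scraped or the job fails.
        status = await app.batch_scrape_urls(
            ["https://example.com", "https://example.org"],
            formats=["markdown"],
            poll_interval=2,
        )
        print(status.completed, "/", status.total)

    asyncio.run(main())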
- async def async_batch_scrape_urls(
3099
- self,
3100
- urls: List[str],
3101
- *,
3102
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3103
- headers: Optional[Dict[str, str]] = None,
3104
- include_tags: Optional[List[str]] = None,
3105
- exclude_tags: Optional[List[str]] = None,
3106
- only_main_content: Optional[bool] = None,
3107
- wait_for: Optional[int] = None,
3108
- timeout: Optional[int] = None,
3109
- location: Optional[LocationConfig] = None,
3110
- mobile: Optional[bool] = None,
3111
- skip_tls_verification: Optional[bool] = None,
3112
- remove_base64_images: Optional[bool] = None,
3113
- block_ads: Optional[bool] = None,
3114
- proxy: Optional[Literal["basic", "stealth"]] = None,
3115
- extract: Optional[JsonConfig] = None,
3116
- json_options: Optional[JsonConfig] = None,
3117
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
3118
- agent: Optional[AgentOptions] = None,
3119
- idempotency_key: Optional[str] = None,
3120
- **kwargs
3121
- ) -> BatchScrapeResponse:
3122
- """
3123
- Initiate a batch scrape job asynchronously.
3124
-
3125
- Args:
3126
- urls (List[str]): URLs to scrape
3127
- formats (Optional[List[Literal]]): Content formats to retrieve
3128
- headers (Optional[Dict[str, str]]): Custom HTTP headers
3129
- include_tags (Optional[List[str]]): HTML tags to include
3130
- exclude_tags (Optional[List[str]]): HTML tags to exclude
3131
- only_main_content (Optional[bool]): Extract main content only
3132
- wait_for (Optional[int]): Wait time in milliseconds
3133
- timeout (Optional[int]): Request timeout in milliseconds
3134
- location (Optional[LocationConfig]): Location configuration
3135
- mobile (Optional[bool]): Use mobile user agent
3136
- skip_tls_verification (Optional[bool]): Skip TLS verification
3137
- remove_base64_images (Optional[bool]): Remove base64 encoded images
3138
- block_ads (Optional[bool]): Block advertisements
3139
- proxy (Optional[Literal]): Proxy type to use
3140
- extract (Optional[JsonConfig]): Content extraction config
3141
- json_options (Optional[JsonConfig]): JSON extraction config
3142
- actions (Optional[List[Union]]): Actions to perform
3143
- agent (Optional[AgentOptions]): Agent configuration
3144
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3145
- **kwargs: Additional parameters to pass to the API
3146
-
3147
- Returns:
3148
- BatchScrapeResponse with:
3149
- * success - Whether job started successfully
3150
- * id - Unique identifier for the job
3151
- * url - Status check URL
3152
- * error - Error message if start failed
3153
-
3154
- Raises:
3155
- Exception: If job initiation fails
3156
- """
3157
- # Validate any additional kwargs
3158
- self._validate_kwargs(kwargs, "async_batch_scrape_urls")
3159
-
3160
- scrape_params = {}
3161
-
3162
- # Add individual parameters
3163
- if formats is not None:
3164
- scrape_params['formats'] = formats
3165
- if headers is not None:
3166
- scrape_params['headers'] = headers
3167
- if include_tags is not None:
3168
- scrape_params['includeTags'] = include_tags
3169
- if exclude_tags is not None:
3170
- scrape_params['excludeTags'] = exclude_tags
3171
- if only_main_content is not None:
3172
- scrape_params['onlyMainContent'] = only_main_content
3173
- if wait_for is not None:
3174
- scrape_params['waitFor'] = wait_for
3175
- if timeout is not None:
3176
- scrape_params['timeout'] = timeout
3177
- if location is not None:
3178
- scrape_params['location'] = location.dict(exclude_none=True)
3179
- if mobile is not None:
3180
- scrape_params['mobile'] = mobile
3181
- if skip_tls_verification is not None:
3182
- scrape_params['skipTlsVerification'] = skip_tls_verification
3183
- if remove_base64_images is not None:
3184
- scrape_params['removeBase64Images'] = remove_base64_images
3185
- if block_ads is not None:
3186
- scrape_params['blockAds'] = block_ads
3187
- if proxy is not None:
3188
- scrape_params['proxy'] = proxy
3189
- if extract is not None:
3190
- extract = self._ensure_schema_dict(extract)
3191
- if isinstance(extract, dict) and "schema" in extract:
3192
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
3193
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3194
- if json_options is not None:
3195
- json_options = self._ensure_schema_dict(json_options)
3196
- if isinstance(json_options, dict) and "schema" in json_options:
3197
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3198
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3199
- if actions is not None:
3200
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3201
- if agent is not None:
3202
- scrape_params['agent'] = agent.dict(exclude_none=True)
3203
-
3204
- # Add any additional kwargs
3205
- scrape_params.update(kwargs)
3206
-
3207
- # Create final params object
3208
- final_params = ScrapeParams(**scrape_params)
3209
- params_dict = final_params.dict(exclude_none=True)
3210
- params_dict['urls'] = urls
3211
- params_dict['origin'] = f"python-sdk@{version}"
3212
-
3213
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3214
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3215
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3216
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3217
-
3218
- # Make request
3219
- headers = self._prepare_headers(idempotency_key)
3220
- response = await self._async_post_request(
3221
- f'{self.api_url}/v1/batch/scrape',
3222
- params_dict,
3223
- headers
3224
- )
3225
-
3226
- # _async_post_request already returns the parsed JSON body as a dict, so there is
- # no status_code attribute or .json() method to call here.
- if response.get('success'):
- return BatchScrapeResponse(**response)
- else:
- raise Exception(f"Failed to start batch scrape job. Error: {response.get('error', response)}")
3233
-
3234
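The fire-and-forget variant returns a job handle instead of waiting. A sketch combining async_batch_scrape_urls with check_batch_scrape_status, same assumptions as the earlier examples:

    import asyncio
    import os
    from firecrawl import AsyncFirecrawlApp  # assumed export name

    async def main():
        app = AsyncFirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
        # Start the job without waiting, then check on it whenever convenient.
        job = await app.async_batch_scrape_urls(["https://example.com"], formats=["markdown"])
        status = await app.check_batch_scrape_status(job.id)
        print(status.status)

    asyncio.run(main())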
- async def crawl_url(
3235
- self,
3236
- url: str,
3237
- *,
3238
- include_paths: Optional[List[str]] = None,
3239
- exclude_paths: Optional[List[str]] = None,
3240
- max_depth: Optional[int] = None,
3241
- max_discovery_depth: Optional[int] = None,
3242
- limit: Optional[int] = None,
3243
- allow_backward_links: Optional[bool] = None,
3244
- allow_external_links: Optional[bool] = None,
3245
- ignore_sitemap: Optional[bool] = None,
3246
- scrape_options: Optional[ScrapeOptions] = None,
3247
- webhook: Optional[Union[str, WebhookConfig]] = None,
3248
- deduplicate_similar_urls: Optional[bool] = None,
3249
- ignore_query_parameters: Optional[bool] = None,
3250
- regex_on_full_url: Optional[bool] = None,
3251
- delay: Optional[int] = None,
3252
- poll_interval: Optional[int] = 2,
3253
- idempotency_key: Optional[str] = None,
3254
- **kwargs
3255
- ) -> CrawlStatusResponse:
3256
- """
3257
- Crawl a website starting from a URL.
3258
-
3259
- Args:
3260
- url (str): Target URL to start crawling from
3261
- include_paths (Optional[List[str]]): Patterns of URLs to include
3262
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3263
- max_depth (Optional[int]): Maximum crawl depth
3264
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3265
- limit (Optional[int]): Maximum pages to crawl
3266
- allow_backward_links (Optional[bool]): Follow parent directory links
3267
- allow_external_links (Optional[bool]): Follow external domain links
3268
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3269
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3270
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3271
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3272
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
3273
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
3274
- delay (Optional[int]): Delay in seconds between scrapes
3275
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
3276
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3277
- **kwargs: Additional parameters to pass to the API
3278
-
3279
- Returns:
3280
- CrawlStatusResponse with:
3281
- * Crawling status and progress
3282
- * Crawled page contents
3283
- * Success/error information
3284
-
3285
- Raises:
3286
- Exception: If crawl fails
3287
- """
3288
- # Validate any additional kwargs
3289
- self._validate_kwargs(kwargs, "crawl_url")
3290
-
3291
- crawl_params = {}
3292
-
3293
- # Add individual parameters
3294
- if include_paths is not None:
3295
- crawl_params['includePaths'] = include_paths
3296
- if exclude_paths is not None:
3297
- crawl_params['excludePaths'] = exclude_paths
3298
- if max_depth is not None:
3299
- crawl_params['maxDepth'] = max_depth
3300
- if max_discovery_depth is not None:
3301
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3302
- if limit is not None:
3303
- crawl_params['limit'] = limit
3304
- if allow_backward_links is not None:
3305
- crawl_params['allowBackwardLinks'] = allow_backward_links
3306
- if allow_external_links is not None:
3307
- crawl_params['allowExternalLinks'] = allow_external_links
3308
- if ignore_sitemap is not None:
3309
- crawl_params['ignoreSitemap'] = ignore_sitemap
3310
- if scrape_options is not None:
3311
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3312
- if webhook is not None:
3313
- crawl_params['webhook'] = webhook
3314
- if deduplicate_similar_urls is not None:
3315
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3316
- if ignore_query_parameters is not None:
3317
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3318
- if regex_on_full_url is not None:
3319
- crawl_params['regexOnFullURL'] = regex_on_full_url
3320
- if delay is not None:
3321
- crawl_params['delay'] = delay
3322
-
3323
- # Add any additional kwargs
3324
- crawl_params.update(kwargs)
3325
-
3326
- # Create final params object
3327
- final_params = CrawlParams(**crawl_params)
3328
- params_dict = final_params.dict(exclude_none=True)
3329
- params_dict['url'] = url
3330
- params_dict['origin'] = f"python-sdk@{version}"
3331
- # Make request
3332
- headers = self._prepare_headers(idempotency_key)
3333
- response = await self._async_post_request(
3334
- f'{self.api_url}/v1/crawl', params_dict, headers)
3335
-
3336
- if response.get('success') and 'id' in response:
- job_id = response['id']
- return await self._async_monitor_job_status(job_id, headers, poll_interval)
- else:
- # The request helper has already parsed the JSON body, so surface the API error directly.
- raise Exception(f"Failed to start crawl job. Error: {response.get('error', response)}")
3344
-
3345
-
3346
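A usage sketch for the blocking crawl_url coroutine, same assumptions as the earlier examples; the include_paths pattern is illustrative.

    import asyncio
    import os
    from firecrawl import AsyncFirecrawlApp  # assumed export name

    async def main():
        app = AsyncFirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
        status = await app.crawl_url(
            "https://example.com",
            include_paths=["/blog/.*"],  # hypothetical pattern
            limit=10,
            poll_interval=2,
        )
        for doc in status.data or []:
            print(doc)

    asyncio.run(main())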
- async def async_crawl_url(
3347
- self,
3348
- url: str,
3349
- *,
3350
- include_paths: Optional[List[str]] = None,
3351
- exclude_paths: Optional[List[str]] = None,
3352
- max_depth: Optional[int] = None,
3353
- max_discovery_depth: Optional[int] = None,
3354
- limit: Optional[int] = None,
3355
- allow_backward_links: Optional[bool] = None,
3356
- allow_external_links: Optional[bool] = None,
3357
- ignore_sitemap: Optional[bool] = None,
3358
- scrape_options: Optional[ScrapeOptions] = None,
3359
- webhook: Optional[Union[str, WebhookConfig]] = None,
3360
- deduplicate_similar_urls: Optional[bool] = None,
3361
- ignore_query_parameters: Optional[bool] = None,
3362
- regex_on_full_url: Optional[bool] = None,
3363
- delay: Optional[int] = None,
3364
- poll_interval: Optional[int] = 2,
3365
- idempotency_key: Optional[str] = None,
3366
- **kwargs
3367
- ) -> CrawlResponse:
3368
- """
3369
- Start an asynchronous crawl job.
3370
-
3371
- Args:
3372
- url (str): Target URL to start crawling from
3373
- include_paths (Optional[List[str]]): Patterns of URLs to include
3374
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3375
- max_depth (Optional[int]): Maximum crawl depth
3376
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3377
- limit (Optional[int]): Maximum pages to crawl
3378
- allow_backward_links (Optional[bool]): Follow parent directory links
3379
- allow_external_links (Optional[bool]): Follow external domain links
3380
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3381
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3382
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3383
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3384
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
3385
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
3386
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3387
- **kwargs: Additional parameters to pass to the API
3388
-
3389
- Returns:
3390
- CrawlResponse with:
3391
- * success - Whether crawl started successfully
3392
- * id - Unique identifier for the crawl job
3393
- * url - Status check URL for the crawl
3394
- * error - Error message if start failed
3395
-
3396
- Raises:
3397
- Exception: If crawl initiation fails
3398
- """
3399
- crawl_params = {}
3400
-
3401
- # Add individual parameters
3402
- if include_paths is not None:
3403
- crawl_params['includePaths'] = include_paths
3404
- if exclude_paths is not None:
3405
- crawl_params['excludePaths'] = exclude_paths
3406
- if max_depth is not None:
3407
- crawl_params['maxDepth'] = max_depth
3408
- if max_discovery_depth is not None:
3409
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3410
- if limit is not None:
3411
- crawl_params['limit'] = limit
3412
- if allow_backward_links is not None:
3413
- crawl_params['allowBackwardLinks'] = allow_backward_links
3414
- if allow_external_links is not None:
3415
- crawl_params['allowExternalLinks'] = allow_external_links
3416
- if ignore_sitemap is not None:
3417
- crawl_params['ignoreSitemap'] = ignore_sitemap
3418
- if scrape_options is not None:
3419
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3420
- if webhook is not None:
3421
- crawl_params['webhook'] = webhook
3422
- if deduplicate_similar_urls is not None:
3423
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3424
- if ignore_query_parameters is not None:
3425
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3426
- if regex_on_full_url is not None:
3427
- crawl_params['regexOnFullURL'] = regex_on_full_url
3428
- if delay is not None:
3429
- crawl_params['delay'] = delay
3430
-
3431
- # Add any additional kwargs
3432
- crawl_params.update(kwargs)
3433
-
3434
- # Create final params object
3435
- final_params = CrawlParams(**crawl_params)
3436
- params_dict = final_params.dict(exclude_none=True)
3437
- params_dict['url'] = url
3438
- params_dict['origin'] = f"python-sdk@{version}"
3439
-
3440
- # Make request
3441
- headers = self._prepare_headers(idempotency_key)
3442
- response = await self._async_post_request(
3443
- f'{self.api_url}/v1/crawl',
3444
- params_dict,
3445
- headers
3446
- )
3447
-
3448
- if response.get('success'):
- return CrawlResponse(**response)
- else:
- # The response is already a parsed dict; report the API error instead of calling
- # the aiohttp-based _handle_error helper on it.
- raise Exception(f"Failed to start crawl job. Error: {response.get('error', response)}")
3455
-
3456
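For long crawls it is often preferable to start the job and poll manually. A sketch pairing async_crawl_url with check_crawl_status, same assumptions as the earlier examples:

    import asyncio
    import os
    from firecrawl import AsyncFirecrawlApp  # assumed export name

    async def main():
        app = AsyncFirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
        job = await app.async_crawl_url("https://example.com", limit=5)
        # The crawl runs server-side; query its progress at any time.
        status = await app.check_crawl_status(job.id)
        print(status.status, status.completed, status.total)

    asyncio.run(main())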
- async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
3457
- """
3458
- Check the status and results of an asynchronous crawl job.
3459
-
3460
- Args:
3461
- id (str): Unique identifier for the crawl job
3462
-
3463
- Returns:
3464
- CrawlStatusResponse containing:
3465
- Status Information:
3466
- * status - Current state (scraping/completed/failed/cancelled)
3467
- * completed - Number of pages crawled
3468
- * total - Total pages to crawl
3469
- * creditsUsed - API credits consumed
3470
- * expiresAt - Data expiration timestamp
3471
-
3472
- Results:
3473
- * data - List of crawled documents
3474
- * next - URL for next page of results (if paginated)
3475
- * success - Whether status check succeeded
3476
- * error - Error message if failed
3477
-
3478
- Raises:
3479
- Exception: If status check fails
3480
- """
3481
- headers = self._prepare_headers()
3482
- endpoint = f'/v1/crawl/{id}'
3483
-
3484
- status_data = await self._async_get_request(
3485
- f'{self.api_url}{endpoint}',
3486
- headers
3487
- )
3488
-
3489
- if status_data.get('status') == 'completed':
3490
- if 'data' in status_data:
3491
- data = status_data['data']
3492
- while 'next' in status_data:
3493
- if len(status_data['data']) == 0:
3494
- break
3495
- next_url = status_data.get('next')
3496
- if not next_url:
3497
- logger.warning("Expected 'next' URL is missing.")
3498
- break
3499
- next_data = await self._async_get_request(next_url, headers)
3500
- data.extend(next_data.get('data', []))
3501
- status_data = next_data
3502
- status_data['data'] = data
3503
- # Create CrawlStatusResponse object from status data
3504
- response = CrawlStatusResponse(
3505
- status=status_data.get('status'),
3506
- total=status_data.get('total'),
3507
- completed=status_data.get('completed'),
3508
- creditsUsed=status_data.get('creditsUsed'),
3509
- expiresAt=status_data.get('expiresAt'),
3510
- data=status_data.get('data'),
3511
- success=False if 'error' in status_data else True
3512
- )
3513
-
3514
- if 'error' in status_data:
3515
- response.error = status_data.get('error')
3516
-
3517
- if 'next' in status_data:
3518
- response.next = status_data.get('next')
3519
-
3520
- return response
3521
-
3522
- async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
3523
- """
3524
- Monitor the status of an asynchronous job until completion.
3525
-
3526
- Args:
3527
- id (str): The ID of the job to monitor
3528
- headers (Dict[str, str]): Headers to include in status check requests
3529
- poll_interval (int): Seconds between status checks (default: 2)
3530
-
3531
- Returns:
3532
- CrawlStatusResponse: The job results if completed successfully
3533
-
3534
- Raises:
3535
- Exception: If the job fails or an error occurs during status checks
3536
- """
3537
- while True:
3538
- status_data = await self._async_get_request(
3539
- f'{self.api_url}/v1/crawl/{id}',
3540
- headers
3541
- )
3542
-
3543
- if status_data.get('status') == 'completed':
3544
- if 'data' in status_data:
3545
- data = status_data['data']
3546
- while 'next' in status_data:
3547
- if len(status_data['data']) == 0:
3548
- break
3549
- next_url = status_data.get('next')
3550
- if not next_url:
3551
- logger.warning("Expected 'next' URL is missing.")
3552
- break
3553
- next_data = await self._async_get_request(next_url, headers)
3554
- data.extend(next_data.get('data', []))
3555
- status_data = next_data
3556
- status_data['data'] = data
3557
- return CrawlStatusResponse(**status_data)
3558
- else:
3559
- raise Exception('Job completed but no data was returned')
3560
- elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
3561
- await asyncio.sleep(max(poll_interval, 2))
3562
- else:
3563
- raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
3564
-
3565
- async def map_url(
3566
- self,
3567
- url: str,
3568
- *,
3569
- search: Optional[str] = None,
3570
- ignore_sitemap: Optional[bool] = None,
3571
- include_subdomains: Optional[bool] = None,
3572
- sitemap_only: Optional[bool] = None,
3573
- limit: Optional[int] = None,
3574
- timeout: Optional[int] = None,
3575
- params: Optional[MapParams] = None) -> MapResponse:
3576
- """
3577
- Asynchronously map and discover links from a URL.
3578
-
3579
- Args:
3580
- url (str): Target URL to map
3581
- params (Optional[MapParams]): See MapParams model:
3582
- Discovery Options:
3583
- * search - Filter pattern for URLs
3584
- * ignoreSitemap - Skip sitemap.xml
3585
- * includeSubdomains - Include subdomain links
3586
- * sitemapOnly - Only use sitemap.xml
3587
-
3588
- Limits:
3589
- * limit - Max URLs to return
3590
- * timeout - Request timeout (ms)
3591
-
3592
- Returns:
3593
- MapResponse with:
3594
- * Discovered URLs
3595
- * Success/error status
3596
-
3597
- Raises:
3598
- Exception: If mapping fails
3599
- """
3600
- map_params = {}
3601
- if params:
3602
- map_params.update(params.dict(exclude_none=True))
3603
-
3604
- # Add individual parameters
3605
- if search is not None:
3606
- map_params['search'] = search
3607
- if ignore_sitemap is not None:
3608
- map_params['ignoreSitemap'] = ignore_sitemap
3609
- if include_subdomains is not None:
3610
- map_params['includeSubdomains'] = include_subdomains
3611
- if sitemap_only is not None:
3612
- map_params['sitemapOnly'] = sitemap_only
3613
- if limit is not None:
3614
- map_params['limit'] = limit
3615
- if timeout is not None:
3616
- map_params['timeout'] = timeout
3617
-
3618
- # Create final params object
3619
- final_params = MapParams(**map_params)
3620
- params_dict = final_params.dict(exclude_none=True)
3621
- params_dict['url'] = url
3622
- params_dict['origin'] = f"python-sdk@{version}"
3623
-
3624
- # Make request
3625
- endpoint = f'/v1/map'
3626
- response = await self._async_post_request(
3627
- f'{self.api_url}{endpoint}',
3628
- params_dict,
3629
- headers={"Authorization": f"Bearer {self.api_key}"}
3630
- )
3631
-
3632
- if response.get('success') and 'links' in response:
3633
- return MapResponse(**response)
3634
- elif 'error' in response:
3635
- raise Exception(f'Failed to map URL. Error: {response["error"]}')
3636
- else:
3637
- raise Exception(f'Failed to map URL. Error: {response}')
3638
-
3639
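A short sketch of map_url, which only discovers links rather than scraping content, same assumptions as the earlier examples:

    import asyncio
    import os
    from firecrawl import AsyncFirecrawlApp  # assumed export name

    async def main():
        app = AsyncFirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
        result = await app.map_url("https://example.com", search="docs", limit=100)
        print(result.links)

    asyncio.run(main())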
- async def extract(
3640
- self,
3641
- urls: Optional[List[str]] = None,
3642
- *,
3643
- prompt: Optional[str] = None,
3644
- schema: Optional[Any] = None,
3645
- system_prompt: Optional[str] = None,
3646
- allow_external_links: Optional[bool] = False,
3647
- enable_web_search: Optional[bool] = False,
3648
- show_sources: Optional[bool] = False,
3649
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3650
-
3651
- """
3652
- Asynchronously extract structured information from URLs.
3653
-
3654
- Args:
3655
- urls (Optional[List[str]]): URLs to extract from
3656
- prompt (Optional[str]): Custom extraction prompt
3657
- schema (Optional[Any]): JSON schema/Pydantic model
3658
- system_prompt (Optional[str]): System context
3659
- allow_external_links (Optional[bool]): Follow external links
3660
- enable_web_search (Optional[bool]): Enable web search
3661
- show_sources (Optional[bool]): Include source URLs
3662
- agent (Optional[Dict[str, Any]]): Agent configuration
3663
-
3664
- Returns:
3665
- ExtractResponse with:
3666
- * Structured data matching schema
3667
- * Source information if requested
3668
- * Success/error status
3669
-
3670
- Raises:
3671
- ValueError: If prompt/schema missing or extraction fails
3672
- """
3673
- headers = self._prepare_headers()
3674
-
3675
- if not prompt and not schema:
3676
- raise ValueError("Either prompt or schema is required")
3677
-
3678
- if not urls and not prompt:
3679
- raise ValueError("Either urls or prompt is required")
3680
-
3681
- if schema:
3682
- schema = self._ensure_schema_dict(schema)
3683
-
3684
- request_data = {
3685
- 'urls': urls or [],
3686
- 'allowExternalLinks': allow_external_links,
3687
- 'enableWebSearch': enable_web_search,
3688
- 'showSources': show_sources,
3689
- 'schema': schema,
3690
- 'origin': f'python-sdk@{get_version()}'
3691
- }
3692
-
3693
- # Only add prompt and systemPrompt if they exist
3694
- if prompt:
3695
- request_data['prompt'] = prompt
3696
- if system_prompt:
3697
- request_data['systemPrompt'] = system_prompt
3698
-
3699
- if agent:
3700
- request_data['agent'] = agent
3701
-
3702
- response = await self._async_post_request(
3703
- f'{self.api_url}/v1/extract',
3704
- request_data,
3705
- headers
3706
- )
3707
-
3708
- if response.get('success'):
3709
- job_id = response.get('id')
3710
- if not job_id:
3711
- raise Exception('Job ID not returned from extract request.')
3712
-
3713
- while True:
3714
- status_data = await self._async_get_request(
3715
- f'{self.api_url}/v1/extract/{job_id}',
3716
- headers
3717
- )
3718
-
3719
- if status_data['status'] == 'completed':
3720
- return ExtractResponse(**status_data)
3721
- elif status_data['status'] in ['failed', 'cancelled']:
3722
- raise Exception(f'Extract job {status_data["status"]}. Error: {status_data.get("error")}')
3723
-
3724
- await asyncio.sleep(2)
3725
- else:
3726
- raise Exception(f'Failed to extract. Error: {response.get("error")}')
3727
-
3728
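Because _ensure_schema_dict accepts a Pydantic model and converts it to JSON schema, extract can be driven by a model class. A hedged sketch, same assumptions as the earlier examples; the PageSummary model is hypothetical:

    import asyncio
    import os
    from pydantic import BaseModel
    from firecrawl import AsyncFirecrawlApp  # assumed export name

    class PageSummary(BaseModel):
        title: str
        summary: str

    async def main():
        app = AsyncFirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
        result = await app.extract(
            ["https://example.com"],
            prompt="Summarise the page",
            schema=PageSummary,
        )
        print(result.data)

    asyncio.run(main())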
- async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
3729
- """
3730
- Check the status of an asynchronous batch scrape job.
3731
-
3732
- Args:
3733
- id (str): The ID of the batch scrape job
3734
-
3735
- Returns:
3736
- BatchScrapeStatusResponse containing:
3737
- Status Information:
3738
- * status - Current state (scraping/completed/failed/cancelled)
3739
- * completed - Number of URLs scraped
3740
- * total - Total URLs to scrape
3741
- * creditsUsed - API credits consumed
3742
- * expiresAt - Data expiration timestamp
3743
-
3744
- Results:
3745
- * data - List of scraped documents
3746
- * next - URL for next page of results (if paginated)
3747
- * success - Whether status check succeeded
3748
- * error - Error message if failed
3749
-
3750
- Raises:
3751
- Exception: If status check fails
3752
- """
3753
- headers = self._prepare_headers()
3754
- endpoint = f'/v1/batch/scrape/{id}'
3755
-
3756
- status_data = await self._async_get_request(
3757
- f'{self.api_url}{endpoint}',
3758
- headers
3759
- )
3760
-
3761
- if status_data['status'] == 'completed':
3762
- if 'data' in status_data:
3763
- data = status_data['data']
3764
- while 'next' in status_data:
3765
- if len(status_data['data']) == 0:
3766
- break
3767
- next_url = status_data.get('next')
3768
- if not next_url:
3769
- logger.warning("Expected 'next' URL is missing.")
3770
- break
3771
- next_data = await self._async_get_request(next_url, headers)
3772
- data.extend(next_data.get('data', []))
3773
- status_data = next_data
3774
- status_data['data'] = data
3775
-
3776
- response = BatchScrapeStatusResponse(
- status=status_data.get('status'),
- total=status_data.get('total'),
- completed=status_data.get('completed'),
- creditsUsed=status_data.get('creditsUsed'),
- expiresAt=status_data.get('expiresAt'),
- data=status_data.get('data'),
- success='error' not in status_data
- )
-
- # Pydantic models do not support item assignment; set attributes instead, as check_crawl_status does.
- if 'error' in status_data:
- response.error = status_data.get('error')
-
- if 'next' in status_data:
- response.next = status_data.get('next')
-
- return response
3795
-
3796
- async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
3797
- """
3798
- Get information about errors from an asynchronous batch scrape job.
3799
-
3800
- Args:
3801
- id (str): The ID of the batch scrape job
3802
-
3803
- Returns:
3804
- CrawlErrorsResponse containing:
3805
- errors (List[Dict[str, str]]): List of errors with fields:
3806
- * id (str): Error ID
3807
- * timestamp (str): When the error occurred
3808
- * url (str): URL that caused the error
3809
- * error (str): Error message
3810
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3811
-
3812
- Raises:
3813
- Exception: If error check fails
3814
- """
3815
- headers = self._prepare_headers()
3816
- return await self._async_get_request(
3817
- f'{self.api_url}/v1/batch/scrape/{id}/errors',
3818
- headers
3819
- )
3820
-
3821
- async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
3822
- """
3823
- Get information about errors from an asynchronous crawl job.
3824
-
3825
- Args:
3826
- id (str): The ID of the crawl job
3827
-
3828
- Returns:
3829
- CrawlErrorsResponse containing:
3830
- * errors (List[Dict[str, str]]): List of errors with fields:
3831
- - id (str): Error ID
3832
- - timestamp (str): When the error occurred
3833
- - url (str): URL that caused the error
3834
- - error (str): Error message
3835
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3836
-
3837
- Raises:
3838
- Exception: If error check fails
3839
- """
3840
- headers = self._prepare_headers()
3841
- return await self._async_get_request(
3842
- f'{self.api_url}/v1/crawl/{id}/errors',
3843
- headers
3844
- )
3845
-
3846
- async def cancel_crawl(self, id: str) -> Dict[str, Any]:
3847
- """
3848
- Cancel an asynchronous crawl job.
3849
-
3850
- Args:
3851
- id (str): The ID of the crawl job to cancel
3852
-
3853
- Returns:
3854
- Dict[str, Any] containing:
3855
- * success (bool): Whether cancellation was successful
3856
- * error (str, optional): Error message if cancellation failed
3857
-
3858
- Raises:
3859
- Exception: If cancellation fails
3860
- """
3861
- headers = self._prepare_headers()
3862
- async with aiohttp.ClientSession() as session:
3863
- async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
3864
- return await response.json()
3865
-
3866
- async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
3867
- """
3868
- Check the status of an asynchronous extraction job.
3869
-
3870
- Args:
3871
- job_id (str): The ID of the extraction job
3872
-
3873
- Returns:
3874
- ExtractResponse[Any] with:
3875
- * success (bool): Whether request succeeded
3876
- * data (Optional[Any]): Extracted data matching schema
3877
- * error (Optional[str]): Error message if any
3878
- * warning (Optional[str]): Warning message if any
3879
- * sources (Optional[List[str]]): Source URLs if requested
3880
-
3881
- Raises:
3882
- ValueError: If status check fails
3883
- """
3884
- headers = self._prepare_headers()
3885
- try:
3886
- return await self._async_get_request(
3887
- f'{self.api_url}/v1/extract/{job_id}',
3888
- headers
3889
- )
3890
- except Exception as e:
3891
- raise ValueError(str(e))
3892
-
3893
- async def async_extract(
3894
- self,
3895
- urls: Optional[List[str]] = None,
3896
- *,
3897
- prompt: Optional[str] = None,
3898
- schema: Optional[Any] = None,
3899
- system_prompt: Optional[str] = None,
3900
- allow_external_links: Optional[bool] = False,
3901
- enable_web_search: Optional[bool] = False,
3902
- show_sources: Optional[bool] = False,
3903
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3904
- """
3905
- Initiate an asynchronous extraction job without waiting for completion.
3906
-
3907
- Args:
3908
- urls (Optional[List[str]]): URLs to extract from
3909
- prompt (Optional[str]): Custom extraction prompt
3910
- schema (Optional[Any]): JSON schema/Pydantic model
3911
- system_prompt (Optional[str]): System context
3912
- allow_external_links (Optional[bool]): Follow external links
3913
- enable_web_search (Optional[bool]): Enable web search
3914
- show_sources (Optional[bool]): Include source URLs
3915
- agent (Optional[Dict[str, Any]]): Agent configuration
3916
3917
-
3918
- Returns:
3919
- ExtractResponse[Any] with:
3920
- * success (bool): Whether request succeeded
3921
- * data (Optional[Any]): Extracted data matching schema
3922
- * error (Optional[str]): Error message if any
3923
-
3924
- Raises:
3925
- ValueError: If job initiation fails
3926
- """
3927
- headers = self._prepare_headers()
3928
-
3929
- if not prompt and not schema:
3930
- raise ValueError("Either prompt or schema is required")
3931
-
3932
- if not urls and not prompt:
3933
- raise ValueError("Either urls or prompt is required")
3934
-
3935
- if schema:
3936
- schema = self._ensure_schema_dict(schema)
3937
-
3938
- # Build the request payload as a plain dict (mirroring extract() above);
- # ExtractResponse is the response model, not a request container.
- request_data = {
- 'urls': urls or [],
- 'allowExternalLinks': allow_external_links,
- 'enableWebSearch': enable_web_search,
- 'showSources': show_sources,
- 'schema': schema,
- 'origin': f'python-sdk@{version}'
- }
3946
-
3947
- if prompt:
3948
- request_data['prompt'] = prompt
3949
- if system_prompt:
3950
- request_data['systemPrompt'] = system_prompt
3951
- if agent:
3952
- request_data['agent'] = agent
3953
-
3954
- try:
3955
- return await self._async_post_request(
3956
- f'{self.api_url}/v1/extract',
3957
- request_data,
3958
- headers
3959
- )
3960
- except Exception as e:
3961
- raise ValueError(str(e))
3962
-
3963
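A sketch of the non-blocking path, pairing async_extract with get_extract_status. Note that, as written above, both return the parsed JSON payload rather than a validated model, so the example treats them as dicts; same assumptions as the earlier examples otherwise.

    import asyncio
    import os
    from firecrawl import AsyncFirecrawlApp  # assumed export name

    async def main():
        app = AsyncFirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
        job = await app.async_extract(["https://example.com"], prompt="List the page headings")
        # async_extract returns immediately; poll the job until it reaches a terminal state.
        while True:
            status = await app.get_extract_status(job.get("id"))
            if status.get("status") in ("completed", "failed", "cancelled"):
                break
            await asyncio.sleep(2)
        print(status)

    asyncio.run(main())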
- async def generate_llms_text(
3964
- self,
3965
- url: str,
3966
- *,
3967
- max_urls: Optional[int] = None,
3968
- show_full_text: Optional[bool] = None,
3969
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
3970
- """
3971
- Generate LLMs.txt for a given URL and monitor until completion.
3972
-
3973
- Args:
3974
- url (str): Target URL to generate LLMs.txt from
3975
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
3976
- show_full_text (Optional[bool]): Include full text in output (default: False)
3977
- experimental_stream (Optional[bool]): Enable experimental streaming
3978
-
3979
- Returns:
3980
- GenerateLLMsTextStatusResponse containing:
3981
- * success (bool): Whether generation completed successfully
3982
- * status (str): Status of generation (processing/completed/failed)
3983
- * data (Dict[str, str], optional): Generated text with fields:
3984
- - llmstxt (str): Generated LLMs.txt content
3985
- - llmsfulltxt (str, optional): Full version if requested
3986
- * error (str, optional): Error message if generation failed
3987
- * expiresAt (str): When the generated data expires
3988
-
3989
- Raises:
3990
- Exception: If generation fails
3991
- """
3992
- params = {}
3993
- if max_urls is not None:
3994
- params['maxUrls'] = max_urls
3995
- if show_full_text is not None:
3996
- params['showFullText'] = show_full_text
3997
- if experimental_stream is not None:
3998
- params['__experimental_stream'] = experimental_stream
3999
-
4000
- response = await self.async_generate_llms_text(
4001
- url,
4002
- max_urls=max_urls,
4003
- show_full_text=show_full_text,
4004
- experimental_stream=experimental_stream
4005
- )
4006
- if not response.get('success') or 'id' not in response:
4007
- return response
4008
-
4009
- job_id = response['id']
4010
- while True:
4011
- status = await self.check_generate_llms_text_status(job_id)
4012
-
4013
- if status['status'] == 'completed':
4014
- return status
4015
- elif status['status'] == 'failed':
4016
- raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
4017
- elif status['status'] != 'processing':
4018
- break
4019
-
4020
- await asyncio.sleep(2)
4021
-
4022
- return GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly')
4023
-
4024
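A sketch of generate_llms_text, which polls until the generation job completes. In the completed path the method returns the parsed status payload, so the example reads it as a dict; same assumptions as the earlier examples.

    import asyncio
    import os
    from firecrawl import AsyncFirecrawlApp  # assumed export name

    async def main():
        app = AsyncFirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
        result = await app.generate_llms_text("https://example.com", max_urls=5, show_full_text=False)
        print(result.get("data", {}).get("llmstxt"))

    asyncio.run(main())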
- async def async_generate_llms_text(
4025
- self,
4026
- url: str,
4027
- *,
4028
- max_urls: Optional[int] = None,
4029
- show_full_text: Optional[bool] = None,
4030
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
4031
- """
4032
- Initiate an asynchronous LLMs.txt generation job without waiting for completion.
4033
-
4034
- Args:
4035
- url (str): Target URL to generate LLMs.txt from
4036
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
4037
- show_full_text (Optional[bool]): Include full text in output (default: False)
4038
- experimental_stream (Optional[bool]): Enable experimental streaming
4039
-
4040
- Returns:
4041
- GenerateLLMsTextResponse containing:
4042
- * success (bool): Whether job started successfully
4043
- * id (str): Unique identifier for the job
4044
- * error (str, optional): Error message if start failed
4045
-
4046
- Raises:
4047
- ValueError: If job initiation fails
4048
- """
4049
- params = {}
4050
- if max_urls is not None:
4051
- params['maxUrls'] = max_urls
4052
- if show_full_text is not None:
4053
- params['showFullText'] = show_full_text
4054
- if experimental_stream is not None:
4055
- params['__experimental_stream'] = experimental_stream
4056
-
4057
- params = GenerateLLMsTextParams(
4058
- maxUrls=max_urls,
4059
- showFullText=show_full_text,
4060
- __experimental_stream=experimental_stream
4061
- )
4062
-
4063
- headers = self._prepare_headers()
4064
- json_data = {'url': url, **params.dict(exclude_none=True)}
4065
- json_data['origin'] = f"python-sdk@{version}"
4066
-
4067
- try:
4068
- return await self._async_post_request(
4069
- f'{self.api_url}/v1/llmstxt',
4070
- json_data,
4071
- headers
4072
- )
4073
- except Exception as e:
4074
- raise ValueError(str(e))
4075
-
4076
- async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
4077
- """
4078
- Check the status of an asynchronous LLMs.txt generation job.
4079
-
4080
- Args:
4081
- id (str): The ID of the generation job
4082
-
4083
- Returns:
4084
- GenerateLLMsTextStatusResponse containing:
4085
- * success (bool): Whether generation completed successfully
4086
- * status (str): Status of generation (processing/completed/failed)
4087
- * data (Dict[str, str], optional): Generated text with fields:
4088
- - llmstxt (str): Generated LLMs.txt content
4089
- - llmsfulltxt (str, optional): Full version if requested
4090
- * error (str, optional): Error message if generation failed
4091
- * expiresAt (str): When the generated data expires
4092
-
4093
- Raises:
4094
- ValueError: If status check fails
4095
- """
4096
- headers = self._prepare_headers()
4097
- try:
4098
- return await self._async_get_request(
4099
- f'{self.api_url}/v1/llmstxt/{id}',
4100
- headers
4101
- )
4102
- except Exception as e:
4103
- raise ValueError(str(e))
4104
-
- async def deep_research(
- self,
- query: str,
- *,
- max_depth: Optional[int] = None,
- time_limit: Optional[int] = None,
- max_urls: Optional[int] = None,
- analysis_prompt: Optional[str] = None,
- system_prompt: Optional[str] = None,
- __experimental_stream_steps: Optional[bool] = None,
- on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
- on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
- """
- Initiates a deep research operation on a given query and polls until completion.
-
- Args:
- query (str): Research query or topic to investigate
- max_depth (Optional[int]): Maximum depth of research exploration
- time_limit (Optional[int]): Time limit in seconds for research
- max_urls (Optional[int]): Maximum number of URLs to process
- analysis_prompt (Optional[str]): Custom prompt for analysis
- system_prompt (Optional[str]): Custom system prompt
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
- on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
- on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
-
- Returns:
- DeepResearchStatusResponse containing:
- * success (bool): Whether research completed successfully
- * status (str): Current state (processing/completed/failed)
- * error (Optional[str]): Error message if failed
- * id (str): Unique identifier for the research job
- * data (Any): Research findings and analysis
- * sources (List[Dict]): List of discovered sources
- * activities (List[Dict]): Research progress log
- * summaries (List[str]): Generated research summaries
-
- Raises:
- Exception: If research fails
- """
- research_params = {}
- if max_depth is not None:
- research_params['maxDepth'] = max_depth
- if time_limit is not None:
- research_params['timeLimit'] = time_limit
- if max_urls is not None:
- research_params['maxUrls'] = max_urls
- if analysis_prompt is not None:
- research_params['analysisPrompt'] = analysis_prompt
- if system_prompt is not None:
- research_params['systemPrompt'] = system_prompt
- if __experimental_stream_steps is not None:
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
- research_params = DeepResearchParams(**research_params)
-
- response = await self.async_deep_research(
- query,
- max_depth=max_depth,
- time_limit=time_limit,
- max_urls=max_urls,
- analysis_prompt=analysis_prompt,
- system_prompt=system_prompt
- )
- if not response.get('success') or 'id' not in response:
- return response
-
- job_id = response['id']
- last_activity_count = 0
- last_source_count = 0
-
- while True:
- status = await self.check_deep_research_status(job_id)
-
- if on_activity and 'activities' in status:
- new_activities = status['activities'][last_activity_count:]
- for activity in new_activities:
- on_activity(activity)
- last_activity_count = len(status['activities'])
-
- if on_source and 'sources' in status:
- new_sources = status['sources'][last_source_count:]
- for source in new_sources:
- on_source(source)
- last_source_count = len(status['sources'])
-
- if status['status'] == 'completed':
- return status
- elif status['status'] == 'failed':
- raise Exception(f'Deep research failed. Error: {status.get("error")}')
- elif status['status'] != 'processing':
- break
-
- await asyncio.sleep(2)
-
- return DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')
-
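A usage sketch for the polling wrapper above, with both optional callbacks wired in. The api_key construction and the dictionary-style access to the final result are assumptions; the keyword arguments and the callback payload shapes come straight from the signature and docstring.

import asyncio
from firecrawl import AsyncFirecrawlApp  # export name assumed

def log_activity(activity: dict) -> None:
    # Activities carry type/status/message/timestamp/depth per the docstring.
    print(f"[{activity.get('type')}] {activity.get('message')}")

def log_source(source: dict) -> None:
    print("found source:", source.get("url"))

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # constructor keyword assumed
    result = await app.deep_research(
        "How are solid-state batteries being commercialised?",
        max_depth=3,
        time_limit=180,
        max_urls=15,
        on_activity=log_activity,
        on_source=log_source,
    )
    print(result.get("status"), "-", len(result.get("sources", [])), "sources")

asyncio.run(main())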
- async def async_deep_research(
- self,
- query: str,
- *,
- max_depth: Optional[int] = None,
- time_limit: Optional[int] = None,
- max_urls: Optional[int] = None,
- analysis_prompt: Optional[str] = None,
- system_prompt: Optional[str] = None,
- __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
- """
- Initiates an asynchronous deep research operation.
-
- Args:
- query (str): Research query or topic to investigate
- max_depth (Optional[int]): Maximum depth of research exploration
- time_limit (Optional[int]): Time limit in seconds for research
- max_urls (Optional[int]): Maximum number of URLs to process
- analysis_prompt (Optional[str]): Custom prompt for analysis
- system_prompt (Optional[str]): Custom system prompt
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
-
- Returns:
- Dict[str, Any]: A response containing:
- * success (bool): Whether the research initiation was successful
- * id (str): The unique identifier for the research job
- * error (str, optional): Error message if initiation failed
-
- Raises:
- Exception: If the research initiation fails.
- """
- research_params = {}
- if max_depth is not None:
- research_params['maxDepth'] = max_depth
- if time_limit is not None:
- research_params['timeLimit'] = time_limit
- if max_urls is not None:
- research_params['maxUrls'] = max_urls
- if analysis_prompt is not None:
- research_params['analysisPrompt'] = analysis_prompt
- if system_prompt is not None:
- research_params['systemPrompt'] = system_prompt
- if __experimental_stream_steps is not None:
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
- research_params = DeepResearchParams(**research_params)
-
- headers = self._prepare_headers()
-
- json_data = {'query': query, **research_params.dict(exclude_none=True)}
- json_data['origin'] = f"python-sdk@{version}"
-
- try:
- return await self._async_post_request(
- f'{self.api_url}/v1/deep-research',
- json_data,
- headers
- )
- except Exception as e:
- raise ValueError(str(e))
-
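The non-blocking variant only starts the job and returns its id; a short sketch under the same construction assumptions:

import asyncio
from firecrawl import AsyncFirecrawlApp  # export name assumed

async def start_research(query: str) -> str:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # constructor keyword assumed
    started = await app.async_deep_research(query, max_urls=10)
    if not started.get("success"):
        raise RuntimeError(started.get("error", "failed to start deep research"))
    return started["id"]  # keep the id for later status checks

# job_id = asyncio.run(start_research("Impact of the EU AI Act on open-source models"))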
- async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
- """
- Check the status of a deep research operation.
-
- Args:
- id (str): The ID of the deep research operation.
-
- Returns:
- DeepResearchStatusResponse containing:
-
- Status:
- * success - Whether research completed successfully
- * status - Current state (processing/completed/failed)
- * error - Error message if failed
-
- Results:
- * id - Unique identifier for the research job
- * data - Research findings and analysis
- * sources - List of discovered sources
- * activities - Research progress log
- * summaries - Generated research summaries
-
- Raises:
- Exception: If the status check fails.
- """
- headers = self._prepare_headers()
- try:
- return await self._async_get_request(
- f'{self.api_url}/v1/deep-research/{id}',
- headers
- )
- except Exception as e:
- raise ValueError(str(e))
-
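The status endpoint can also be polled by hand, mirroring the loop inside deep_research; the dictionary-shaped responses and the constructor keyword are assumptions as before.

import asyncio
from firecrawl import AsyncFirecrawlApp  # export name assumed

async def poll_research(job_id: str) -> dict:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # constructor keyword assumed
    while True:
        status = await app.check_deep_research_status(job_id)
        if status.get("status") == "completed":
            return status
        if status.get("status") == "failed":
            raise RuntimeError(status.get("error", "deep research failed"))
        await asyncio.sleep(2)  # same 2-second cadence as deep_research above

# final = asyncio.run(poll_research("job-id-from-async_deep_research"))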
- async def search(
- self,
- query: str,
- *,
- limit: Optional[int] = None,
- tbs: Optional[str] = None,
- filter: Optional[str] = None,
- lang: Optional[str] = None,
- country: Optional[str] = None,
- location: Optional[str] = None,
- timeout: Optional[int] = None,
- scrape_options: Optional[ScrapeOptions] = None,
- params: Optional[Union[Dict[str, Any], SearchParams]] = None,
- **kwargs) -> SearchResponse:
- """
- Asynchronously search for content using Firecrawl.
-
- Args:
- query (str): Search query string
- limit (Optional[int]): Max results (default: 5)
- tbs (Optional[str]): Time filter (e.g. "qdr:d")
- filter (Optional[str]): Custom result filter
- lang (Optional[str]): Language code (default: "en")
- country (Optional[str]): Country code (default: "us")
- location (Optional[str]): Geo-targeting
- timeout (Optional[int]): Request timeout in milliseconds
- scrape_options (Optional[ScrapeOptions]): Result scraping configuration
- params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
- **kwargs: Additional keyword arguments for future compatibility
-
- Returns:
- SearchResponse: Response containing:
- * success (bool): Whether request succeeded
- * data (List[FirecrawlDocument]): Search results
- * warning (Optional[str]): Warning message if any
- * error (Optional[str]): Error message if any
-
- Raises:
- Exception: If search fails or response cannot be parsed
- """
- # Build search parameters
- search_params = {}
- if params:
- if isinstance(params, dict):
- search_params.update(params)
- else:
- search_params.update(params.dict(exclude_none=True))
-
- # Add individual parameters
- if limit is not None:
- search_params['limit'] = limit
- if tbs is not None:
- search_params['tbs'] = tbs
- if filter is not None:
- search_params['filter'] = filter
- if lang is not None:
- search_params['lang'] = lang
- if country is not None:
- search_params['country'] = country
- if location is not None:
- search_params['location'] = location
- if timeout is not None:
- search_params['timeout'] = timeout
- if scrape_options is not None:
- search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
-
- # Add any additional kwargs
- search_params.update(kwargs)
-
- # Create final params object
- final_params = SearchParams(query=query, **search_params)
- params_dict = final_params.dict(exclude_none=True)
- params_dict['origin'] = f"python-sdk@{version}"
-
- return await self._async_post_request(
- f"{self.api_url}/v1/search",
- params_dict,
- {"Authorization": f"Bearer {self.api_key}"}
- )
-
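A hedged example for the async search call above. The keyword arguments map directly onto the request body shown; the constructor keyword and the field names read from each result are assumptions.

import asyncio
from firecrawl import AsyncFirecrawlApp  # export name assumed

async def run_search() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # constructor keyword assumed
    results = await app.search(
        "firecrawl python sdk changelog",
        limit=5,
        tbs="qdr:w",   # restrict results to the past week
        lang="en",
        country="us",
    )
    for doc in results.get("data", []):
        print(doc.get("url"), "-", doc.get("title"))  # field names assumed

asyncio.run(run_search())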
- class AsyncCrawlWatcher(CrawlWatcher):
- """
- Async version of CrawlWatcher that properly handles async operations.
- """
- def __init__(self, id: str, app: AsyncFirecrawlApp):
- super().__init__(id, app)
-
- async def connect(self) -> None:
- """
- Establishes async WebSocket connection and starts listening for messages.
- """
- async with websockets.connect(
- self.ws_url,
- additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
- ) as websocket:
- await self._listen(websocket)
-
- async def _listen(self, websocket) -> None:
- """
- Listens for incoming WebSocket messages and handles them asynchronously.
-
- Args:
- websocket: The WebSocket connection object
- """
- async for message in websocket:
- msg = json.loads(message)
- await self._handle_message(msg)
-
- async def _handle_message(self, msg: Dict[str, Any]) -> None:
- """
- Handles incoming WebSocket messages based on their type asynchronously.
-
- Args:
- msg (Dict[str, Any]): The message to handle
- """
- if msg['type'] == 'done':
- self.status = 'completed'
- self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
- elif msg['type'] == 'error':
- self.status = 'failed'
- self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
- elif msg['type'] == 'catchup':
- self.status = msg['data']['status']
- self.data.extend(msg['data'].get('data', []))
- for doc in self.data:
- self.dispatch_event('document', {'data': doc, 'id': self.id})
- elif msg['type'] == 'document':
- self.data.append(msg['data'])
- self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
-
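A sketch of driving AsyncCrawlWatcher directly. It assumes a crawl job id is already available (for example from an asynchronous crawl start call), that the inherited CrawlWatcher base class exposes an add_event_listener(event, handler) registration method, and that handlers receive the detail dictionaries dispatched in _handle_message above; none of those pieces appear in this excerpt, so treat them as assumptions.

import asyncio
from firecrawl import AsyncFirecrawlApp  # export name assumed
from firecrawl.firecrawl import AsyncCrawlWatcher  # module path assumed

async def watch_crawl(crawl_id: str) -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # constructor keyword assumed
    watcher = AsyncCrawlWatcher(crawl_id, app)

    # add_event_listener and the handler signature are assumed from the CrawlWatcher base class.
    watcher.add_event_listener("document", lambda detail: print("page:", detail["data"]))
    watcher.add_event_listener("done", lambda detail: print("crawl finished:", detail["status"]))
    watcher.add_event_listener("error", lambda detail: print("crawl error:", detail["error"]))

    await watcher.connect()  # runs until the WebSocket closes

# asyncio.run(watch_crawl("existing-crawl-id"))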
- async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
- """
- Handle errors from async API responses.
- """
- try:
- error_data = await response.json()
- error_message = error_data.get('error', 'No error message provided.')
- error_details = error_data.get('details', 'No additional error details provided.')
- except:
- raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
-
- # Use the app's method to get the error message
- message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
-
- raise aiohttp.ClientError(message)
-
- async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
- """
- Generate a standardized error message based on HTTP status code for async operations.
-
- Args:
- status_code (int): The HTTP status code from the response
- action (str): Description of the action that was being performed
- error_message (str): The error message from the API response
- error_details (str): Additional error details from the API response
-
- Returns:
- str: A formatted error message
- """
- return self._get_error_message(status_code, action, error_message, error_details)
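Because _handle_error re-raises API failures as aiohttp.ClientError carrying the message produced by _get_async_error_message, while several of the wrappers above convert their own failures to ValueError, callers can catch both. A small sketch under the same construction assumptions, and assuming the async request helpers route HTTP failures through _handle_error:

import asyncio
import aiohttp
from firecrawl import AsyncFirecrawlApp  # export name assumed

async def safe_search(query: str):
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # constructor keyword assumed
    try:
        return await app.search(query, limit=3)
    except (aiohttp.ClientError, ValueError) as exc:
        # The ClientError message is formatted by _get_async_error_message above.
        print("Firecrawl request failed:", exc)
        return None

# asyncio.run(safe_search("site reliability engineering"))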