firecrawl-py 2.8.0__py3-none-any.whl → 2.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of firecrawl-py might be problematic. Click here for more details.

Files changed (37) hide show
  1. firecrawl/__init__.py +1 -1
  2. firecrawl/__tests__/v1/e2e_withAuth/test.py +25 -0
  3. firecrawl/firecrawl.py +68 -15
  4. {firecrawl_py-2.8.0.dist-info → firecrawl_py-2.10.0.dist-info}/LICENSE +0 -0
  5. {firecrawl_py-2.8.0.dist-info → firecrawl_py-2.10.0.dist-info}/METADATA +1 -1
  6. firecrawl_py-2.10.0.dist-info/RECORD +12 -0
  7. {firecrawl_py-2.8.0.dist-info → firecrawl_py-2.10.0.dist-info}/top_level.txt +0 -2
  8. build/lib/build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
  9. build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  10. build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  11. build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  12. build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  13. build/lib/build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4480
  14. build/lib/build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
  15. build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
  16. build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  17. build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  18. build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  19. build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  20. build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4480
  21. build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
  22. build/lib/build/lib/firecrawl/__init__.py +0 -79
  23. build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  24. build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  25. build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  26. build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  27. build/lib/build/lib/firecrawl/firecrawl.py +0 -4480
  28. build/lib/build/lib/tests/test_change_tracking.py +0 -98
  29. build/lib/firecrawl/__init__.py +0 -79
  30. build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  31. build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  32. build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  33. build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  34. build/lib/firecrawl/firecrawl.py +0 -4480
  35. build/lib/tests/test_change_tracking.py +0 -98
  36. firecrawl_py-2.8.0.dist-info/RECORD +0 -40
  37. {firecrawl_py-2.8.0.dist-info → firecrawl_py-2.10.0.dist-info}/WHEEL +0 -0
@@ -1,4480 +0,0 @@
1
- """
2
- FirecrawlApp Module
3
-
4
- This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
5
- It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
6
- and check the status of these jobs. The module uses requests for HTTP communication
7
- and handles retries for certain HTTP status codes.
8
-
9
- Classes:
10
- - FirecrawlApp: Main class for interacting with the Firecrawl API.
11
- """
12
- import logging
13
- import os
14
- import time
15
- from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
16
- import json
17
- from datetime import datetime
18
- import re
19
- import warnings
20
- import requests
21
- import pydantic
22
- import websockets
23
- import aiohttp
24
- import asyncio
25
- from pydantic import Field
26
-
27
- # Suppress Pydantic warnings about attribute shadowing
28
- warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
29
- warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
30
- warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
31
- warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
32
- warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingOptions\" shadows an attribute in parent \"BaseModel\"")
33
-
34
- def get_version():
35
- try:
36
- from pathlib import Path
37
- package_path = os.path.dirname(__file__)
38
- version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
39
- version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
40
- if version_match:
41
- return version_match.group(1).strip()
42
- except Exception:
43
- print("Failed to get version from __init__.py")
44
- return None
45
-
46
- version = get_version()
47
-
48
- logger : logging.Logger = logging.getLogger("firecrawl")
49
-
50
- T = TypeVar('T')
51
-
52
- # class FirecrawlDocumentMetadata(pydantic.BaseModel):
53
- # """Metadata for a Firecrawl document."""
54
- # title: Optional[str] = None
55
- # description: Optional[str] = None
56
- # language: Optional[str] = None
57
- # keywords: Optional[str] = None
58
- # robots: Optional[str] = None
59
- # ogTitle: Optional[str] = None
60
- # ogDescription: Optional[str] = None
61
- # ogUrl: Optional[str] = None
62
- # ogImage: Optional[str] = None
63
- # ogAudio: Optional[str] = None
64
- # ogDeterminer: Optional[str] = None
65
- # ogLocale: Optional[str] = None
66
- # ogLocaleAlternate: Optional[List[str]] = None
67
- # ogSiteName: Optional[str] = None
68
- # ogVideo: Optional[str] = None
69
- # dctermsCreated: Optional[str] = None
70
- # dcDateCreated: Optional[str] = None
71
- # dcDate: Optional[str] = None
72
- # dctermsType: Optional[str] = None
73
- # dcType: Optional[str] = None
74
- # dctermsAudience: Optional[str] = None
75
- # dctermsSubject: Optional[str] = None
76
- # dcSubject: Optional[str] = None
77
- # dcDescription: Optional[str] = None
78
- # dctermsKeywords: Optional[str] = None
79
- # modifiedTime: Optional[str] = None
80
- # publishedTime: Optional[str] = None
81
- # articleTag: Optional[str] = None
82
- # articleSection: Optional[str] = None
83
- # sourceURL: Optional[str] = None
84
- # statusCode: Optional[int] = None
85
- # error: Optional[str] = None
86
-
87
- class AgentOptions(pydantic.BaseModel):
88
- """Configuration for the agent."""
89
- model: Literal["FIRE-1"] = "FIRE-1"
90
- prompt: Optional[str] = None
91
-
92
- class AgentOptionsExtract(pydantic.BaseModel):
93
- """Configuration for the agent in extract operations."""
94
- model: Literal["FIRE-1"] = "FIRE-1"
95
-
96
- class ActionsResult(pydantic.BaseModel):
97
- """Result of actions performed during scraping."""
98
- screenshots: List[str]
99
-
100
- class ChangeTrackingData(pydantic.BaseModel):
101
- """
102
- Data for the change tracking format.
103
- """
104
- previousScrapeAt: Optional[str] = None
105
- changeStatus: str # "new" | "same" | "changed" | "removed"
106
- visibility: str # "visible" | "hidden"
107
- diff: Optional[Dict[str, Any]] = None
108
- json: Optional[Any] = None
109
-
110
- class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
111
- """Document retrieved or processed by Firecrawl."""
112
- url: Optional[str] = None
113
- markdown: Optional[str] = None
114
- html: Optional[str] = None
115
- rawHtml: Optional[str] = None
116
- links: Optional[List[str]] = None
117
- extract: Optional[T] = None
118
- json: Optional[T] = None
119
- screenshot: Optional[str] = None
120
- metadata: Optional[Any] = None
121
- actions: Optional[ActionsResult] = None
122
- title: Optional[str] = None # v1 search only
123
- description: Optional[str] = None # v1 search only
124
- changeTracking: Optional[ChangeTrackingData] = None
125
-
126
- class LocationConfig(pydantic.BaseModel):
127
- """Location configuration for scraping."""
128
- country: Optional[str] = None
129
- languages: Optional[List[str]] = None
130
-
131
- class WebhookConfig(pydantic.BaseModel):
132
- """Configuration for webhooks."""
133
- url: str
134
- headers: Optional[Dict[str, str]] = None
135
- metadata: Optional[Dict[str, str]] = None
136
- events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
137
-
138
- class ChangeTrackingOptions(pydantic.BaseModel):
139
- """Configuration for change tracking."""
140
- modes: Optional[List[Literal["git-diff", "json"]]] = None
141
- schema: Optional[Any] = None
142
- prompt: Optional[str] = None
143
- tag: Optional[str] = None
144
-
145
- class ScrapeOptions(pydantic.BaseModel):
146
- """Parameters for scraping operations."""
147
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
148
- headers: Optional[Dict[str, str]] = None
149
- includeTags: Optional[List[str]] = None
150
- excludeTags: Optional[List[str]] = None
151
- onlyMainContent: Optional[bool] = None
152
- waitFor: Optional[int] = None
153
- timeout: Optional[int] = None
154
- location: Optional[LocationConfig] = None
155
- mobile: Optional[bool] = None
156
- skipTlsVerification: Optional[bool] = None
157
- removeBase64Images: Optional[bool] = None
158
- blockAds: Optional[bool] = None
159
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None
160
- changeTrackingOptions: Optional[ChangeTrackingOptions] = None
161
- maxAge: Optional[int] = None
162
- storeInCache: Optional[bool] = None
163
-
164
- class WaitAction(pydantic.BaseModel):
165
- """Wait action to perform during scraping."""
166
- type: Literal["wait"]
167
- milliseconds: Optional[int] = None
168
- selector: Optional[str] = None
169
-
170
- class ScreenshotAction(pydantic.BaseModel):
171
- """Screenshot action to perform during scraping."""
172
- type: Literal["screenshot"]
173
- fullPage: Optional[bool] = None
174
-
175
- class ClickAction(pydantic.BaseModel):
176
- """Click action to perform during scraping."""
177
- type: Literal["click"]
178
- selector: str
179
-
180
- class WriteAction(pydantic.BaseModel):
181
- """Write action to perform during scraping."""
182
- type: Literal["write"]
183
- text: str
184
-
185
- class PressAction(pydantic.BaseModel):
186
- """Press action to perform during scraping."""
187
- type: Literal["press"]
188
- key: str
189
-
190
- class ScrollAction(pydantic.BaseModel):
191
- """Scroll action to perform during scraping."""
192
- type: Literal["scroll"]
193
- direction: Literal["up", "down"]
194
- selector: Optional[str] = None
195
-
196
- class ScrapeAction(pydantic.BaseModel):
197
- """Scrape action to perform during scraping."""
198
- type: Literal["scrape"]
199
-
200
- class ExecuteJavascriptAction(pydantic.BaseModel):
201
- """Execute javascript action to perform during scraping."""
202
- type: Literal["executeJavascript"]
203
- script: str
204
-
205
-
206
- class ExtractAgent(pydantic.BaseModel):
207
- """Configuration for the agent in extract operations."""
208
- model: Literal["FIRE-1"] = "FIRE-1"
209
-
210
- class JsonConfig(pydantic.BaseModel):
211
- """Configuration for extraction."""
212
- prompt: Optional[str] = None
213
- schema: Optional[Any] = None
214
- systemPrompt: Optional[str] = None
215
- agent: Optional[ExtractAgent] = None
216
-
217
- class ScrapeParams(ScrapeOptions):
218
- """Parameters for scraping operations."""
219
- extract: Optional[JsonConfig] = None
220
- jsonOptions: Optional[JsonConfig] = None
221
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
222
- agent: Optional[AgentOptions] = None
223
- webhook: Optional[WebhookConfig] = None
224
-
225
- class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
226
- """Response from scraping operations."""
227
- success: bool = True
228
- warning: Optional[str] = None
229
- error: Optional[str] = None
230
-
231
- class BatchScrapeResponse(pydantic.BaseModel):
232
- """Response from batch scrape operations."""
233
- id: Optional[str] = None
234
- url: Optional[str] = None
235
- success: bool = True
236
- error: Optional[str] = None
237
- invalidURLs: Optional[List[str]] = None
238
-
239
- class BatchScrapeStatusResponse(pydantic.BaseModel):
240
- """Response from batch scrape status checks."""
241
- success: bool = True
242
- status: Literal["scraping", "completed", "failed", "cancelled"]
243
- completed: int
244
- total: int
245
- creditsUsed: int
246
- expiresAt: datetime
247
- next: Optional[str] = None
248
- data: List[FirecrawlDocument]
249
-
250
- class CrawlParams(pydantic.BaseModel):
251
- """Parameters for crawling operations."""
252
- includePaths: Optional[List[str]] = None
253
- excludePaths: Optional[List[str]] = None
254
- maxDepth: Optional[int] = None
255
- maxDiscoveryDepth: Optional[int] = None
256
- limit: Optional[int] = None
257
- allowBackwardLinks: Optional[bool] = None
258
- allowExternalLinks: Optional[bool] = None
259
- ignoreSitemap: Optional[bool] = None
260
- scrapeOptions: Optional[ScrapeOptions] = None
261
- webhook: Optional[Union[str, WebhookConfig]] = None
262
- deduplicateSimilarURLs: Optional[bool] = None
263
- ignoreQueryParameters: Optional[bool] = None
264
- regexOnFullURL: Optional[bool] = None
265
- delay: Optional[int] = None # Delay in seconds between scrapes
266
-
267
- class CrawlResponse(pydantic.BaseModel):
268
- """Response from crawling operations."""
269
- id: Optional[str] = None
270
- url: Optional[str] = None
271
- success: bool = True
272
- error: Optional[str] = None
273
-
274
- class CrawlStatusResponse(pydantic.BaseModel):
275
- """Response from crawl status checks."""
276
- success: bool = True
277
- status: Literal["scraping", "completed", "failed", "cancelled"]
278
- completed: int
279
- total: int
280
- creditsUsed: int
281
- expiresAt: datetime
282
- next: Optional[str] = None
283
- data: List[FirecrawlDocument]
284
-
285
- class CrawlErrorsResponse(pydantic.BaseModel):
286
- """Response from crawl/batch scrape error monitoring."""
287
- errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str}
288
- robotsBlocked: List[str]
289
-
290
- class MapParams(pydantic.BaseModel):
291
- """Parameters for mapping operations."""
292
- search: Optional[str] = None
293
- ignoreSitemap: Optional[bool] = None
294
- includeSubdomains: Optional[bool] = None
295
- sitemapOnly: Optional[bool] = None
296
- limit: Optional[int] = None
297
- timeout: Optional[int] = None
298
- useIndex: Optional[bool] = None
299
-
300
- class MapResponse(pydantic.BaseModel):
301
- """Response from mapping operations."""
302
- success: bool = True
303
- links: Optional[List[str]] = None
304
- error: Optional[str] = None
305
-
306
- class ExtractParams(pydantic.BaseModel):
307
- """Parameters for extracting information from URLs."""
308
- prompt: Optional[str] = None
309
- schema: Optional[Any] = None
310
- systemPrompt: Optional[str] = None
311
- allowExternalLinks: Optional[bool] = None
312
- enableWebSearch: Optional[bool] = None
313
- includeSubdomains: Optional[bool] = None
314
- origin: Optional[str] = None
315
- showSources: Optional[bool] = None
316
- scrapeOptions: Optional[ScrapeOptions] = None
317
-
318
- class ExtractResponse(pydantic.BaseModel, Generic[T]):
319
- """Response from extract operations."""
320
- id: Optional[str] = None
321
- status: Optional[Literal["processing", "completed", "failed"]] = None
322
- expiresAt: Optional[datetime] = None
323
- success: bool = True
324
- data: Optional[T] = None
325
- error: Optional[str] = None
326
- warning: Optional[str] = None
327
- sources: Optional[List[str]] = None
328
-
329
- class SearchParams(pydantic.BaseModel):
330
- query: str
331
- limit: Optional[int] = 5
332
- tbs: Optional[str] = None
333
- filter: Optional[str] = None
334
- lang: Optional[str] = "en"
335
- country: Optional[str] = "us"
336
- location: Optional[str] = None
337
- origin: Optional[str] = "api"
338
- timeout: Optional[int] = 60000
339
- scrapeOptions: Optional[ScrapeOptions] = None
340
-
341
- class SearchResponse(pydantic.BaseModel):
342
- """Response from search operations."""
343
- success: bool = True
344
- data: List[FirecrawlDocument]
345
- warning: Optional[str] = None
346
- error: Optional[str] = None
347
-
348
- class GenerateLLMsTextParams(pydantic.BaseModel):
349
- """
350
- Parameters for the LLMs.txt generation operation.
351
- """
352
- maxUrls: Optional[int] = 10
353
- showFullText: Optional[bool] = False
354
- cache: Optional[bool] = True
355
- __experimental_stream: Optional[bool] = None
356
-
357
- class DeepResearchParams(pydantic.BaseModel):
358
- """
359
- Parameters for the deep research operation.
360
- """
361
- maxDepth: Optional[int] = 7
362
- timeLimit: Optional[int] = 270
363
- maxUrls: Optional[int] = 20
364
- analysisPrompt: Optional[str] = None
365
- systemPrompt: Optional[str] = None
366
- __experimental_streamSteps: Optional[bool] = None
367
-
368
- class DeepResearchResponse(pydantic.BaseModel):
369
- """
370
- Response from the deep research operation.
371
- """
372
- success: bool
373
- id: str
374
- error: Optional[str] = None
375
-
376
- class DeepResearchStatusResponse(pydantic.BaseModel):
377
- """
378
- Status response from the deep research operation.
379
- """
380
- success: bool
381
- data: Optional[Dict[str, Any]] = None
382
- status: str
383
- error: Optional[str] = None
384
- expiresAt: str
385
- currentDepth: int
386
- maxDepth: int
387
- activities: List[Dict[str, Any]]
388
- sources: List[Dict[str, Any]]
389
- summaries: List[str]
390
-
391
- class GenerateLLMsTextResponse(pydantic.BaseModel):
392
- """Response from LLMs.txt generation operations."""
393
- success: bool = True
394
- id: str
395
- error: Optional[str] = None
396
-
397
- class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
398
- llmstxt: str
399
- llmsfulltxt: Optional[str] = None
400
-
401
- class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
402
- """Status response from LLMs.txt generation operations."""
403
- success: bool = True
404
- data: Optional[GenerateLLMsTextStatusResponseData] = None
405
- status: Literal["processing", "completed", "failed"]
406
- error: Optional[str] = None
407
- expiresAt: str
408
-
409
- class SearchResponse(pydantic.BaseModel):
410
- """
411
- Response from the search operation.
412
- """
413
- success: bool
414
- data: List[Dict[str, Any]]
415
- warning: Optional[str] = None
416
- error: Optional[str] = None
417
-
418
- class ExtractParams(pydantic.BaseModel):
419
- """
420
- Parameters for the extract operation.
421
- """
422
- prompt: Optional[str] = None
423
- schema: Optional[Any] = pydantic.Field(None, alias='schema')
424
- system_prompt: Optional[str] = None
425
- allow_external_links: Optional[bool] = False
426
- enable_web_search: Optional[bool] = False
427
- # Just for backwards compatibility
428
- enableWebSearch: Optional[bool] = False
429
- show_sources: Optional[bool] = False
430
- agent: Optional[Dict[str, Any]] = None
431
-
432
- class FirecrawlApp:
433
- def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
434
- """
435
- Initialize the FirecrawlApp instance with API key, API URL.
436
-
437
- Args:
438
- api_key (Optional[str]): API key for authenticating with the Firecrawl API.
439
- api_url (Optional[str]): Base URL for the Firecrawl API.
440
- """
441
- self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
442
- self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
443
-
444
- # Only require API key when using cloud service
445
- if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
446
- logger.warning("No API key provided for cloud service")
447
- raise ValueError('No API key provided')
448
-
449
- logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
450
-
451
- def scrape_url(
452
- self,
453
- url: str,
454
- *,
455
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
456
- include_tags: Optional[List[str]] = None,
457
- exclude_tags: Optional[List[str]] = None,
458
- only_main_content: Optional[bool] = None,
459
- wait_for: Optional[int] = None,
460
- timeout: Optional[int] = None,
461
- location: Optional[LocationConfig] = None,
462
- mobile: Optional[bool] = None,
463
- skip_tls_verification: Optional[bool] = None,
464
- remove_base64_images: Optional[bool] = None,
465
- block_ads: Optional[bool] = None,
466
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
467
- extract: Optional[JsonConfig] = None,
468
- json_options: Optional[JsonConfig] = None,
469
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
470
- change_tracking_options: Optional[ChangeTrackingOptions] = None,
471
- max_age: Optional[int] = None,
472
- store_in_cache: Optional[bool] = None,
473
- **kwargs) -> ScrapeResponse[Any]:
474
- """
475
- Scrape and extract content from a URL.
476
-
477
- Args:
478
- url (str): Target URL to scrape
479
- formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
480
- include_tags (Optional[List[str]]): HTML tags to include
481
- exclude_tags (Optional[List[str]]): HTML tags to exclude
482
- only_main_content (Optional[bool]): Extract main content only
483
- wait_for (Optional[int]): Wait for a specific element to appear
484
- timeout (Optional[int]): Request timeout (ms)
485
- location (Optional[LocationConfig]): Location configuration
486
- mobile (Optional[bool]): Use mobile user agent
487
- skip_tls_verification (Optional[bool]): Skip TLS verification
488
- remove_base64_images (Optional[bool]): Remove base64 images
489
- block_ads (Optional[bool]): Block ads
490
- proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
491
- extract (Optional[JsonConfig]): Content extraction settings
492
- json_options (Optional[JsonConfig]): JSON extraction settings
493
- actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
494
- change_tracking_options (Optional[ChangeTrackingOptions]): Change tracking settings
495
-
496
-
497
- Returns:
498
- ScrapeResponse with:
499
- * Requested content formats
500
- * Page metadata
501
- * Extraction results
502
- * Success/error status
503
-
504
- Raises:
505
- Exception: If scraping fails
506
- """
507
- headers = self._prepare_headers()
508
-
509
- # Build scrape parameters
510
- scrape_params = {
511
- 'url': url,
512
- 'origin': f"python-sdk@{version}"
513
- }
514
-
515
- # Add optional parameters if provided
516
- if formats:
517
- scrape_params['formats'] = formats
518
- if include_tags:
519
- scrape_params['includeTags'] = include_tags
520
- if exclude_tags:
521
- scrape_params['excludeTags'] = exclude_tags
522
- if only_main_content is not None:
523
- scrape_params['onlyMainContent'] = only_main_content
524
- if wait_for:
525
- scrape_params['waitFor'] = wait_for
526
- if timeout:
527
- scrape_params['timeout'] = timeout
528
- if location:
529
- scrape_params['location'] = location.dict(exclude_none=True)
530
- if mobile is not None:
531
- scrape_params['mobile'] = mobile
532
- if skip_tls_verification is not None:
533
- scrape_params['skipTlsVerification'] = skip_tls_verification
534
- if remove_base64_images is not None:
535
- scrape_params['removeBase64Images'] = remove_base64_images
536
- if block_ads is not None:
537
- scrape_params['blockAds'] = block_ads
538
- if proxy:
539
- scrape_params['proxy'] = proxy
540
- if extract is not None:
541
- extract = self._ensure_schema_dict(extract)
542
- if isinstance(extract, dict) and "schema" in extract:
543
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
544
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
545
- if json_options is not None:
546
- json_options = self._ensure_schema_dict(json_options)
547
- if isinstance(json_options, dict) and "schema" in json_options:
548
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
549
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
550
- if actions:
551
- scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
552
- if change_tracking_options:
553
- scrape_params['changeTrackingOptions'] = change_tracking_options if isinstance(change_tracking_options, dict) else change_tracking_options.dict(exclude_none=True)
554
- if max_age is not None:
555
- scrape_params['maxAge'] = max_age
556
- if store_in_cache is not None:
557
- scrape_params['storeInCache'] = store_in_cache
558
-
559
- scrape_params.update(kwargs)
560
-
561
- if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
562
- scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
563
- if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
564
- scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
565
-
566
- # Make request
567
- response = requests.post(
568
- f'{self.api_url}/v1/scrape',
569
- headers=headers,
570
- json=scrape_params,
571
- timeout=(timeout + 5000 if timeout else None)
572
- )
573
-
574
- if response.status_code == 200:
575
- try:
576
- response_json = response.json()
577
- if response_json.get('success') and 'data' in response_json:
578
- return ScrapeResponse(**response_json['data'])
579
- elif "error" in response_json:
580
- raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
581
- else:
582
- raise Exception(f'Failed to scrape URL. Error: {response_json}')
583
- except ValueError:
584
- raise Exception('Failed to parse Firecrawl response as JSON.')
585
- else:
586
- self._handle_error(response, 'scrape URL')
587
-
588
- def search(
589
- self,
590
- query: str,
591
- *,
592
- limit: Optional[int] = None,
593
- tbs: Optional[str] = None,
594
- filter: Optional[str] = None,
595
- lang: Optional[str] = None,
596
- country: Optional[str] = None,
597
- location: Optional[str] = None,
598
- timeout: Optional[int] = None,
599
- scrape_options: Optional[ScrapeOptions] = None,
600
- **kwargs) -> SearchResponse:
601
- """
602
- Search for content using Firecrawl.
603
-
604
- Args:
605
- query (str): Search query string
606
- limit (Optional[int]): Max results (default: 5)
607
- tbs (Optional[str]): Time filter (e.g. "qdr:d")
608
- filter (Optional[str]): Custom result filter
609
- lang (Optional[str]): Language code (default: "en")
610
- country (Optional[str]): Country code (default: "us")
611
- location (Optional[str]): Geo-targeting
612
- timeout (Optional[int]): Request timeout in milliseconds
613
- scrape_options (Optional[ScrapeOptions]): Result scraping configuration
614
- **kwargs: Additional keyword arguments for future compatibility
615
-
616
- Returns:
617
- SearchResponse: Response containing:
618
- * success (bool): Whether request succeeded
619
- * data (List[FirecrawlDocument]): Search results
620
- * warning (Optional[str]): Warning message if any
621
- * error (Optional[str]): Error message if any
622
-
623
- Raises:
624
- Exception: If search fails or response cannot be parsed
625
- """
626
- # Validate any additional kwargs
627
- self._validate_kwargs(kwargs, "search")
628
-
629
- # Build search parameters
630
- search_params = {}
631
-
632
- # Add individual parameters
633
- if limit is not None:
634
- search_params['limit'] = limit
635
- if tbs is not None:
636
- search_params['tbs'] = tbs
637
- if filter is not None:
638
- search_params['filter'] = filter
639
- if lang is not None:
640
- search_params['lang'] = lang
641
- if country is not None:
642
- search_params['country'] = country
643
- if location is not None:
644
- search_params['location'] = location
645
- if timeout is not None:
646
- search_params['timeout'] = timeout
647
- if scrape_options is not None:
648
- search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
649
-
650
- # Add any additional kwargs
651
- search_params.update(kwargs)
652
-
653
- # Create final params object
654
- final_params = SearchParams(query=query, **search_params)
655
- params_dict = final_params.dict(exclude_none=True)
656
- params_dict['origin'] = f"python-sdk@{version}"
657
-
658
- # Make request
659
- response = requests.post(
660
- f"{self.api_url}/v1/search",
661
- headers={"Authorization": f"Bearer {self.api_key}"},
662
- json=params_dict
663
- )
664
-
665
- if response.status_code == 200:
666
- try:
667
- response_json = response.json()
668
- if response_json.get('success') and 'data' in response_json:
669
- return SearchResponse(**response_json)
670
- elif "error" in response_json:
671
- raise Exception(f'Search failed. Error: {response_json["error"]}')
672
- else:
673
- raise Exception(f'Search failed. Error: {response_json}')
674
- except ValueError:
675
- raise Exception('Failed to parse Firecrawl response as JSON.')
676
- else:
677
- self._handle_error(response, 'search')
678
-
679
- def crawl_url(
680
- self,
681
- url: str,
682
- *,
683
- include_paths: Optional[List[str]] = None,
684
- exclude_paths: Optional[List[str]] = None,
685
- max_depth: Optional[int] = None,
686
- max_discovery_depth: Optional[int] = None,
687
- limit: Optional[int] = None,
688
- allow_backward_links: Optional[bool] = None,
689
- allow_external_links: Optional[bool] = None,
690
- ignore_sitemap: Optional[bool] = None,
691
- scrape_options: Optional[ScrapeOptions] = None,
692
- webhook: Optional[Union[str, WebhookConfig]] = None,
693
- deduplicate_similar_urls: Optional[bool] = None,
694
- ignore_query_parameters: Optional[bool] = None,
695
- regex_on_full_url: Optional[bool] = None,
696
- delay: Optional[int] = None,
697
- poll_interval: Optional[int] = 2,
698
- idempotency_key: Optional[str] = None,
699
- **kwargs
700
- ) -> CrawlStatusResponse:
701
- """
702
- Crawl a website starting from a URL.
703
-
704
- Args:
705
- url (str): Target URL to start crawling from
706
- include_paths (Optional[List[str]]): Patterns of URLs to include
707
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
708
- max_depth (Optional[int]): Maximum crawl depth
709
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
710
- limit (Optional[int]): Maximum pages to crawl
711
- allow_backward_links (Optional[bool]): Follow parent directory links
712
- allow_external_links (Optional[bool]): Follow external domain links
713
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
714
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
715
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
716
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
717
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
718
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
719
- delay (Optional[int]): Delay in seconds between scrapes
720
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
721
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
722
- **kwargs: Additional parameters to pass to the API
723
-
724
- Returns:
725
- CrawlStatusResponse with:
726
- * Crawling status and progress
727
- * Crawled page contents
728
- * Success/error information
729
-
730
- Raises:
731
- Exception: If crawl fails
732
- """
733
- # Validate any additional kwargs
734
- self._validate_kwargs(kwargs, "crawl_url")
735
-
736
- crawl_params = {}
737
-
738
- # Add individual parameters
739
- if include_paths is not None:
740
- crawl_params['includePaths'] = include_paths
741
- if exclude_paths is not None:
742
- crawl_params['excludePaths'] = exclude_paths
743
- if max_depth is not None:
744
- crawl_params['maxDepth'] = max_depth
745
- if max_discovery_depth is not None:
746
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
747
- if limit is not None:
748
- crawl_params['limit'] = limit
749
- if allow_backward_links is not None:
750
- crawl_params['allowBackwardLinks'] = allow_backward_links
751
- if allow_external_links is not None:
752
- crawl_params['allowExternalLinks'] = allow_external_links
753
- if ignore_sitemap is not None:
754
- crawl_params['ignoreSitemap'] = ignore_sitemap
755
- if scrape_options is not None:
756
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
757
- if webhook is not None:
758
- crawl_params['webhook'] = webhook
759
- if deduplicate_similar_urls is not None:
760
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
761
- if ignore_query_parameters is not None:
762
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
763
- if regex_on_full_url is not None:
764
- crawl_params['regexOnFullURL'] = regex_on_full_url
765
- if delay is not None:
766
- crawl_params['delay'] = delay
767
-
768
- # Add any additional kwargs
769
- crawl_params.update(kwargs)
770
-
771
- # Create final params object
772
- final_params = CrawlParams(**crawl_params)
773
- params_dict = final_params.dict(exclude_none=True)
774
- params_dict['url'] = url
775
- params_dict['origin'] = f"python-sdk@{version}"
776
-
777
- # Make request
778
- headers = self._prepare_headers(idempotency_key)
779
- response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
780
-
781
- if response.status_code == 200:
782
- try:
783
- id = response.json().get('id')
784
- except:
785
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
786
- return self._monitor_job_status(id, headers, poll_interval)
787
- else:
788
- self._handle_error(response, 'start crawl job')
789
-
790
- def async_crawl_url(
791
- self,
792
- url: str,
793
- *,
794
- include_paths: Optional[List[str]] = None,
795
- exclude_paths: Optional[List[str]] = None,
796
- max_depth: Optional[int] = None,
797
- max_discovery_depth: Optional[int] = None,
798
- limit: Optional[int] = None,
799
- allow_backward_links: Optional[bool] = None,
800
- allow_external_links: Optional[bool] = None,
801
- ignore_sitemap: Optional[bool] = None,
802
- scrape_options: Optional[ScrapeOptions] = None,
803
- webhook: Optional[Union[str, WebhookConfig]] = None,
804
- deduplicate_similar_urls: Optional[bool] = None,
805
- ignore_query_parameters: Optional[bool] = None,
806
- regex_on_full_url: Optional[bool] = None,
807
- delay: Optional[int] = None,
808
- idempotency_key: Optional[str] = None,
809
- **kwargs
810
- ) -> CrawlResponse:
811
- """
812
- Start an asynchronous crawl job.
813
-
814
- Args:
815
- url (str): Target URL to start crawling from
816
- include_paths (Optional[List[str]]): Patterns of URLs to include
817
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
818
- max_depth (Optional[int]): Maximum crawl depth
819
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
820
- limit (Optional[int]): Maximum pages to crawl
821
- allow_backward_links (Optional[bool]): Follow parent directory links
822
- allow_external_links (Optional[bool]): Follow external domain links
823
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
824
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
825
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
826
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
827
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
828
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
829
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
830
- **kwargs: Additional parameters to pass to the API
831
-
832
- Returns:
833
- CrawlResponse with:
834
- * success - Whether crawl started successfully
835
- * id - Unique identifier for the crawl job
836
- * url - Status check URL for the crawl
837
- * error - Error message if start failed
838
-
839
- Raises:
840
- Exception: If crawl initiation fails
841
- """
842
- # Validate any additional kwargs
843
- self._validate_kwargs(kwargs, "async_crawl_url")
844
-
845
- crawl_params = {}
846
-
847
- # Add individual parameters
848
- if include_paths is not None:
849
- crawl_params['includePaths'] = include_paths
850
- if exclude_paths is not None:
851
- crawl_params['excludePaths'] = exclude_paths
852
- if max_depth is not None:
853
- crawl_params['maxDepth'] = max_depth
854
- if max_discovery_depth is not None:
855
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
856
- if limit is not None:
857
- crawl_params['limit'] = limit
858
- if allow_backward_links is not None:
859
- crawl_params['allowBackwardLinks'] = allow_backward_links
860
- if allow_external_links is not None:
861
- crawl_params['allowExternalLinks'] = allow_external_links
862
- if ignore_sitemap is not None:
863
- crawl_params['ignoreSitemap'] = ignore_sitemap
864
- if scrape_options is not None:
865
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
866
- if webhook is not None:
867
- crawl_params['webhook'] = webhook
868
- if deduplicate_similar_urls is not None:
869
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
870
- if ignore_query_parameters is not None:
871
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
872
- if regex_on_full_url is not None:
873
- crawl_params['regexOnFullURL'] = regex_on_full_url
874
- if delay is not None:
875
- crawl_params['delay'] = delay
876
-
877
- # Add any additional kwargs
878
- crawl_params.update(kwargs)
879
-
880
- # Create final params object
881
- final_params = CrawlParams(**crawl_params)
882
- params_dict = final_params.dict(exclude_none=True)
883
- params_dict['url'] = url
884
- params_dict['origin'] = f"python-sdk@{version}"
885
-
886
- # Make request
887
- headers = self._prepare_headers(idempotency_key)
888
- response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
889
-
890
- if response.status_code == 200:
891
- try:
892
- return CrawlResponse(**response.json())
893
- except:
894
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
895
- else:
896
- self._handle_error(response, 'start crawl job')
897
-
898
- def check_crawl_status(self, id: str) -> CrawlStatusResponse:
899
- """
900
- Check the status and results of a crawl job.
901
-
902
- Args:
903
- id: Unique identifier for the crawl job
904
-
905
- Returns:
906
- CrawlStatusResponse containing:
907
-
908
- Status Information:
909
- * status - Current state (scraping/completed/failed/cancelled)
910
- * completed - Number of pages crawled
911
- * total - Total pages to crawl
912
- * creditsUsed - API credits consumed
913
- * expiresAt - Data expiration timestamp
914
-
915
- Results:
916
- * data - List of crawled documents
917
- * next - URL for next page of results (if paginated)
918
- * success - Whether status check succeeded
919
- * error - Error message if failed
920
-
921
- Raises:
922
- Exception: If status check fails
923
- """
924
- endpoint = f'/v1/crawl/{id}'
925
-
926
- headers = self._prepare_headers()
927
- response = self._get_request(f'{self.api_url}{endpoint}', headers)
928
- if response.status_code == 200:
929
- try:
930
- status_data = response.json()
931
- except:
932
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
933
- if status_data['status'] == 'completed':
934
- if 'data' in status_data:
935
- data = status_data['data']
936
- while 'next' in status_data:
937
- if len(status_data['data']) == 0:
938
- break
939
- next_url = status_data.get('next')
940
- if not next_url:
941
- logger.warning("Expected 'next' URL is missing.")
942
- break
943
- try:
944
- status_response = self._get_request(next_url, headers)
945
- if status_response.status_code != 200:
946
- logger.error(f"Failed to fetch next page: {status_response.status_code}")
947
- break
948
- try:
949
- next_data = status_response.json()
950
- except:
951
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
952
- data.extend(next_data.get('data', []))
953
- status_data = next_data
954
- except Exception as e:
955
- logger.error(f"Error during pagination request: {e}")
956
- break
957
- status_data['data'] = data
958
-
959
- response = {
960
- 'status': status_data.get('status'),
961
- 'total': status_data.get('total'),
962
- 'completed': status_data.get('completed'),
963
- 'creditsUsed': status_data.get('creditsUsed'),
964
- 'expiresAt': status_data.get('expiresAt'),
965
- 'data': status_data.get('data')
966
- }
967
-
968
- if 'error' in status_data:
969
- response['error'] = status_data['error']
970
-
971
- if 'next' in status_data:
972
- response['next'] = status_data['next']
973
-
974
- return CrawlStatusResponse(
975
- success=False if 'error' in status_data else True,
976
- **response
977
- )
978
- else:
979
- self._handle_error(response, 'check crawl status')
980
-
981
- def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
982
- """
983
- Returns information about crawl errors.
984
-
985
- Args:
986
- id (str): The ID of the crawl job
987
-
988
- Returns:
989
- CrawlErrorsResponse containing:
990
- * errors (List[Dict[str, str]]): List of errors with fields:
991
- - id (str): Error ID
992
- - timestamp (str): When the error occurred
993
- - url (str): URL that caused the error
994
- - error (str): Error message
995
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
996
-
997
- Raises:
998
- Exception: If error check fails
999
- """
1000
- headers = self._prepare_headers()
1001
- response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
1002
- if response.status_code == 200:
1003
- try:
1004
- return CrawlErrorsResponse(**response.json())
1005
- except:
1006
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1007
- else:
1008
- self._handle_error(response, "check crawl errors")
1009
-
1010
- def cancel_crawl(self, id: str) -> Dict[str, Any]:
1011
- """
1012
- Cancel an asynchronous crawl job.
1013
-
1014
- Args:
1015
- id (str): The ID of the crawl job to cancel
1016
-
1017
- Returns:
1018
- Dict[str, Any] containing:
1019
- * success (bool): Whether cancellation was successful
1020
- * error (str, optional): Error message if cancellation failed
1021
-
1022
- Raises:
1023
- Exception: If cancellation fails
1024
- """
1025
- headers = self._prepare_headers()
1026
- response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
1027
- if response.status_code == 200:
1028
- try:
1029
- return response.json()
1030
- except:
1031
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1032
- else:
1033
- self._handle_error(response, "cancel crawl job")
1034
-
1035
- def crawl_url_and_watch(
1036
- self,
1037
- url: str,
1038
- *,
1039
- include_paths: Optional[List[str]] = None,
1040
- exclude_paths: Optional[List[str]] = None,
1041
- max_depth: Optional[int] = None,
1042
- max_discovery_depth: Optional[int] = None,
1043
- limit: Optional[int] = None,
1044
- allow_backward_links: Optional[bool] = None,
1045
- allow_external_links: Optional[bool] = None,
1046
- ignore_sitemap: Optional[bool] = None,
1047
- scrape_options: Optional[ScrapeOptions] = None,
1048
- webhook: Optional[Union[str, WebhookConfig]] = None,
1049
- deduplicate_similar_urls: Optional[bool] = None,
1050
- ignore_query_parameters: Optional[bool] = None,
1051
- regex_on_full_url: Optional[bool] = None,
1052
- idempotency_key: Optional[str] = None,
1053
- **kwargs
1054
- ) -> 'CrawlWatcher':
1055
- """
1056
- Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
1057
-
1058
- Args:
1059
- url (str): Target URL to start crawling from
1060
- include_paths (Optional[List[str]]): Patterns of URLs to include
1061
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
1062
- max_depth (Optional[int]): Maximum crawl depth
1063
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
1064
- limit (Optional[int]): Maximum pages to crawl
1065
- allow_backward_links (Optional[bool]): Follow parent directory links
1066
- allow_external_links (Optional[bool]): Follow external domain links
1067
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1068
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
1069
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
1070
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
1071
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
1072
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
1073
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1074
- **kwargs: Additional parameters to pass to the API
1075
-
1076
- Returns:
1077
- CrawlWatcher: An instance to monitor the crawl job via WebSocket
1078
-
1079
- Raises:
1080
- Exception: If crawl job fails to start
1081
- """
1082
- crawl_response = self.async_crawl_url(
1083
- url,
1084
- include_paths=include_paths,
1085
- exclude_paths=exclude_paths,
1086
- max_depth=max_depth,
1087
- max_discovery_depth=max_discovery_depth,
1088
- limit=limit,
1089
- allow_backward_links=allow_backward_links,
1090
- allow_external_links=allow_external_links,
1091
- ignore_sitemap=ignore_sitemap,
1092
- scrape_options=scrape_options,
1093
- webhook=webhook,
1094
- deduplicate_similar_urls=deduplicate_similar_urls,
1095
- ignore_query_parameters=ignore_query_parameters,
1096
- regex_on_full_url=regex_on_full_url,
1097
- idempotency_key=idempotency_key,
1098
- **kwargs
1099
- )
1100
- if crawl_response.success and crawl_response.id:
1101
- return CrawlWatcher(crawl_response.id, self)
1102
- else:
1103
- raise Exception("Crawl job failed to start")
1104
-
1105
- def map_url(
1106
- self,
1107
- url: str,
1108
- *,
1109
- search: Optional[str] = None,
1110
- ignore_sitemap: Optional[bool] = None,
1111
- include_subdomains: Optional[bool] = None,
1112
- sitemap_only: Optional[bool] = None,
1113
- limit: Optional[int] = None,
1114
- timeout: Optional[int] = None,
1115
- use_index: Optional[bool] = None,
1116
- **kwargs) -> MapResponse:
1117
- """
1118
- Map and discover links from a URL.
1119
-
1120
- Args:
1121
- url (str): Target URL to map
1122
- search (Optional[str]): Filter pattern for URLs
1123
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1124
- include_subdomains (Optional[bool]): Include subdomain links
1125
- sitemap_only (Optional[bool]): Only use sitemap.xml
1126
- limit (Optional[int]): Maximum URLs to return
1127
- timeout (Optional[int]): Request timeout in milliseconds
1128
- **kwargs: Additional parameters to pass to the API
1129
-
1130
- Returns:
1131
- MapResponse: Response containing:
1132
- * success (bool): Whether request succeeded
1133
- * links (List[str]): Discovered URLs
1134
- * error (Optional[str]): Error message if any
1135
-
1136
- Raises:
1137
- Exception: If mapping fails or response cannot be parsed
1138
- """
1139
- # Validate any additional kwargs
1140
- self._validate_kwargs(kwargs, "map_url")
1141
-
1142
- # Build map parameters
1143
- map_params = {}
1144
-
1145
- # Add individual parameters
1146
- if search is not None:
1147
- map_params['search'] = search
1148
- if ignore_sitemap is not None:
1149
- map_params['ignoreSitemap'] = ignore_sitemap
1150
- if include_subdomains is not None:
1151
- map_params['includeSubdomains'] = include_subdomains
1152
- if sitemap_only is not None:
1153
- map_params['sitemapOnly'] = sitemap_only
1154
- if limit is not None:
1155
- map_params['limit'] = limit
1156
- if timeout is not None:
1157
- map_params['timeout'] = timeout
1158
- if use_index is not None:
1159
- map_params['useIndex'] = use_index
1160
-
1161
- # Add any additional kwargs
1162
- map_params.update(kwargs)
1163
-
1164
- # Create final params object
1165
- final_params = MapParams(**map_params)
1166
- params_dict = final_params.dict(exclude_none=True)
1167
- params_dict['url'] = url
1168
- params_dict['origin'] = f"python-sdk@{version}"
1169
-
1170
- # Make request
1171
- response = requests.post(
1172
- f"{self.api_url}/v1/map",
1173
- headers={"Authorization": f"Bearer {self.api_key}"},
1174
- json=params_dict
1175
- )
1176
-
1177
- if response.status_code == 200:
1178
- try:
1179
- response_json = response.json()
1180
- if response_json.get('success') and 'links' in response_json:
1181
- return MapResponse(**response_json)
1182
- elif "error" in response_json:
1183
- raise Exception(f'Map failed. Error: {response_json["error"]}')
1184
- else:
1185
- raise Exception(f'Map failed. Error: {response_json}')
1186
- except ValueError:
1187
- raise Exception('Failed to parse Firecrawl response as JSON.')
1188
- else:
1189
- self._handle_error(response, 'map')
1190
-
1191
- def batch_scrape_urls(
1192
- self,
1193
- urls: List[str],
1194
- *,
1195
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1196
- headers: Optional[Dict[str, str]] = None,
1197
- include_tags: Optional[List[str]] = None,
1198
- exclude_tags: Optional[List[str]] = None,
1199
- only_main_content: Optional[bool] = None,
1200
- wait_for: Optional[int] = None,
1201
- timeout: Optional[int] = None,
1202
- location: Optional[LocationConfig] = None,
1203
- mobile: Optional[bool] = None,
1204
- skip_tls_verification: Optional[bool] = None,
1205
- remove_base64_images: Optional[bool] = None,
1206
- block_ads: Optional[bool] = None,
1207
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1208
- extract: Optional[JsonConfig] = None,
1209
- json_options: Optional[JsonConfig] = None,
1210
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1211
- agent: Optional[AgentOptions] = None,
1212
- poll_interval: Optional[int] = 2,
1213
- idempotency_key: Optional[str] = None,
1214
- **kwargs
1215
- ) -> BatchScrapeStatusResponse:
1216
- """
1217
- Batch scrape multiple URLs and monitor until completion.
1218
-
1219
- Args:
1220
- urls (List[str]): URLs to scrape
1221
- formats (Optional[List[Literal]]): Content formats to retrieve
1222
- headers (Optional[Dict[str, str]]): Custom HTTP headers
1223
- include_tags (Optional[List[str]]): HTML tags to include
1224
- exclude_tags (Optional[List[str]]): HTML tags to exclude
1225
- only_main_content (Optional[bool]): Extract main content only
1226
- wait_for (Optional[int]): Wait time in milliseconds
1227
- timeout (Optional[int]): Request timeout in milliseconds
1228
- location (Optional[LocationConfig]): Location configuration
1229
- mobile (Optional[bool]): Use mobile user agent
1230
- skip_tls_verification (Optional[bool]): Skip TLS verification
1231
- remove_base64_images (Optional[bool]): Remove base64 encoded images
1232
- block_ads (Optional[bool]): Block advertisements
1233
- proxy (Optional[Literal]): Proxy type to use
1234
- extract (Optional[JsonConfig]): Content extraction config
1235
- json_options (Optional[JsonConfig]): JSON extraction config
1236
- actions (Optional[List[Union]]): Actions to perform
1237
- agent (Optional[AgentOptions]): Agent configuration
1238
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
1239
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1240
- **kwargs: Additional parameters to pass to the API
1241
-
1242
- Returns:
1243
- BatchScrapeStatusResponse with:
1244
- * Scraping status and progress
1245
- * Scraped content for each URL
1246
- * Success/error information
1247
-
1248
- Raises:
1249
- Exception: If batch scrape fails
1250
- """
1251
- # Validate any additional kwargs
1252
- self._validate_kwargs(kwargs, "batch_scrape_urls")
1253
-
1254
- scrape_params = {}
1255
-
1256
- # Add individual parameters
1257
- if formats is not None:
1258
- scrape_params['formats'] = formats
1259
- if headers is not None:
1260
- scrape_params['headers'] = headers
1261
- if include_tags is not None:
1262
- scrape_params['includeTags'] = include_tags
1263
- if exclude_tags is not None:
1264
- scrape_params['excludeTags'] = exclude_tags
1265
- if only_main_content is not None:
1266
- scrape_params['onlyMainContent'] = only_main_content
1267
- if wait_for is not None:
1268
- scrape_params['waitFor'] = wait_for
1269
- if timeout is not None:
1270
- scrape_params['timeout'] = timeout
1271
- if location is not None:
1272
- scrape_params['location'] = location.dict(exclude_none=True)
1273
- if mobile is not None:
1274
- scrape_params['mobile'] = mobile
1275
- if skip_tls_verification is not None:
1276
- scrape_params['skipTlsVerification'] = skip_tls_verification
1277
- if remove_base64_images is not None:
1278
- scrape_params['removeBase64Images'] = remove_base64_images
1279
- if block_ads is not None:
1280
- scrape_params['blockAds'] = block_ads
1281
- if proxy is not None:
1282
- scrape_params['proxy'] = proxy
1283
- if extract is not None:
1284
- extract = self._ensure_schema_dict(extract)
1285
- if isinstance(extract, dict) and "schema" in extract:
1286
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
1287
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1288
- if json_options is not None:
1289
- json_options = self._ensure_schema_dict(json_options)
1290
- if isinstance(json_options, dict) and "schema" in json_options:
1291
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1292
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1293
- if actions is not None:
1294
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1295
- if agent is not None:
1296
- scrape_params['agent'] = agent.dict(exclude_none=True)
1297
-
1298
- # Add any additional kwargs
1299
- scrape_params.update(kwargs)
1300
-
1301
- # Create final params object
1302
- final_params = ScrapeParams(**scrape_params)
1303
- params_dict = final_params.dict(exclude_none=True)
1304
- params_dict['urls'] = urls
1305
- params_dict['origin'] = f"python-sdk@{version}"
1306
-
1307
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1308
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1309
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1310
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1311
-
1312
- # Make request
1313
- headers = self._prepare_headers(idempotency_key)
1314
- response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1315
-
1316
- if response.status_code == 200:
1317
- try:
1318
- id = response.json().get('id')
1319
- except:
1320
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1321
- return self._monitor_job_status(id, headers, poll_interval)
1322
- else:
1323
- self._handle_error(response, 'start batch scrape job')
1324
-
1325
- def async_batch_scrape_urls(
1326
- self,
1327
- urls: List[str],
1328
- *,
1329
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1330
- headers: Optional[Dict[str, str]] = None,
1331
- include_tags: Optional[List[str]] = None,
1332
- exclude_tags: Optional[List[str]] = None,
1333
- only_main_content: Optional[bool] = None,
1334
- wait_for: Optional[int] = None,
1335
- timeout: Optional[int] = None,
1336
- location: Optional[LocationConfig] = None,
1337
- mobile: Optional[bool] = None,
1338
- skip_tls_verification: Optional[bool] = None,
1339
- remove_base64_images: Optional[bool] = None,
1340
- block_ads: Optional[bool] = None,
1341
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1342
- extract: Optional[JsonConfig] = None,
1343
- json_options: Optional[JsonConfig] = None,
1344
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1345
- agent: Optional[AgentOptions] = None,
1346
- idempotency_key: Optional[str] = None,
1347
- **kwargs
1348
- ) -> BatchScrapeResponse:
1349
- """
1350
- Initiate a batch scrape job asynchronously.
1351
-
1352
- Args:
1353
- urls (List[str]): URLs to scrape
1354
- formats (Optional[List[Literal]]): Content formats to retrieve
1355
- headers (Optional[Dict[str, str]]): Custom HTTP headers
1356
- include_tags (Optional[List[str]]): HTML tags to include
1357
- exclude_tags (Optional[List[str]]): HTML tags to exclude
1358
- only_main_content (Optional[bool]): Extract main content only
1359
- wait_for (Optional[int]): Wait time in milliseconds
1360
- timeout (Optional[int]): Request timeout in milliseconds
1361
- location (Optional[LocationConfig]): Location configuration
1362
- mobile (Optional[bool]): Use mobile user agent
1363
- skip_tls_verification (Optional[bool]): Skip TLS verification
1364
- remove_base64_images (Optional[bool]): Remove base64 encoded images
1365
- block_ads (Optional[bool]): Block advertisements
1366
- proxy (Optional[Literal]): Proxy type to use
1367
- extract (Optional[JsonConfig]): Content extraction config
1368
- json_options (Optional[JsonConfig]): JSON extraction config
1369
- actions (Optional[List[Union]]): Actions to perform
1370
- agent (Optional[AgentOptions]): Agent configuration
1371
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1372
- **kwargs: Additional parameters to pass to the API
1373
-
1374
- Returns:
1375
- BatchScrapeResponse with:
1376
- * success - Whether job started successfully
1377
- * id - Unique identifier for the job
1378
- * url - Status check URL
1379
- * error - Error message if start failed
1380
-
1381
- Raises:
1382
- Exception: If job initiation fails
1383
- """
1384
- # Validate any additional kwargs
1385
- self._validate_kwargs(kwargs, "async_batch_scrape_urls")
1386
-
1387
- scrape_params = {}
1388
-
1389
- # Add individual parameters
1390
- if formats is not None:
1391
- scrape_params['formats'] = formats
1392
- if headers is not None:
1393
- scrape_params['headers'] = headers
1394
- if include_tags is not None:
1395
- scrape_params['includeTags'] = include_tags
1396
- if exclude_tags is not None:
1397
- scrape_params['excludeTags'] = exclude_tags
1398
- if only_main_content is not None:
1399
- scrape_params['onlyMainContent'] = only_main_content
1400
- if wait_for is not None:
1401
- scrape_params['waitFor'] = wait_for
1402
- if timeout is not None:
1403
- scrape_params['timeout'] = timeout
1404
- if location is not None:
1405
- scrape_params['location'] = location.dict(exclude_none=True)
1406
- if mobile is not None:
1407
- scrape_params['mobile'] = mobile
1408
- if skip_tls_verification is not None:
1409
- scrape_params['skipTlsVerification'] = skip_tls_verification
1410
- if remove_base64_images is not None:
1411
- scrape_params['removeBase64Images'] = remove_base64_images
1412
- if block_ads is not None:
1413
- scrape_params['blockAds'] = block_ads
1414
- if proxy is not None:
1415
- scrape_params['proxy'] = proxy
1416
- if extract is not None:
1417
- extract = self._ensure_schema_dict(extract)
1418
- if isinstance(extract, dict) and "schema" in extract:
1419
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
1420
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1421
- if json_options is not None:
1422
- json_options = self._ensure_schema_dict(json_options)
1423
- if isinstance(json_options, dict) and "schema" in json_options:
1424
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1425
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1426
- if actions is not None:
1427
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1428
- if agent is not None:
1429
- scrape_params['agent'] = agent.dict(exclude_none=True)
1430
-
1431
- # Add any additional kwargs
1432
- scrape_params.update(kwargs)
1433
-
1434
- # Create final params object
1435
- final_params = ScrapeParams(**scrape_params)
1436
- params_dict = final_params.dict(exclude_none=True)
1437
- params_dict['urls'] = urls
1438
- params_dict['origin'] = f"python-sdk@{version}"
1439
-
1440
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1441
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1442
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1443
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1444
-
1445
- # Make request
1446
- headers = self._prepare_headers(idempotency_key)
1447
- response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1448
-
1449
- if response.status_code == 200:
1450
- try:
1451
- return BatchScrapeResponse(**response.json())
1452
- except:
1453
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1454
- else:
1455
- self._handle_error(response, 'start batch scrape job')
1456
-
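The async variant only submits the job; pairing it with check_batch_scrape_status (defined below) gives a manual polling loop. An illustrative sketch, reusing the placeholder app object from the earlier example:

    import time

    job = app.async_batch_scrape_urls(["https://example.com"], formats=["markdown"])
    if job.success and job.id:
        while True:
            status = app.check_batch_scrape_status(job.id)
            if status.status in ("completed", "failed", "cancelled"):
                break
            time.sleep(2)  # simple fixed polling interval for the sketch
        print(f"{status.completed}/{status.total} pages, status={status.status}")
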
1457
- def batch_scrape_urls_and_watch(
1458
- self,
1459
- urls: List[str],
1460
- *,
1461
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1462
- headers: Optional[Dict[str, str]] = None,
1463
- include_tags: Optional[List[str]] = None,
1464
- exclude_tags: Optional[List[str]] = None,
1465
- only_main_content: Optional[bool] = None,
1466
- wait_for: Optional[int] = None,
1467
- timeout: Optional[int] = None,
1468
- location: Optional[LocationConfig] = None,
1469
- mobile: Optional[bool] = None,
1470
- skip_tls_verification: Optional[bool] = None,
1471
- remove_base64_images: Optional[bool] = None,
1472
- block_ads: Optional[bool] = None,
1473
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1474
- extract: Optional[JsonConfig] = None,
1475
- json_options: Optional[JsonConfig] = None,
1476
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1477
- agent: Optional[AgentOptions] = None,
1478
- idempotency_key: Optional[str] = None,
1479
- **kwargs
1480
- ) -> 'CrawlWatcher':
1481
- """
1482
- Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
1483
-
1484
- Args:
1485
- urls (List[str]): URLs to scrape
1486
- formats (Optional[List[Literal]]): Content formats to retrieve
1487
- headers (Optional[Dict[str, str]]): Custom HTTP headers
1488
- include_tags (Optional[List[str]]): HTML tags to include
1489
- exclude_tags (Optional[List[str]]): HTML tags to exclude
1490
- only_main_content (Optional[bool]): Extract main content only
1491
- wait_for (Optional[int]): Wait time in milliseconds
1492
- timeout (Optional[int]): Request timeout in milliseconds
1493
- location (Optional[LocationConfig]): Location configuration
1494
- mobile (Optional[bool]): Use mobile user agent
1495
- skip_tls_verification (Optional[bool]): Skip TLS verification
1496
- remove_base64_images (Optional[bool]): Remove base64 encoded images
1497
- block_ads (Optional[bool]): Block advertisements
1498
- proxy (Optional[Literal]): Proxy type to use
1499
- extract (Optional[JsonConfig]): Content extraction config
1500
- json_options (Optional[JsonConfig]): JSON extraction config
1501
- actions (Optional[List[Union]]): Actions to perform
1502
- agent (Optional[AgentOptions]): Agent configuration
1503
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1504
- **kwargs: Additional parameters to pass to the API
1505
-
1506
- Returns:
1507
- CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
1508
-
1509
- Raises:
1510
- Exception: If batch scrape job fails to start
1511
- """
1512
- # Validate any additional kwargs
1513
- self._validate_kwargs(kwargs, "batch_scrape_urls_and_watch")
1514
-
1515
- scrape_params = {}
1516
-
1517
- # Add individual parameters
1518
- if formats is not None:
1519
- scrape_params['formats'] = formats
1520
- if headers is not None:
1521
- scrape_params['headers'] = headers
1522
- if include_tags is not None:
1523
- scrape_params['includeTags'] = include_tags
1524
- if exclude_tags is not None:
1525
- scrape_params['excludeTags'] = exclude_tags
1526
- if only_main_content is not None:
1527
- scrape_params['onlyMainContent'] = only_main_content
1528
- if wait_for is not None:
1529
- scrape_params['waitFor'] = wait_for
1530
- if timeout is not None:
1531
- scrape_params['timeout'] = timeout
1532
- if location is not None:
1533
- scrape_params['location'] = location.dict(exclude_none=True)
1534
- if mobile is not None:
1535
- scrape_params['mobile'] = mobile
1536
- if skip_tls_verification is not None:
1537
- scrape_params['skipTlsVerification'] = skip_tls_verification
1538
- if remove_base64_images is not None:
1539
- scrape_params['removeBase64Images'] = remove_base64_images
1540
- if block_ads is not None:
1541
- scrape_params['blockAds'] = block_ads
1542
- if proxy is not None:
1543
- scrape_params['proxy'] = proxy
1544
- if extract is not None:
1545
- extract = self._ensure_schema_dict(extract)
1546
- if isinstance(extract, dict) and "schema" in extract:
1547
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
1548
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1549
- if json_options is not None:
1550
- json_options = self._ensure_schema_dict(json_options)
1551
- if isinstance(json_options, dict) and "schema" in json_options:
1552
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1553
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1554
- if actions is not None:
1555
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1556
- if agent is not None:
1557
- scrape_params['agent'] = agent.dict(exclude_none=True)
1558
-
1559
- # Add any additional kwargs
1560
- scrape_params.update(kwargs)
1561
-
1562
- # Create final params object
1563
- final_params = ScrapeParams(**scrape_params)
1564
- params_dict = final_params.dict(exclude_none=True)
1565
- params_dict['urls'] = urls
1566
- params_dict['origin'] = f"python-sdk@{version}"
1567
-
1568
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1569
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1570
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1571
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1572
-
1573
- # Make request
1574
- headers = self._prepare_headers(idempotency_key)
1575
- response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1576
-
1577
- if response.status_code == 200:
1578
- try:
1579
- crawl_response = BatchScrapeResponse(**response.json())
1580
- except:
1581
- raise Exception('Failed to parse Firecrawl response as JSON.')
1582
- if crawl_response.success and crawl_response.id:
1583
- return CrawlWatcher(crawl_response.id, self)
1584
- else:
1585
- raise Exception('Batch scrape job failed to start')
1586
- else:
1587
- self._handle_error(response, 'start batch scrape job')
1588
-
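batch_scrape_urls_and_watch returns the CrawlWatcher defined near the end of this module; its connect() coroutine streams document/done/error events over WebSocket. A rough, illustrative sketch of how monitoring looks (placeholder app object as above):

    import asyncio

    watcher = app.batch_scrape_urls_and_watch(["https://example.com"], formats=["markdown"])
    watcher.add_event_listener("document", lambda detail: print("scraped a page for job", detail["id"]))
    watcher.add_event_listener("done", lambda detail: print("finished with", len(detail["data"]), "documents"))
    asyncio.run(watcher.connect())  # streams events until the 'done' or 'error' message arrives
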
1589
- def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
1590
- """
1591
- Check the status of a batch scrape job using the Firecrawl API.
1592
-
1593
- Args:
1594
- id (str): The ID of the batch scrape job.
1595
-
1596
- Returns:
1597
- BatchScrapeStatusResponse: The status of the batch scrape job.
1598
-
1599
- Raises:
1600
- Exception: If the status check request fails.
1601
- """
1602
- endpoint = f'/v1/batch/scrape/{id}'
1603
-
1604
- headers = self._prepare_headers()
1605
- response = self._get_request(f'{self.api_url}{endpoint}', headers)
1606
- if response.status_code == 200:
1607
- try:
1608
- status_data = response.json()
1609
- except:
1610
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1611
- if status_data['status'] == 'completed':
1612
- if 'data' in status_data:
1613
- data = status_data['data']
1614
- while 'next' in status_data:
1615
- if len(status_data['data']) == 0:
1616
- break
1617
- next_url = status_data.get('next')
1618
- if not next_url:
1619
- logger.warning("Expected 'next' URL is missing.")
1620
- break
1621
- try:
1622
- status_response = self._get_request(next_url, headers)
1623
- if status_response.status_code != 200:
1624
- logger.error(f"Failed to fetch next page: {status_response.status_code}")
1625
- break
1626
- try:
1627
- next_data = status_response.json()
1628
- except:
1629
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1630
- data.extend(next_data.get('data', []))
1631
- status_data = next_data
1632
- except Exception as e:
1633
- logger.error(f"Error during pagination request: {e}")
1634
- break
1635
- status_data['data'] = data
1636
-
1637
- return BatchScrapeStatusResponse(**{
1638
- 'success': False if 'error' in status_data else True,
1639
- 'status': status_data.get('status'),
1640
- 'total': status_data.get('total'),
1641
- 'completed': status_data.get('completed'),
1642
- 'creditsUsed': status_data.get('creditsUsed'),
1643
- 'expiresAt': status_data.get('expiresAt'),
1644
- 'data': status_data.get('data'),
1645
- 'next': status_data.get('next'),
1646
- 'error': status_data.get('error')
1647
- })
1648
- else:
1649
- self._handle_error(response, 'check batch scrape status')
1650
-
1651
- def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
1652
- """
1653
- Returns information about batch scrape errors.
1654
-
1655
- Args:
1656
- id (str): The ID of the batch scrape job.
1657
-
1658
- Returns:
1659
- CrawlErrorsResponse containing:
1660
- * errors (List[Dict[str, str]]): List of errors with fields:
1661
- * id (str): Error ID
1662
- * timestamp (str): When the error occurred
1663
- * url (str): URL that caused the error
1664
- * error (str): Error message
1665
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1666
-
1667
- Raises:
1668
- Exception: If the error check request fails
1669
- """
1670
- headers = self._prepare_headers()
1671
- response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
1672
- if response.status_code == 200:
1673
- try:
1674
- return CrawlErrorsResponse(**response.json())
1675
- except:
1676
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1677
- else:
1678
- self._handle_error(response, "check batch scrape errors")
1679
-
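A short sketch of inspecting failures after a batch job, assuming the response attributes mirror the documented CrawlErrorsResponse fields and reusing the job object from the async sketch above:

    report = app.check_batch_scrape_errors(job.id)
    for err in report.errors:
        print(err.get("url"), "->", err.get("error"))
    print("Blocked by robots.txt:", report.robotsBlocked)
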
1680
- def extract(
1681
- self,
1682
- urls: Optional[List[str]] = None,
1683
- *,
1684
- prompt: Optional[str] = None,
1685
- schema: Optional[Any] = None,
1686
- system_prompt: Optional[str] = None,
1687
- allow_external_links: Optional[bool] = False,
1688
- enable_web_search: Optional[bool] = False,
1689
- show_sources: Optional[bool] = False,
1690
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1691
- """
1692
- Extract structured information from URLs.
1693
-
1694
- Args:
1695
- urls (Optional[List[str]]): URLs to extract from
1696
- prompt (Optional[str]): Custom extraction prompt
1697
- schema (Optional[Any]): JSON schema/Pydantic model
1698
- system_prompt (Optional[str]): System context
1699
- allow_external_links (Optional[bool]): Follow external links
1700
- enable_web_search (Optional[bool]): Enable web search
1701
- show_sources (Optional[bool]): Include source URLs
1702
- agent (Optional[Dict[str, Any]]): Agent configuration
1703
-
1704
- Returns:
1705
- ExtractResponse[Any] with:
1706
- * success (bool): Whether request succeeded
1707
- * data (Optional[Any]): Extracted data matching schema
1708
- * error (Optional[str]): Error message if any
1709
-
1710
- Raises:
1711
- ValueError: If prompt/schema missing or extraction fails
1712
- """
1713
- headers = self._prepare_headers()
1714
-
1715
- if not prompt and not schema:
1716
- raise ValueError("Either prompt or schema is required")
1717
-
1718
- if not urls and not prompt:
1719
- raise ValueError("Either urls or prompt is required")
1720
-
1721
- if schema:
1722
- schema = self._ensure_schema_dict(schema)
1723
-
1724
- request_data = {
1725
- 'urls': urls or [],
1726
- 'allowExternalLinks': allow_external_links,
1727
- 'enableWebSearch': enable_web_search,
1728
- 'showSources': show_sources,
1729
- 'schema': schema,
1730
- 'origin': f'python-sdk@{get_version()}'
1731
- }
1732
-
1733
- # Only add prompt and systemPrompt if they exist
1734
- if prompt:
1735
- request_data['prompt'] = prompt
1736
- if system_prompt:
1737
- request_data['systemPrompt'] = system_prompt
1738
-
1739
- if agent:
1740
- request_data['agent'] = agent
1741
-
1742
- try:
1743
- # Send the initial extract request
1744
- response = self._post_request(
1745
- f'{self.api_url}/v1/extract',
1746
- request_data,
1747
- headers
1748
- )
1749
- if response.status_code == 200:
1750
- try:
1751
- data = response.json()
1752
- except:
1753
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1754
- if data['success']:
1755
- job_id = data.get('id')
1756
- if not job_id:
1757
- raise Exception('Job ID not returned from extract request.')
1758
-
1759
- # Poll for the extract status
1760
- while True:
1761
- status_response = self._get_request(
1762
- f'{self.api_url}/v1/extract/{job_id}',
1763
- headers
1764
- )
1765
- if status_response.status_code == 200:
1766
- try:
1767
- status_data = status_response.json()
1768
- except:
1769
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1770
- if status_data['status'] == 'completed':
1771
- return ExtractResponse(**status_data)
1772
- elif status_data['status'] in ['failed', 'cancelled']:
1773
- raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
1774
- else:
1775
- self._handle_error(status_response, "extract-status")
1776
-
1777
- time.sleep(2) # Polling interval
1778
- else:
1779
- raise Exception(f'Failed to extract. Error: {data["error"]}')
1780
- else:
1781
- self._handle_error(response, "extract")
1782
- except Exception as e:
1783
- raise ValueError(str(e), 500)
1784
-
1785
- return ExtractResponse(success=False, error="Internal server error.")
1786
-
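Because _ensure_schema_dict (defined later in the module) converts Pydantic model classes to JSON schema dicts, extract can be driven directly from a model. An illustrative sketch with placeholder URLs:

    from pydantic import BaseModel

    class Product(BaseModel):
        name: str
        price: float

    result = app.extract(
        ["https://example.com/shop"],
        prompt="List every product with its price.",
        schema=Product,  # converted to a plain JSON schema dict before the request is sent
    )
    if result.success:
        print(result.data)
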
1787
- def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
1788
- """
1789
- Retrieve the status of an extract job.
1790
-
1791
- Args:
1792
- job_id (str): The ID of the extract job.
1793
-
1794
- Returns:
1795
- ExtractResponse[Any]: The status of the extract job.
1796
-
1797
- Raises:
1798
- ValueError: If there is an error retrieving the status.
1799
- """
1800
- headers = self._prepare_headers()
1801
- try:
1802
- response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
1803
- if response.status_code == 200:
1804
- try:
1805
- return ExtractResponse(**response.json())
1806
- except:
1807
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1808
- else:
1809
- self._handle_error(response, "get extract status")
1810
- except Exception as e:
1811
- raise ValueError(str(e), 500)
1812
-
1813
- def async_extract(
1814
- self,
1815
- urls: Optional[List[str]] = None,
1816
- *,
1817
- prompt: Optional[str] = None,
1818
- schema: Optional[Any] = None,
1819
- system_prompt: Optional[str] = None,
1820
- allow_external_links: Optional[bool] = False,
1821
- enable_web_search: Optional[bool] = False,
1822
- show_sources: Optional[bool] = False,
1823
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1824
- """
1825
- Initiate an asynchronous extract job.
1826
-
1827
- Args:
1828
- urls (List[str]): URLs to extract information from
1829
- prompt (Optional[str]): Custom extraction prompt
1830
- schema (Optional[Any]): JSON schema/Pydantic model
1831
- system_prompt (Optional[str]): System context
1832
- allow_external_links (Optional[bool]): Follow external links
1833
- enable_web_search (Optional[bool]): Enable web search
1834
- show_sources (Optional[bool]): Include source URLs
1835
- agent (Optional[Dict[str, Any]]): Agent configuration
1836
-
1838
- Returns:
1839
- ExtractResponse[Any] with:
1840
- * success (bool): Whether request succeeded
1841
- * data (Optional[Any]): Extracted data matching schema
1842
- * error (Optional[str]): Error message if any
1843
-
1844
- Raises:
1845
- ValueError: If job initiation fails
1846
- """
1847
- headers = self._prepare_headers()
1848
-
1849
- if schema:
1851
- schema = self._ensure_schema_dict(schema)
1852
-
1853
- request_data = {
1854
- 'urls': urls,
1855
- 'allowExternalLinks': allow_external_links,
1856
- 'enableWebSearch': enable_web_search,
1857
- 'showSources': show_sources,
1858
- 'schema': schema,
1859
- 'origin': f'python-sdk@{version}'
1860
- }
1861
-
1862
- if prompt:
1863
- request_data['prompt'] = prompt
1864
- if system_prompt:
1865
- request_data['systemPrompt'] = system_prompt
1866
- if agent:
1867
- request_data['agent'] = agent
1868
-
1869
- try:
1870
- response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
1871
- if response.status_code == 200:
1872
- try:
1873
- return ExtractResponse(**response.json())
1874
- except:
1875
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1876
- else:
1877
- self._handle_error(response, "async extract")
1878
- except Exception as e:
1879
- raise ValueError(str(e), 500)
1880
-
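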
1881
- def generate_llms_text(
1882
- self,
1883
- url: str,
1884
- *,
1885
- max_urls: Optional[int] = None,
1886
- show_full_text: Optional[bool] = None,
1887
- cache: Optional[bool] = None,
1888
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
1889
- """
1890
- Generate LLMs.txt for a given URL and poll until completion.
1891
-
1892
- Args:
1893
- url (str): Target URL to generate LLMs.txt from
1894
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
1895
- show_full_text (Optional[bool]): Include full text in output (default: False)
1896
- cache (Optional[bool]): Whether to use cached content if available (default: True)
1897
- experimental_stream (Optional[bool]): Enable experimental streaming
1898
-
1899
- Returns:
1900
- GenerateLLMsTextStatusResponse with:
1901
- * Generated LLMs.txt content
1902
- * Full version if requested
1903
- * Generation status
1904
- * Success/error information
1905
-
1906
- Raises:
1907
- Exception: If generation fails
1908
- """
1909
- params = GenerateLLMsTextParams(
1910
- maxUrls=max_urls,
1911
- showFullText=show_full_text,
1912
- cache=cache,
1913
- __experimental_stream=experimental_stream
1914
- )
1915
-
1916
- response = self.async_generate_llms_text(
1917
- url,
1918
- max_urls=max_urls,
1919
- show_full_text=show_full_text,
1920
- cache=cache,
1921
- experimental_stream=experimental_stream
1922
- )
1923
-
1924
- if not response.success or not response.id:
1925
- return GenerateLLMsTextStatusResponse(
1926
- success=False,
1927
- error='Failed to start LLMs.txt generation',
1928
- status='failed',
1929
- expiresAt=''
1930
- )
1931
-
1932
- job_id = response.id
1933
- while True:
1934
- status = self.check_generate_llms_text_status(job_id)
1935
-
1936
- if status.status == 'completed':
1937
- return status
1938
- elif status.status == 'failed':
1939
- return status
1940
- elif status.status != 'processing':
1941
- return GenerateLLMsTextStatusResponse(
1942
- success=False,
1943
- error='LLMs.txt generation job terminated unexpectedly',
1944
- status='failed',
1945
- expiresAt=''
1946
- )
1947
-
1948
- time.sleep(2) # Polling interval
1949
-
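A minimal call to the blocking LLMs.txt generator; per the status docstring below, the generated text is expected under data['llmstxt'] once the job completes. Illustrative only, with a placeholder URL:

    llms = app.generate_llms_text("https://example.com", max_urls=5, show_full_text=False)
    if llms.success and llms.status == "completed" and llms.data:
        print(llms.data.get("llmstxt", "")[:500])
    else:
        print("generation failed:", llms.error)
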
1950
- def async_generate_llms_text(
1951
- self,
1952
- url: str,
1953
- *,
1954
- max_urls: Optional[int] = None,
1955
- show_full_text: Optional[bool] = None,
1956
- cache: Optional[bool] = None,
1957
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
1958
- """
1959
- Initiate an asynchronous LLMs.txt generation operation.
1960
-
1961
- Args:
1962
- url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
1963
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
1964
- show_full_text (Optional[bool]): Include full text in output (default: False)
1965
- cache (Optional[bool]): Whether to use cached content if available (default: True)
1966
- experimental_stream (Optional[bool]): Enable experimental streaming
1967
-
1968
- Returns:
1969
- GenerateLLMsTextResponse: A response containing:
1970
- * success (bool): Whether the generation initiation was successful
1971
- * id (str): The unique identifier for the generation job
1972
- * error (str, optional): Error message if initiation failed
1973
-
1974
- Raises:
1975
- Exception: If the generation job initiation fails.
1976
- """
1977
- params = GenerateLLMsTextParams(
1978
- maxUrls=max_urls,
1979
- showFullText=show_full_text,
1980
- cache=cache,
1981
- __experimental_stream=experimental_stream
1982
- )
1983
-
1984
- headers = self._prepare_headers()
1985
- json_data = {'url': url, **params.dict(exclude_none=True)}
1986
- json_data['origin'] = f"python-sdk@{version}"
1987
-
1988
- try:
1989
- req = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
1990
- response = req.json()
1991
- print("json_data", json_data)
1992
- print("response", response)
1993
- if response.get('success'):
1994
- try:
1995
- return GenerateLLMsTextResponse(**response)
1996
- except:
1997
- raise Exception('Failed to parse Firecrawl response as JSON.')
1998
- else:
1999
- self._handle_error(response, 'start LLMs.txt generation')
2000
- except Exception as e:
2001
- raise ValueError(str(e))
2002
-
2003
- return GenerateLLMsTextResponse(
2004
- success=False,
2005
- error='Internal server error'
2006
- )
2007
-
2008
- def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
2009
- """
2010
- Check the status of an LLMs.txt generation operation.
2011
-
2012
- Args:
2013
- id (str): The unique identifier of the LLMs.txt generation job to check status for.
2014
-
2015
- Returns:
2016
- GenerateLLMsTextStatusResponse: A response containing:
2017
- * success (bool): Whether the generation was successful
2018
- * status (str): Status of generation ("processing", "completed", "failed")
2019
- * data (Dict[str, str], optional): Generated text with fields:
2020
- * llmstxt (str): Generated LLMs.txt content
2021
- * llmsfulltxt (str, optional): Full version if requested
2022
- * error (str, optional): Error message if generation failed
2023
- * expiresAt (str): When the generated data expires
2024
-
2025
- Raises:
2026
- Exception: If the status check fails.
2027
- """
2028
- headers = self._prepare_headers()
2029
- try:
2030
- response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers)
2031
- if response.status_code == 200:
2032
- try:
2033
- json_data = response.json()
2034
- return GenerateLLMsTextStatusResponse(**json_data)
2035
- except Exception as e:
2036
- raise Exception(f'Failed to parse Firecrawl response as GenerateLLMsTextStatusResponse: {str(e)}')
2037
- elif response.status_code == 404:
2038
- raise Exception('LLMs.txt generation job not found')
2039
- else:
2040
- self._handle_error(response, 'check LLMs.txt generation status')
2041
- except Exception as e:
2042
- raise ValueError(str(e))
2043
-
2044
- return GenerateLLMsTextStatusResponse(success=False, error='Internal server error', status='failed', expiresAt='')
2045
-
2046
- def _prepare_headers(
2047
- self,
2048
- idempotency_key: Optional[str] = None) -> Dict[str, str]:
2049
- """
2050
- Prepare the headers for API requests.
2051
-
2052
- Args:
2053
- idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
2054
-
2055
- Returns:
2056
- Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
2057
- """
2058
- if idempotency_key:
2059
- return {
2060
- 'Content-Type': 'application/json',
2061
- 'Authorization': f'Bearer {self.api_key}',
2062
- 'x-idempotency-key': idempotency_key
2063
- }
2064
-
2065
- return {
2066
- 'Content-Type': 'application/json',
2067
- 'Authorization': f'Bearer {self.api_key}',
2068
- }
2069
-
2070
- def _post_request(
2071
- self,
2072
- url: str,
2073
- data: Dict[str, Any],
2074
- headers: Dict[str, str],
2075
- retries: int = 3,
2076
- backoff_factor: float = 0.5) -> requests.Response:
2077
- """
2078
- Make a POST request with retries.
2079
-
2080
- Args:
2081
- url (str): The URL to send the POST request to.
2082
- data (Dict[str, Any]): The JSON data to include in the POST request.
2083
- headers (Dict[str, str]): The headers to include in the POST request.
2084
- retries (int): Number of retries for the request.
2085
- backoff_factor (float): Backoff factor for retries.
2086
-
2087
- Returns:
2088
- requests.Response: The response from the POST request.
2089
-
2090
- Raises:
2091
- requests.RequestException: If the request fails after the specified retries.
2092
- """
2093
- for attempt in range(retries):
2094
- response = requests.post(url, headers=headers, json=data, timeout=((data["timeout"] + 5000) if "timeout" in data else None))
2095
- if response.status_code == 502:
2096
- time.sleep(backoff_factor * (2 ** attempt))
2097
- else:
2098
- return response
2099
- return response
2100
-
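The HTTP helpers above retry only on 502 responses, sleeping backoff_factor * (2 ** attempt) seconds between attempts; with the defaults that gives 0.5 s, 1 s and 2 s before the last response is returned as-is. A quick check of that schedule:

    backoff_factor, retries = 0.5, 3
    print([backoff_factor * (2 ** attempt) for attempt in range(retries)])  # [0.5, 1.0, 2.0]
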
2101
- def _get_request(
2102
- self,
2103
- url: str,
2104
- headers: Dict[str, str],
2105
- retries: int = 3,
2106
- backoff_factor: float = 0.5) -> requests.Response:
2107
- """
2108
- Make a GET request with retries.
2109
-
2110
- Args:
2111
- url (str): The URL to send the GET request to.
2112
- headers (Dict[str, str]): The headers to include in the GET request.
2113
- retries (int): Number of retries for the request.
2114
- backoff_factor (float): Backoff factor for retries.
2115
-
2116
- Returns:
2117
- requests.Response: The response from the GET request.
2118
-
2119
- Raises:
2120
- requests.RequestException: If the request fails after the specified retries.
2121
- """
2122
- for attempt in range(retries):
2123
- response = requests.get(url, headers=headers)
2124
- if response.status_code == 502:
2125
- time.sleep(backoff_factor * (2 ** attempt))
2126
- else:
2127
- return response
2128
- return response
2129
-
2130
- def _delete_request(
2131
- self,
2132
- url: str,
2133
- headers: Dict[str, str],
2134
- retries: int = 3,
2135
- backoff_factor: float = 0.5) -> requests.Response:
2136
- """
2137
- Make a DELETE request with retries.
2138
-
2139
- Args:
2140
- url (str): The URL to send the DELETE request to.
2141
- headers (Dict[str, str]): The headers to include in the DELETE request.
2142
- retries (int): Number of retries for the request.
2143
- backoff_factor (float): Backoff factor for retries.
2144
-
2145
- Returns:
2146
- requests.Response: The response from the DELETE request.
2147
-
2148
- Raises:
2149
- requests.RequestException: If the request fails after the specified retries.
2150
- """
2151
- for attempt in range(retries):
2152
- response = requests.delete(url, headers=headers)
2153
- if response.status_code == 502:
2154
- time.sleep(backoff_factor * (2 ** attempt))
2155
- else:
2156
- return response
2157
- return response
2158
-
2159
- def _monitor_job_status(
2160
- self,
2161
- id: str,
2162
- headers: Dict[str, str],
2163
- poll_interval: int) -> CrawlStatusResponse:
2164
- """
2165
- Monitor the status of a crawl job until completion.
2166
-
2167
- Args:
2168
- id (str): The ID of the crawl job.
2169
- headers (Dict[str, str]): The headers to include in the status check requests.
2170
- poll_interval (int): Seconds between status checks.
2171
-
2172
- Returns:
2173
- CrawlStatusResponse: The crawl results if the job is completed successfully.
2174
-
2175
- Raises:
2176
- Exception: If the job fails or an error occurs during status checks.
2177
- """
2178
- while True:
2179
- api_url = f'{self.api_url}/v1/crawl/{id}'
2180
-
2181
- status_response = self._get_request(api_url, headers)
2182
- if status_response.status_code == 200:
2183
- try:
2184
- status_data = status_response.json()
2185
- except:
2186
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
2187
- if status_data['status'] == 'completed':
2188
- if 'data' in status_data:
2189
- data = status_data['data']
2190
- while 'next' in status_data:
2191
- if len(status_data['data']) == 0:
2192
- break
2193
- status_response = self._get_request(status_data['next'], headers)
2194
- try:
2195
- status_data = status_response.json()
2196
- except:
2197
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
2198
- data.extend(status_data.get('data', []))
2199
- status_data['data'] = data
2200
- return CrawlStatusResponse(**status_data)
2201
- else:
2202
- raise Exception('Crawl job completed but no data was returned')
2203
- elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
2204
- poll_interval = max(poll_interval, 2)
2205
- time.sleep(poll_interval) # Wait for the specified interval before checking again
2206
- else:
2207
- raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
2208
- else:
2209
- self._handle_error(status_response, 'check crawl status')
2210
-
2211
- def _handle_error(
2212
- self,
2213
- response: requests.Response,
2214
- action: str) -> None:
2215
- """
2216
- Handle errors from API responses.
2217
-
2218
- Args:
2219
- response (requests.Response): The response object from the API request.
2220
- action (str): Description of the action that was being performed.
2221
-
2222
- Raises:
2223
- Exception: An exception with a message containing the status code and error details from the response.
2224
- """
2225
- try:
2226
- error_message = response.json().get('error', 'No error message provided.')
2227
- error_details = response.json().get('details', 'No additional error details provided.')
2228
- except:
2229
- raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
2230
-
2231
- message = self._get_error_message(response.status_code, action, error_message, error_details)
2232
-
2233
- # Raise an HTTPError with the custom message and attach the response
2234
- raise requests.exceptions.HTTPError(message, response=response)
2235
-
2236
- def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2237
- """
2238
- Generate a standardized error message based on HTTP status code.
2239
-
2240
- Args:
2241
- status_code (int): The HTTP status code from the response
2242
- action (str): Description of the action that was being performed
2243
- error_message (str): The error message from the API response
2244
- error_details (str): Additional error details from the API response
2245
-
2246
- Returns:
2247
- str: A formatted error message
2248
- """
2249
- if status_code == 402:
2250
- return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
2251
- elif status_code == 403:
2252
- message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
2253
- elif status_code == 408:
2254
- return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
2255
- elif status_code == 409:
2256
- return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
2257
- elif status_code == 500:
2258
- return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
2259
- else:
2260
- return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
2261
-
2262
- def deep_research(
2263
- self,
2264
- query: str,
2265
- *,
2266
- max_depth: Optional[int] = None,
2267
- time_limit: Optional[int] = None,
2268
- max_urls: Optional[int] = None,
2269
- analysis_prompt: Optional[str] = None,
2270
- system_prompt: Optional[str] = None,
2271
- __experimental_stream_steps: Optional[bool] = None,
2272
- on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
2273
- on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
2274
- """
2275
- Initiates a deep research operation on a given query and polls until completion.
2276
-
2277
- Args:
2278
- query (str): Research query or topic to investigate
2279
- max_depth (Optional[int]): Maximum depth of research exploration
2280
- time_limit (Optional[int]): Time limit in seconds for research
2281
- max_urls (Optional[int]): Maximum number of URLs to process
2282
- analysis_prompt (Optional[str]): Custom prompt for analysis
2283
- system_prompt (Optional[str]): Custom system prompt
2284
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2285
- on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
2286
- on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
2287
-
2288
- Returns:
2289
- DeepResearchStatusResponse containing:
2290
- * success (bool): Whether research completed successfully
2291
- * status (str): Current state (processing/completed/failed)
2292
- * error (Optional[str]): Error message if failed
2293
- * id (str): Unique identifier for the research job
2294
- * data (Any): Research findings and analysis
2295
- * sources (List[Dict]): List of discovered sources
2296
- * activities (List[Dict]): Research progress log
2297
- * summaries (List[str]): Generated research summaries
2298
-
2299
- Raises:
2300
- Exception: If research fails
2301
- """
2302
- research_params = {}
2303
- if max_depth is not None:
2304
- research_params['maxDepth'] = max_depth
2305
- if time_limit is not None:
2306
- research_params['timeLimit'] = time_limit
2307
- if max_urls is not None:
2308
- research_params['maxUrls'] = max_urls
2309
- if analysis_prompt is not None:
2310
- research_params['analysisPrompt'] = analysis_prompt
2311
- if system_prompt is not None:
2312
- research_params['systemPrompt'] = system_prompt
2313
- if __experimental_stream_steps is not None:
2314
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
2315
- research_params = DeepResearchParams(**research_params)
2316
-
2317
- response = self.async_deep_research(
2318
- query,
2319
- max_depth=max_depth,
2320
- time_limit=time_limit,
2321
- max_urls=max_urls,
2322
- analysis_prompt=analysis_prompt,
2323
- system_prompt=system_prompt
2324
- )
2325
- if not response.get('success') or 'id' not in response:
2326
- return response
2327
-
2328
- job_id = response['id']
2329
- last_activity_count = 0
2330
- last_source_count = 0
2331
-
2332
- while True:
2333
- status = self.check_deep_research_status(job_id)
2334
-
2335
- if on_activity and 'activities' in status:
2336
- new_activities = status['activities'][last_activity_count:]
2337
- for activity in new_activities:
2338
- on_activity(activity)
2339
- last_activity_count = len(status['activities'])
2340
-
2341
- if on_source and 'sources' in status:
2342
- new_sources = status['sources'][last_source_count:]
2343
- for source in new_sources:
2344
- on_source(source)
2345
- last_source_count = len(status['sources'])
2346
-
2347
- if status['status'] == 'completed':
2348
- return status
2349
- elif status['status'] == 'failed':
2350
- raise Exception(f'Deep research failed. Error: {status.get("error")}')
2351
- elif status['status'] != 'processing':
2352
- break
2353
-
2354
- time.sleep(2) # Polling interval
2355
-
2356
- return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
2357
-
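An illustrative deep research call wiring up the progress callbacks; note that, as written above, the method returns plain dicts, so results are read with .get(). The query is a placeholder:

    research = app.deep_research(
        "How do large-scale web crawlers handle politeness and rate limiting?",
        max_depth=3,
        time_limit=120,
        on_activity=lambda a: print("[activity]", a.get("message")),
        on_source=lambda s: print("[source]", s.get("url")),
    )
    if research.get("success"):
        print(research.get("data"))
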
2358
- def async_deep_research(
2359
- self,
2360
- query: str,
2361
- *,
2362
- max_depth: Optional[int] = None,
2363
- time_limit: Optional[int] = None,
2364
- max_urls: Optional[int] = None,
2365
- analysis_prompt: Optional[str] = None,
2366
- system_prompt: Optional[str] = None,
2367
- __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
2368
- """
2369
- Initiates an asynchronous deep research operation.
2370
-
2371
- Args:
2372
- query (str): Research query or topic to investigate
2373
- max_depth (Optional[int]): Maximum depth of research exploration
2374
- time_limit (Optional[int]): Time limit in seconds for research
2375
- max_urls (Optional[int]): Maximum number of URLs to process
2376
- analysis_prompt (Optional[str]): Custom prompt for analysis
2377
- system_prompt (Optional[str]): Custom system prompt
2378
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2379
-
2380
- Returns:
2381
- Dict[str, Any]: A response containing:
2382
- * success (bool): Whether the research initiation was successful
2383
- * id (str): The unique identifier for the research job
2384
- * error (str, optional): Error message if initiation failed
2385
-
2386
- Raises:
2387
- Exception: If the research initiation fails.
2388
- """
2389
- research_params = {}
2390
- if max_depth is not None:
2391
- research_params['maxDepth'] = max_depth
2392
- if time_limit is not None:
2393
- research_params['timeLimit'] = time_limit
2394
- if max_urls is not None:
2395
- research_params['maxUrls'] = max_urls
2396
- if analysis_prompt is not None:
2397
- research_params['analysisPrompt'] = analysis_prompt
2398
- if system_prompt is not None:
2399
- research_params['systemPrompt'] = system_prompt
2400
- if __experimental_stream_steps is not None:
2401
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
2402
- research_params = DeepResearchParams(**research_params)
2403
-
2404
- headers = self._prepare_headers()
2405
-
2406
- json_data = {'query': query, **research_params.dict(exclude_none=True)}
2407
- json_data['origin'] = f"python-sdk@{version}"
2408
-
2409
- # Handle json options schema if present
2410
- if 'jsonOptions' in json_data:
2411
- json_opts = json_data['jsonOptions']
2412
- if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
2413
- json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
2414
-
2415
- try:
2416
- response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
2417
- if response.status_code == 200:
2418
- try:
2419
- return response.json()
2420
- except:
2421
- raise Exception('Failed to parse Firecrawl response as JSON.')
2422
- else:
2423
- self._handle_error(response, 'start deep research')
2424
- except Exception as e:
2425
- raise ValueError(str(e))
2426
-
2427
- return {'success': False, 'error': 'Internal server error'}
2428
-
2429
- def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
2430
- """
2431
- Check the status of a deep research operation.
2432
-
2433
- Args:
2434
- id (str): The ID of the deep research operation.
2435
-
2436
- Returns:
2437
- DeepResearchStatusResponse containing:
2438
-
2439
- Status:
2440
- * success - Whether research completed successfully
2441
- * status - Current state (processing/completed/failed)
2442
- * error - Error message if failed
2443
-
2444
- Results:
2445
- * id - Unique identifier for the research job
2446
- * data - Research findings and analysis
2447
- * sources - List of discovered sources
2448
- * activities - Research progress log
2449
- * summaries - Generated research summaries
2450
-
2451
- Raises:
2452
- Exception: If the status check fails.
2453
- """
2454
- headers = self._prepare_headers()
2455
- try:
2456
- response = self._get_request(f'{self.api_url}/v1/deep-research/{id}', headers)
2457
- if response.status_code == 200:
2458
- try:
2459
- return response.json()
2460
- except:
2461
- raise Exception('Failed to parse Firecrawl response as JSON.')
2462
- elif response.status_code == 404:
2463
- raise Exception('Deep research job not found')
2464
- else:
2465
- self._handle_error(response, 'check deep research status')
2466
- except Exception as e:
2467
- raise ValueError(str(e))
2468
-
2469
- return {'success': False, 'error': 'Internal server error'}
2470
-
2471
- def _validate_kwargs(self, kwargs: Dict[str, Any], method_name: str) -> None:
2472
- """
2473
- Validate additional keyword arguments before they are passed to the API.
2474
- This provides early validation before the Pydantic model validation.
2475
-
2476
- Args:
2477
- kwargs (Dict[str, Any]): Additional keyword arguments to validate
2478
- method_name (str): Name of the method these kwargs are for
2479
-
2480
- Raises:
2481
- ValueError: If kwargs contain invalid or unsupported parameters
2482
- """
2483
- if not kwargs:
2484
- return
2485
-
2486
- # Known parameter mappings for each method
2487
- method_params = {
2488
- "scrape_url": {"formats", "include_tags", "exclude_tags", "only_main_content", "wait_for",
2489
- "timeout", "location", "mobile", "skip_tls_verification", "remove_base64_images",
2490
- "block_ads", "proxy", "extract", "json_options", "actions", "change_tracking_options"},
2491
- "search": {"limit", "tbs", "filter", "lang", "country", "location", "timeout", "scrape_options"},
2492
- "crawl_url": {"include_paths", "exclude_paths", "max_depth", "max_discovery_depth", "limit",
2493
- "allow_backward_links", "allow_external_links", "ignore_sitemap", "scrape_options",
2494
- "webhook", "deduplicate_similar_urls", "ignore_query_parameters", "regex_on_full_url"},
2495
- "map_url": {"search", "ignore_sitemap", "include_subdomains", "sitemap_only", "limit", "timeout"},
2496
- "batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2497
- "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2498
- "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2499
- "actions", "agent", "webhook"},
2500
- "async_batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2501
- "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2502
- "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2503
- "actions", "agent", "webhook"},
2504
- "batch_scrape_urls_and_watch": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2505
- "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2506
- "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2507
- "actions", "agent", "webhook"}
2508
- }
2509
-
2510
- # Get allowed parameters for this method
2511
- allowed_params = method_params.get(method_name, set())
2512
-
2513
- # Check for unknown parameters
2514
- unknown_params = set(kwargs.keys()) - allowed_params
2515
- if unknown_params:
2516
- raise ValueError(f"Unsupported parameter(s) for {method_name}: {', '.join(unknown_params)}. Please refer to the API documentation for the correct parameters.")
2517
-
2518
- # Additional type validation can be added here if needed
2519
- # For now, we rely on Pydantic models for detailed type validation
2520
-
2521
- def _ensure_schema_dict(self, schema):
2522
- """
2523
- Utility to ensure a schema is a dict, not a Pydantic model class. Recursively checks dicts and lists.
2524
- """
2525
- if schema is None:
2526
- return schema
2527
- if isinstance(schema, type):
2528
- # Pydantic v1/v2 model class
2529
- if hasattr(schema, 'model_json_schema'):
2530
- return schema.model_json_schema()
2531
- elif hasattr(schema, 'schema'):
2532
- return schema.schema()
2533
- if isinstance(schema, dict):
2534
- return {k: self._ensure_schema_dict(v) for k, v in schema.items()}
2535
- if isinstance(schema, (list, tuple)):
2536
- return [self._ensure_schema_dict(v) for v in schema]
2537
- return schema
2538
-
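_ensure_schema_dict is what lets model classes stand in for JSON schema dicts anywhere in a params tree: it prefers model_json_schema() (Pydantic v2), falls back to .schema() (v1), and recurses through dicts and lists. A small demonstration under those assumptions, reusing the placeholder app object:

    from pydantic import BaseModel

    class Article(BaseModel):
        title: str
        url: str

    converted = app._ensure_schema_dict({"type": "array", "items": Article})
    print(list(converted["items"]["properties"]))  # ['title', 'url']
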
2539
- class CrawlWatcher:
2540
- """
2541
- A class to watch and handle crawl job events via WebSocket connection.
2542
-
2543
- Attributes:
2544
- id (str): The ID of the crawl job to watch
2545
- app (FirecrawlApp): The FirecrawlApp instance
2546
- data (List[Dict[str, Any]]): List of crawled documents/data
2547
- status (str): Current status of the crawl job
2548
- ws_url (str): WebSocket URL for the crawl job
2549
- event_handlers (dict): Dictionary of event type to list of handler functions
2550
- """
2551
- def __init__(self, id: str, app: FirecrawlApp):
2552
- self.id = id
2553
- self.app = app
2554
- self.data: List[Dict[str, Any]] = []
2555
- self.status = "scraping"
2556
- self.ws_url = f"{app.api_url.replace('http', 'ws')}/v1/crawl/{id}"
2557
- self.event_handlers = {
2558
- 'done': [],
2559
- 'error': [],
2560
- 'document': []
2561
- }
2562
-
2563
- async def connect(self) -> None:
2564
- """
2565
- Establishes WebSocket connection and starts listening for messages.
2566
- """
2567
- async with websockets.connect(
2568
- self.ws_url,
2569
- max_size=None,
2570
- additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
2571
- ) as websocket:
2572
- await self._listen(websocket)
2573
-
2574
- async def _listen(self, websocket) -> None:
2575
- """
2576
- Listens for incoming WebSocket messages and handles them.
2577
-
2578
- Args:
2579
- websocket: The WebSocket connection object
2580
- """
2581
- async for message in websocket:
2582
- msg = json.loads(message)
2583
- await self._handle_message(msg)
2584
-
2585
- def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
2586
- """
2587
- Adds an event handler function for a specific event type.
2588
-
2589
- Args:
2590
- event_type (str): Type of event to listen for ('done', 'error', or 'document')
2591
- handler (Callable): Function to handle the event
2592
- """
2593
- if event_type in self.event_handlers:
2594
- self.event_handlers[event_type].append(handler)
2595
-
2596
- def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
2597
- """
2598
- Dispatches an event to all registered handlers for that event type.
2599
-
2600
- Args:
2601
- event_type (str): Type of event to dispatch
2602
- detail (Dict[str, Any]): Event details/data to pass to handlers
2603
- """
2604
- if event_type in self.event_handlers:
2605
- for handler in self.event_handlers[event_type]:
2606
- handler(detail)
2607
-
2608
- async def _handle_message(self, msg: Dict[str, Any]) -> None:
2609
- """
2610
- Handles incoming WebSocket messages based on their type.
2611
-
2612
- Args:
2613
- msg (Dict[str, Any]): The message to handle
2614
- """
2615
- if msg['type'] == 'done':
2616
- self.status = 'completed'
2617
- self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
2618
- elif msg['type'] == 'error':
2619
- self.status = 'failed'
2620
- self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
2621
- elif msg['type'] == 'catchup':
2622
- self.status = msg['data']['status']
2623
- self.data.extend(msg['data'].get('data', []))
2624
- for doc in self.data:
2625
- self.dispatch_event('document', {'data': doc, 'id': self.id})
2626
- elif msg['type'] == 'document':
2627
- self.data.append(msg['data'])
2628
- self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
2629
-
2630
- class AsyncFirecrawlApp(FirecrawlApp):
2631
- """
2632
- Asynchronous version of FirecrawlApp that implements async methods using aiohttp.
2633
- Provides non-blocking alternatives to all FirecrawlApp operations.
2634
- """
2635
-
2636
- async def _async_request(
2637
- self,
2638
- method: str,
2639
- url: str,
2640
- headers: Dict[str, str],
2641
- data: Optional[Dict[str, Any]] = None,
2642
- retries: int = 3,
2643
- backoff_factor: float = 0.5) -> Dict[str, Any]:
2644
- """
2645
- Generic async request method with exponential backoff retry logic.
2646
-
2647
- Args:
2648
- method (str): The HTTP method to use (e.g., "GET" or "POST").
2649
- url (str): The URL to send the request to.
2650
- headers (Dict[str, str]): Headers to include in the request.
2651
- data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
2652
- retries (int): Maximum number of retry attempts (default: 3).
2653
- backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2654
- Delay will be backoff_factor * (2 ** retry_count).
2655
-
2656
- Returns:
2657
- Dict[str, Any]: The parsed JSON response from the server.
2658
-
2659
- Raises:
2660
- aiohttp.ClientError: If the request fails after all retries.
2661
- Exception: If max retries are exceeded or other errors occur.
2662
- """
2663
- async with aiohttp.ClientSession() as session:
2664
- for attempt in range(retries):
2665
- try:
2666
- async with session.request(
2667
- method=method, url=url, headers=headers, json=data
2668
- ) as response:
2669
- if response.status == 502:
2670
- await asyncio.sleep(backoff_factor * (2 ** attempt))
2671
- continue
2672
- if response.status >= 300:
2673
- await self._handle_error(response, f"make {method} request")
2674
- return await response.json()
2675
- except aiohttp.ClientError as e:
2676
- if attempt == retries - 1:
2677
- raise e
2678
- await asyncio.sleep(backoff_factor * (2 ** attempt))
2679
- raise Exception("Max retries exceeded")
2680
-
2681
- async def _async_post_request(
2682
- self, url: str, data: Dict[str, Any], headers: Dict[str, str],
2683
- retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2684
- """
2685
- Make an async POST request with exponential backoff retry logic.
2686
-
2687
- Args:
2688
- url (str): The URL to send the POST request to.
2689
- data (Dict[str, Any]): The JSON data to include in the request body.
2690
- headers (Dict[str, str]): Headers to include in the request.
2691
- retries (int): Maximum number of retry attempts (default: 3).
2692
- backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2693
- Delay will be backoff_factor * (2 ** retry_count).
2694
-
2695
- Returns:
2696
- Dict[str, Any]: The parsed JSON response from the server.
2697
-
2698
- Raises:
2699
- aiohttp.ClientError: If the request fails after all retries.
2700
- Exception: If max retries are exceeded or other errors occur.
2701
- """
2702
- return await self._async_request("POST", url, headers, data, retries, backoff_factor)
2703
-
2704
- async def _async_get_request(
2705
- self, url: str, headers: Dict[str, str],
2706
- retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2707
- """
2708
- Make an async GET request with exponential backoff retry logic.
2709
-
2710
- Args:
2711
- url (str): The URL to send the GET request to.
2712
- headers (Dict[str, str]): Headers to include in the request.
2713
- retries (int): Maximum number of retry attempts (default: 3).
2714
- backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2715
- Delay will be backoff_factor * (2 ** retry_count).
2716
-
2717
- Returns:
2718
- Dict[str, Any]: The parsed JSON response from the server.
2719
-
2720
- Raises:
2721
- aiohttp.ClientError: If the request fails after all retries.
2722
- Exception: If max retries are exceeded or other errors occur.
2723
- """
2724
- return await self._async_request("GET", url, headers, None, retries, backoff_factor)
2725
-
2726
- async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
2727
- """
2728
- Handle errors from async API responses with detailed error messages.
2729
-
2730
- Args:
2731
- response (aiohttp.ClientResponse): The response object from the failed request
2732
- action (str): Description of the action that was being attempted
2733
-
2734
- Raises:
2735
- aiohttp.ClientError: With a detailed error message based on the response status:
2736
- - 402: Payment Required
2737
- - 408: Request Timeout
2738
- - 409: Conflict
2739
- - 500: Internal Server Error
2740
- - Other: Unexpected error with status code
2741
- """
2742
- try:
2743
- error_data = await response.json()
2744
- error_message = error_data.get('error', 'No error message provided.')
2745
- error_details = error_data.get('details', 'No additional error details provided.')
2746
- except:
2747
- raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
2748
-
2749
- message = await self._get_async_error_message(response.status, action, error_message, error_details)
2750
-
2751
- raise aiohttp.ClientError(message)
2752
-
2753
- async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2754
- """
2755
- Generate a standardized error message based on HTTP status code for async operations.
2756
-
2757
- Args:
2758
- status_code (int): The HTTP status code from the response
2759
- action (str): Description of the action that was being performed
2760
- error_message (str): The error message from the API response
2761
- error_details (str): Additional error details from the API response
2762
-
2763
- Returns:
2764
- str: A formatted error message
2765
- """
2766
- return self._get_error_message(status_code, action, error_message, error_details)
2767
-
2768
- async def crawl_url_and_watch(
2769
- self,
2770
- url: str,
2771
- params: Optional[CrawlParams] = None,
2772
- idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2773
- """
2774
- Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket.
2775
-
2776
- Args:
2777
- url (str): Target URL to start crawling from
2778
- params (Optional[CrawlParams]): See CrawlParams model for configuration:
2779
- URL Discovery:
2780
- * includePaths - Patterns of URLs to include
2781
- * excludePaths - Patterns of URLs to exclude
2782
- * maxDepth - Maximum crawl depth
2783
- * maxDiscoveryDepth - Maximum depth for finding new URLs
2784
- * limit - Maximum pages to crawl
2785
-
2786
- Link Following:
2787
- * allowBackwardLinks - Follow parent directory links
2788
- * allowExternalLinks - Follow external domain links
2789
- * ignoreSitemap - Skip sitemap.xml processing
2790
-
2791
- Advanced:
2792
- * scrapeOptions - Page scraping configuration
2793
- * webhook - Notification webhook settings
2794
- * deduplicateSimilarURLs - Remove similar URLs
2795
- * ignoreQueryParameters - Ignore URL parameters
2796
- * regexOnFullURL - Apply regex to full URLs
2797
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2798
-
2799
- Returns:
2800
- AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket
2801
-
2802
- Raises:
2803
- Exception: If crawl job fails to start
2804
- """
2805
- crawl_response = await self.async_crawl_url(url, params, idempotency_key)
2806
- if crawl_response.get('success') and 'id' in crawl_response:
2807
- return AsyncCrawlWatcher(crawl_response['id'], self)
2808
- else:
2809
- raise Exception("Crawl job failed to start")
2810
-
2811
- async def batch_scrape_urls_and_watch(
2812
- self,
2813
- urls: List[str],
2814
- params: Optional[ScrapeParams] = None,
2815
- idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2816
- """
2817
- Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress.
2818
-
2819
- Args:
2820
- urls (List[str]): List of URLs to scrape
2821
- params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
2822
-
2823
- Content Options:
2824
- * formats - Content formats to retrieve
2825
- * includeTags - HTML tags to include
2826
- * excludeTags - HTML tags to exclude
2827
- * onlyMainContent - Extract main content only
2828
-
2829
- Request Options:
2830
- * headers - Custom HTTP headers
2831
- * timeout - Request timeout (ms)
2832
- * mobile - Use mobile user agent
2833
- * proxy - Proxy type
2834
-
2835
- Extraction Options:
2836
- * extract - Content extraction config
2837
- * jsonOptions - JSON extraction config
2838
- * actions - Actions to perform
2839
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2840
-
2841
- Returns:
2842
- AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket
2843
-
2844
- Raises:
2845
- Exception: If batch scrape job fails to start
2846
- """
2847
- batch_response = await self.async_batch_scrape_urls(urls, params, idempotency_key)
2848
- if batch_response.get('success') and 'id' in batch_response:
2849
- return AsyncCrawlWatcher(batch_response['id'], self)
2850
- else:
2851
- raise Exception("Batch scrape job failed to start")
2852
-
2853
- async def scrape_url(
2854
- self,
2855
- url: str,
2856
- *,
2857
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
2858
- include_tags: Optional[List[str]] = None,
2859
- exclude_tags: Optional[List[str]] = None,
2860
- only_main_content: Optional[bool] = None,
2861
- wait_for: Optional[int] = None,
2862
- timeout: Optional[int] = None,
2863
- location: Optional[LocationConfig] = None,
2864
- mobile: Optional[bool] = None,
2865
- skip_tls_verification: Optional[bool] = None,
2866
- remove_base64_images: Optional[bool] = None,
2867
- block_ads: Optional[bool] = None,
2868
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
2869
- extract: Optional[JsonConfig] = None,
2870
- json_options: Optional[JsonConfig] = None,
2871
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2872
- **kwargs) -> ScrapeResponse[Any]:
2873
- """
2874
- Scrape a single URL asynchronously.
2875
-
2876
- Args:
2877
- url (str): Target URL to scrape
2878
- formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
2879
- include_tags (Optional[List[str]]): HTML tags to include
2880
- exclude_tags (Optional[List[str]]): HTML tags to exclude
2881
- only_main_content (Optional[bool]): Extract main content only
2882
- wait_for (Optional[int]): Wait time in milliseconds before capturing the page content
2883
- timeout (Optional[int]): Request timeout (ms)
2884
- location (Optional[LocationConfig]): Location configuration
2885
- mobile (Optional[bool]): Use mobile user agent
2886
- skip_tls_verification (Optional[bool]): Skip TLS verification
2887
- remove_base64_images (Optional[bool]): Remove base64 images
2888
- block_ads (Optional[bool]): Block ads
2889
- proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
2890
- extract (Optional[JsonConfig]): Content extraction settings
2891
- json_options (Optional[JsonConfig]): JSON extraction settings
2892
- actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
2893
- **kwargs: Additional parameters to pass to the API
2894
-
2895
- Returns:
2896
- ScrapeResponse with:
2897
- * success - Whether scrape was successful
2898
- * markdown - Markdown content if requested
2899
- * html - HTML content if requested
2900
- * rawHtml - Raw HTML content if requested
2901
- * links - Extracted links if requested
2902
- * screenshot - Screenshot if requested
2903
- * extract - Extracted data if requested
2904
- * json - JSON data if requested
2905
- * error - Error message if scrape failed
2906
-
2907
- Raises:
2908
- Exception: If scraping fails
2909
- """
2910
- # Validate any additional kwargs
2911
- self._validate_kwargs(kwargs, "scrape_url")
2912
-
2913
- headers = self._prepare_headers()
2914
-
2915
- # Build scrape parameters
2916
- scrape_params = {
2917
- 'url': url,
2918
- 'origin': f"python-sdk@{version}"
2919
- }
2920
-
2921
- # Add optional parameters if provided and not None
2922
- if formats:
2923
- scrape_params['formats'] = formats
2924
- if include_tags:
2925
- scrape_params['includeTags'] = include_tags
2926
- if exclude_tags:
2927
- scrape_params['excludeTags'] = exclude_tags
2928
- if only_main_content is not None:
2929
- scrape_params['onlyMainContent'] = only_main_content
2930
- if wait_for:
2931
- scrape_params['waitFor'] = wait_for
2932
- if timeout:
2933
- scrape_params['timeout'] = timeout
2934
- if location:
2935
- scrape_params['location'] = location.dict(exclude_none=True)
2936
- if mobile is not None:
2937
- scrape_params['mobile'] = mobile
2938
- if skip_tls_verification is not None:
2939
- scrape_params['skipTlsVerification'] = skip_tls_verification
2940
- if remove_base64_images is not None:
2941
- scrape_params['removeBase64Images'] = remove_base64_images
2942
- if block_ads is not None:
2943
- scrape_params['blockAds'] = block_ads
2944
- if proxy:
2945
- scrape_params['proxy'] = proxy
2946
- if extract is not None:
2947
- extract = self._ensure_schema_dict(extract)
2948
- if isinstance(extract, dict) and "schema" in extract:
2949
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
2950
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
2951
- if json_options is not None:
2952
- json_options = self._ensure_schema_dict(json_options)
2953
- if isinstance(json_options, dict) and "schema" in json_options:
2954
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
2955
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
2956
- if actions:
2957
- scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
2958
-
2959
- if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
2960
- scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
2961
- if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
2962
- scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
2963
-
2964
- # Make async request
2965
- endpoint = f'/v1/scrape'
2966
- response = await self._async_post_request(
2967
- f'{self.api_url}{endpoint}',
2968
- scrape_params,
2969
- headers
2970
- )
2971
-
2972
- if response.get('success') and 'data' in response:
2973
- return ScrapeResponse(**response['data'])
2974
- elif "error" in response:
2975
- raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
2976
- else:
2977
- # Use the response content directly if possible, otherwise a generic message
2978
- error_content = response.get('error', str(response))
2979
- raise Exception(f'Failed to scrape URL. Error: {error_content}')
2980
-
2981
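A minimal usage sketch for the async scrape_url method above. The import path, client class name and constructor arguments are assumptions (they are not shown in this hunk); the keyword arguments come from the signature above:

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name, not shown in this diff

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # constructor signature assumed
    doc = await app.scrape_url(
        "https://example.com",
        formats=["markdown", "links"],
        only_main_content=True,
        timeout=30000,  # milliseconds, per the docstring
    )
    print(doc.markdown)

asyncio.run(main())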
- async def batch_scrape_urls(
2982
- self,
2983
- urls: List[str],
2984
- *,
2985
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
2986
- headers: Optional[Dict[str, str]] = None,
2987
- include_tags: Optional[List[str]] = None,
2988
- exclude_tags: Optional[List[str]] = None,
2989
- only_main_content: Optional[bool] = None,
2990
- wait_for: Optional[int] = None,
2991
- timeout: Optional[int] = None,
2992
- location: Optional[LocationConfig] = None,
2993
- mobile: Optional[bool] = None,
2994
- skip_tls_verification: Optional[bool] = None,
2995
- remove_base64_images: Optional[bool] = None,
2996
- block_ads: Optional[bool] = None,
2997
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
2998
- extract: Optional[JsonConfig] = None,
2999
- json_options: Optional[JsonConfig] = None,
3000
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
3001
- agent: Optional[AgentOptions] = None,
3002
- poll_interval: Optional[int] = 2,
3003
- idempotency_key: Optional[str] = None,
3004
- **kwargs
3005
- ) -> BatchScrapeStatusResponse:
3006
- """
3007
- Asynchronously scrape multiple URLs and monitor until completion.
3008
-
3009
- Args:
3010
- urls (List[str]): URLs to scrape
3011
- formats (Optional[List[Literal]]): Content formats to retrieve
3012
- headers (Optional[Dict[str, str]]): Custom HTTP headers
3013
- include_tags (Optional[List[str]]): HTML tags to include
3014
- exclude_tags (Optional[List[str]]): HTML tags to exclude
3015
- only_main_content (Optional[bool]): Extract main content only
3016
- wait_for (Optional[int]): Wait time in milliseconds
3017
- timeout (Optional[int]): Request timeout in milliseconds
3018
- location (Optional[LocationConfig]): Location configuration
3019
- mobile (Optional[bool]): Use mobile user agent
3020
- skip_tls_verification (Optional[bool]): Skip TLS verification
3021
- remove_base64_images (Optional[bool]): Remove base64 encoded images
3022
- block_ads (Optional[bool]): Block advertisements
3023
- proxy (Optional[Literal]): Proxy type to use
3024
- extract (Optional[JsonConfig]): Content extraction config
3025
- json_options (Optional[JsonConfig]): JSON extraction config
3026
- actions (Optional[List[Union]]): Actions to perform
3027
- agent (Optional[AgentOptions]): Agent configuration
3028
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
3029
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3030
- **kwargs: Additional parameters to pass to the API
3031
-
3032
- Returns:
3033
- BatchScrapeStatusResponse with:
3034
- * Scraping status and progress
3035
- * Scraped content for each URL
3036
- * Success/error information
3037
-
3038
- Raises:
3039
- Exception: If batch scrape fails
3040
- """
3041
- # Validate any additional kwargs
3042
- self._validate_kwargs(kwargs, "batch_scrape_urls")
3043
-
3044
- scrape_params = {}
3045
-
3046
- # Add individual parameters
3047
- if formats is not None:
3048
- scrape_params['formats'] = formats
3049
- if headers is not None:
3050
- scrape_params['headers'] = headers
3051
- if include_tags is not None:
3052
- scrape_params['includeTags'] = include_tags
3053
- if exclude_tags is not None:
3054
- scrape_params['excludeTags'] = exclude_tags
3055
- if only_main_content is not None:
3056
- scrape_params['onlyMainContent'] = only_main_content
3057
- if wait_for is not None:
3058
- scrape_params['waitFor'] = wait_for
3059
- if timeout is not None:
3060
- scrape_params['timeout'] = timeout
3061
- if location is not None:
3062
- scrape_params['location'] = location.dict(exclude_none=True)
3063
- if mobile is not None:
3064
- scrape_params['mobile'] = mobile
3065
- if skip_tls_verification is not None:
3066
- scrape_params['skipTlsVerification'] = skip_tls_verification
3067
- if remove_base64_images is not None:
3068
- scrape_params['removeBase64Images'] = remove_base64_images
3069
- if block_ads is not None:
3070
- scrape_params['blockAds'] = block_ads
3071
- if proxy is not None:
3072
- scrape_params['proxy'] = proxy
3073
- if extract is not None:
3074
- extract = self._ensure_schema_dict(extract)
3075
- if isinstance(extract, dict) and "schema" in extract:
3076
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
3077
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3078
- if json_options is not None:
3079
- json_options = self._ensure_schema_dict(json_options)
3080
- if isinstance(json_options, dict) and "schema" in json_options:
3081
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3082
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3083
- if actions is not None:
3084
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3085
- if agent is not None:
3086
- scrape_params['agent'] = agent.dict(exclude_none=True)
3087
-
3088
- # Add any additional kwargs
3089
- scrape_params.update(kwargs)
3090
-
3091
- # Create final params object
3092
- final_params = ScrapeParams(**scrape_params)
3093
- params_dict = final_params.dict(exclude_none=True)
3094
- params_dict['urls'] = urls
3095
- params_dict['origin'] = f"python-sdk@{version}"
3096
-
3097
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3098
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3099
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3100
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3101
-
3102
- # Make request
3103
- headers = self._prepare_headers(idempotency_key)
3104
- response = await self._async_post_request(
3105
- f'{self.api_url}/v1/batch/scrape',
3106
- params_dict,
3107
- headers
3108
- )
3109
-
3110
-         if response.get('success') and 'id' in response:
-             return await self._async_monitor_job_status(response['id'], headers, poll_interval)
-         else:
-             # The parsed response is a plain dict here; raise directly rather than calling the
-             # aiohttp-based _handle_error helper (which was previously invoked without await).
-             raise Exception(f'Failed to start batch scrape job. Error: {response.get("error", response)}')
3118
-
3119
-
3120
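A hedged sketch of the waiting batch scrape above, run inside an event loop as in the earlier scrape_url example; `app` is the assumed async client instance. The call polls internally every poll_interval seconds and returns the finished status object:

status = await app.batch_scrape_urls(
    ["https://example.com", "https://example.org"],
    formats=["markdown"],
    poll_interval=2,
)
print(status.status, f"{status.completed}/{status.total} URLs scraped")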
- async def async_batch_scrape_urls(
3121
- self,
3122
- urls: List[str],
3123
- *,
3124
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3125
- headers: Optional[Dict[str, str]] = None,
3126
- include_tags: Optional[List[str]] = None,
3127
- exclude_tags: Optional[List[str]] = None,
3128
- only_main_content: Optional[bool] = None,
3129
- wait_for: Optional[int] = None,
3130
- timeout: Optional[int] = None,
3131
- location: Optional[LocationConfig] = None,
3132
- mobile: Optional[bool] = None,
3133
- skip_tls_verification: Optional[bool] = None,
3134
- remove_base64_images: Optional[bool] = None,
3135
- block_ads: Optional[bool] = None,
3136
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
3137
- extract: Optional[JsonConfig] = None,
3138
- json_options: Optional[JsonConfig] = None,
3139
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
3140
- agent: Optional[AgentOptions] = None,
3141
- idempotency_key: Optional[str] = None,
3142
- **kwargs
3143
- ) -> BatchScrapeResponse:
3144
- """
3145
- Initiate a batch scrape job asynchronously.
3146
-
3147
- Args:
3148
- urls (List[str]): URLs to scrape
3149
- formats (Optional[List[Literal]]): Content formats to retrieve
3150
- headers (Optional[Dict[str, str]]): Custom HTTP headers
3151
- include_tags (Optional[List[str]]): HTML tags to include
3152
- exclude_tags (Optional[List[str]]): HTML tags to exclude
3153
- only_main_content (Optional[bool]): Extract main content only
3154
- wait_for (Optional[int]): Wait time in milliseconds
3155
- timeout (Optional[int]): Request timeout in milliseconds
3156
- location (Optional[LocationConfig]): Location configuration
3157
- mobile (Optional[bool]): Use mobile user agent
3158
- skip_tls_verification (Optional[bool]): Skip TLS verification
3159
- remove_base64_images (Optional[bool]): Remove base64 encoded images
3160
- block_ads (Optional[bool]): Block advertisements
3161
- proxy (Optional[Literal]): Proxy type to use
3162
- extract (Optional[JsonConfig]): Content extraction config
3163
- json_options (Optional[JsonConfig]): JSON extraction config
3164
- actions (Optional[List[Union]]): Actions to perform
3165
- agent (Optional[AgentOptions]): Agent configuration
3166
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3167
- **kwargs: Additional parameters to pass to the API
3168
-
3169
- Returns:
3170
- BatchScrapeResponse with:
3171
- * success - Whether job started successfully
3172
- * id - Unique identifier for the job
3173
- * url - Status check URL
3174
- * error - Error message if start failed
3175
-
3176
- Raises:
3177
- Exception: If job initiation fails
3178
- """
3179
- # Validate any additional kwargs
3180
- self._validate_kwargs(kwargs, "async_batch_scrape_urls")
3181
-
3182
- scrape_params = {}
3183
-
3184
- # Add individual parameters
3185
- if formats is not None:
3186
- scrape_params['formats'] = formats
3187
- if headers is not None:
3188
- scrape_params['headers'] = headers
3189
- if include_tags is not None:
3190
- scrape_params['includeTags'] = include_tags
3191
- if exclude_tags is not None:
3192
- scrape_params['excludeTags'] = exclude_tags
3193
- if only_main_content is not None:
3194
- scrape_params['onlyMainContent'] = only_main_content
3195
- if wait_for is not None:
3196
- scrape_params['waitFor'] = wait_for
3197
- if timeout is not None:
3198
- scrape_params['timeout'] = timeout
3199
- if location is not None:
3200
- scrape_params['location'] = location.dict(exclude_none=True)
3201
- if mobile is not None:
3202
- scrape_params['mobile'] = mobile
3203
- if skip_tls_verification is not None:
3204
- scrape_params['skipTlsVerification'] = skip_tls_verification
3205
- if remove_base64_images is not None:
3206
- scrape_params['removeBase64Images'] = remove_base64_images
3207
- if block_ads is not None:
3208
- scrape_params['blockAds'] = block_ads
3209
- if proxy is not None:
3210
- scrape_params['proxy'] = proxy
3211
- if extract is not None:
3212
- extract = self._ensure_schema_dict(extract)
3213
- if isinstance(extract, dict) and "schema" in extract:
3214
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
3215
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3216
- if json_options is not None:
3217
- json_options = self._ensure_schema_dict(json_options)
3218
- if isinstance(json_options, dict) and "schema" in json_options:
3219
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3220
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3221
- if actions is not None:
3222
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3223
- if agent is not None:
3224
- scrape_params['agent'] = agent.dict(exclude_none=True)
3225
-
3226
- # Add any additional kwargs
3227
- scrape_params.update(kwargs)
3228
-
3229
- # Create final params object
3230
- final_params = ScrapeParams(**scrape_params)
3231
- params_dict = final_params.dict(exclude_none=True)
3232
- params_dict['urls'] = urls
3233
- params_dict['origin'] = f"python-sdk@{version}"
3234
-
3235
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3236
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3237
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3238
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3239
-
3240
- # Make request
3241
- headers = self._prepare_headers(idempotency_key)
3242
- response = await self._async_post_request(
3243
- f'{self.api_url}/v1/batch/scrape',
3244
- params_dict,
3245
- headers
3246
- )
3247
-
3248
-         if response.get('success'):
-             try:
-                 return BatchScrapeResponse(**response)
-             except Exception:
-                 raise Exception('Failed to parse Firecrawl response.')
-         else:
-             # _async_post_request returns an already-parsed dict (it has no status_code or
-             # json() attributes), so report the API error directly.
-             raise Exception(f'Failed to start batch scrape job. Error: {response.get("error", response)}')
3255
-
3256
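The non-waiting variant above only starts the job; it pairs with check_batch_scrape_status defined later in this file, which in this implementation returns a plain dict-like payload. A rough sketch, with `app` as the assumed client instance:

job = await app.async_batch_scrape_urls(["https://example.com"], formats=["markdown"])
# job.id identifies the batch job; poll it whenever convenient
status = await app.check_batch_scrape_status(job.id)
print(status.get("status"), f'{status.get("completed")}/{status.get("total")}')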
- async def crawl_url(
3257
- self,
3258
- url: str,
3259
- *,
3260
- include_paths: Optional[List[str]] = None,
3261
- exclude_paths: Optional[List[str]] = None,
3262
- max_depth: Optional[int] = None,
3263
- max_discovery_depth: Optional[int] = None,
3264
- limit: Optional[int] = None,
3265
- allow_backward_links: Optional[bool] = None,
3266
- allow_external_links: Optional[bool] = None,
3267
- ignore_sitemap: Optional[bool] = None,
3268
- scrape_options: Optional[ScrapeOptions] = None,
3269
- webhook: Optional[Union[str, WebhookConfig]] = None,
3270
- deduplicate_similar_urls: Optional[bool] = None,
3271
- ignore_query_parameters: Optional[bool] = None,
3272
- regex_on_full_url: Optional[bool] = None,
3273
- delay: Optional[int] = None,
3274
- poll_interval: Optional[int] = 2,
3275
- idempotency_key: Optional[str] = None,
3276
- **kwargs
3277
- ) -> CrawlStatusResponse:
3278
- """
3279
- Crawl a website starting from a URL.
3280
-
3281
- Args:
3282
- url (str): Target URL to start crawling from
3283
- include_paths (Optional[List[str]]): Patterns of URLs to include
3284
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3285
- max_depth (Optional[int]): Maximum crawl depth
3286
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3287
- limit (Optional[int]): Maximum pages to crawl
3288
- allow_backward_links (Optional[bool]): Follow parent directory links
3289
- allow_external_links (Optional[bool]): Follow external domain links
3290
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3291
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3292
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3293
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3294
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
3295
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
3296
- delay (Optional[int]): Delay in seconds between scrapes
3297
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
3298
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3299
- **kwargs: Additional parameters to pass to the API
3300
-
3301
- Returns:
3302
- CrawlStatusResponse with:
3303
- * Crawling status and progress
3304
- * Crawled page contents
3305
- * Success/error information
3306
-
3307
- Raises:
3308
- Exception: If crawl fails
3309
- """
3310
- # Validate any additional kwargs
3311
- self._validate_kwargs(kwargs, "crawl_url")
3312
-
3313
- crawl_params = {}
3314
-
3315
- # Add individual parameters
3316
- if include_paths is not None:
3317
- crawl_params['includePaths'] = include_paths
3318
- if exclude_paths is not None:
3319
- crawl_params['excludePaths'] = exclude_paths
3320
- if max_depth is not None:
3321
- crawl_params['maxDepth'] = max_depth
3322
- if max_discovery_depth is not None:
3323
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3324
- if limit is not None:
3325
- crawl_params['limit'] = limit
3326
- if allow_backward_links is not None:
3327
- crawl_params['allowBackwardLinks'] = allow_backward_links
3328
- if allow_external_links is not None:
3329
- crawl_params['allowExternalLinks'] = allow_external_links
3330
- if ignore_sitemap is not None:
3331
- crawl_params['ignoreSitemap'] = ignore_sitemap
3332
- if scrape_options is not None:
3333
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3334
- if webhook is not None:
3335
- crawl_params['webhook'] = webhook
3336
- if deduplicate_similar_urls is not None:
3337
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3338
- if ignore_query_parameters is not None:
3339
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3340
- if regex_on_full_url is not None:
3341
- crawl_params['regexOnFullURL'] = regex_on_full_url
3342
- if delay is not None:
3343
- crawl_params['delay'] = delay
3344
-
3345
- # Add any additional kwargs
3346
- crawl_params.update(kwargs)
3347
-
3348
- # Create final params object
3349
- final_params = CrawlParams(**crawl_params)
3350
- params_dict = final_params.dict(exclude_none=True)
3351
- params_dict['url'] = url
3352
- params_dict['origin'] = f"python-sdk@{version}"
3353
- # Make request
3354
- headers = self._prepare_headers(idempotency_key)
3355
- response = await self._async_post_request(
3356
- f'{self.api_url}/v1/crawl', params_dict, headers)
3357
-
3358
-         if response.get('success') and 'id' in response:
-             return await self._async_monitor_job_status(response['id'], headers, poll_interval)
-         else:
-             # Raise directly: _handle_error expects a raw aiohttp response and was not awaited here.
-             raise Exception(f'Failed to start crawl job. Error: {response.get("error", response)}')
3366
-
3367
-
3368
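A short usage sketch of the waiting crawl above (`app` as before; only parameters from the signature are used):

crawl = await app.crawl_url(
    "https://example.com",
    limit=25,
    max_depth=2,
    exclude_paths=["blog/.*"],
    poll_interval=2,
)
print(crawl.status, f"{crawl.completed}/{crawl.total} pages crawled")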
- async def async_crawl_url(
3369
- self,
3370
- url: str,
3371
- *,
3372
- include_paths: Optional[List[str]] = None,
3373
- exclude_paths: Optional[List[str]] = None,
3374
- max_depth: Optional[int] = None,
3375
- max_discovery_depth: Optional[int] = None,
3376
- limit: Optional[int] = None,
3377
- allow_backward_links: Optional[bool] = None,
3378
- allow_external_links: Optional[bool] = None,
3379
- ignore_sitemap: Optional[bool] = None,
3380
- scrape_options: Optional[ScrapeOptions] = None,
3381
- webhook: Optional[Union[str, WebhookConfig]] = None,
3382
- deduplicate_similar_urls: Optional[bool] = None,
3383
- ignore_query_parameters: Optional[bool] = None,
3384
- regex_on_full_url: Optional[bool] = None,
3385
- delay: Optional[int] = None,
3386
- poll_interval: Optional[int] = 2,
3387
- idempotency_key: Optional[str] = None,
3388
- **kwargs
3389
- ) -> CrawlResponse:
3390
- """
3391
- Start an asynchronous crawl job.
3392
-
3393
- Args:
3394
- url (str): Target URL to start crawling from
3395
- include_paths (Optional[List[str]]): Patterns of URLs to include
3396
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3397
- max_depth (Optional[int]): Maximum crawl depth
3398
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3399
- limit (Optional[int]): Maximum pages to crawl
3400
- allow_backward_links (Optional[bool]): Follow parent directory links
3401
- allow_external_links (Optional[bool]): Follow external domain links
3402
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3403
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3404
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3405
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3406
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
3407
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
3408
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3409
- **kwargs: Additional parameters to pass to the API
3410
-
3411
- Returns:
3412
- CrawlResponse with:
3413
- * success - Whether crawl started successfully
3414
- * id - Unique identifier for the crawl job
3415
- * url - Status check URL for the crawl
3416
- * error - Error message if start failed
3417
-
3418
- Raises:
3419
- Exception: If crawl initiation fails
3420
- """
3421
- crawl_params = {}
3422
-
3423
- # Add individual parameters
3424
- if include_paths is not None:
3425
- crawl_params['includePaths'] = include_paths
3426
- if exclude_paths is not None:
3427
- crawl_params['excludePaths'] = exclude_paths
3428
- if max_depth is not None:
3429
- crawl_params['maxDepth'] = max_depth
3430
- if max_discovery_depth is not None:
3431
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3432
- if limit is not None:
3433
- crawl_params['limit'] = limit
3434
- if allow_backward_links is not None:
3435
- crawl_params['allowBackwardLinks'] = allow_backward_links
3436
- if allow_external_links is not None:
3437
- crawl_params['allowExternalLinks'] = allow_external_links
3438
- if ignore_sitemap is not None:
3439
- crawl_params['ignoreSitemap'] = ignore_sitemap
3440
- if scrape_options is not None:
3441
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3442
- if webhook is not None:
3443
- crawl_params['webhook'] = webhook
3444
- if deduplicate_similar_urls is not None:
3445
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3446
- if ignore_query_parameters is not None:
3447
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3448
- if regex_on_full_url is not None:
3449
- crawl_params['regexOnFullURL'] = regex_on_full_url
3450
- if delay is not None:
3451
- crawl_params['delay'] = delay
3452
-
3453
- # Add any additional kwargs
3454
- crawl_params.update(kwargs)
3455
-
3456
- # Create final params object
3457
- final_params = CrawlParams(**crawl_params)
3458
- params_dict = final_params.dict(exclude_none=True)
3459
- params_dict['url'] = url
3460
- params_dict['origin'] = f"python-sdk@{version}"
3461
-
3462
- # Make request
3463
- headers = self._prepare_headers(idempotency_key)
3464
- response = await self._async_post_request(
3465
- f'{self.api_url}/v1/crawl',
3466
- params_dict,
3467
- headers
3468
- )
3469
-
3470
-         if response.get('success'):
-             try:
-                 return CrawlResponse(**response)
-             except Exception:
-                 raise Exception('Failed to parse Firecrawl response.')
-         else:
-             # The parsed response is a dict, so raise directly instead of the un-awaited _handle_error call.
-             raise Exception(f'Failed to start crawl job. Error: {response.get("error", response)}')
3477
-
3478
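The fire-and-forget variant above returns a CrawlResponse whose id can be handed to check_crawl_status (the next method) or cancel_crawl later in this file. A hedged sketch, with `app` as the assumed client instance:

job = await app.async_crawl_url("https://example.com", limit=100)
if job.success:
    status = await app.check_crawl_status(job.id)
    print(status.status, status.completed, "of", status.total)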
- async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
3479
- """
3480
- Check the status and results of an asynchronous crawl job.
3481
-
3482
- Args:
3483
- id (str): Unique identifier for the crawl job
3484
-
3485
- Returns:
3486
- CrawlStatusResponse containing:
3487
- Status Information:
3488
- * status - Current state (scraping/completed/failed/cancelled)
3489
- * completed - Number of pages crawled
3490
- * total - Total pages to crawl
3491
- * creditsUsed - API credits consumed
3492
- * expiresAt - Data expiration timestamp
3493
-
3494
- Results:
3495
- * data - List of crawled documents
3496
- * next - URL for next page of results (if paginated)
3497
- * success - Whether status check succeeded
3498
- * error - Error message if failed
3499
-
3500
- Raises:
3501
- Exception: If status check fails
3502
- """
3503
- headers = self._prepare_headers()
3504
- endpoint = f'/v1/crawl/{id}'
3505
-
3506
- status_data = await self._async_get_request(
3507
- f'{self.api_url}{endpoint}',
3508
- headers
3509
- )
3510
-
3511
- if status_data.get('status') == 'completed':
3512
- if 'data' in status_data:
3513
- data = status_data['data']
3514
- while 'next' in status_data:
3515
- if len(status_data['data']) == 0:
3516
- break
3517
- next_url = status_data.get('next')
3518
- if not next_url:
3519
- logger.warning("Expected 'next' URL is missing.")
3520
- break
3521
- next_data = await self._async_get_request(next_url, headers)
3522
- data.extend(next_data.get('data', []))
3523
- status_data = next_data
3524
- status_data['data'] = data
3525
- # Create CrawlStatusResponse object from status data
3526
- response = CrawlStatusResponse(
3527
- status=status_data.get('status'),
3528
- total=status_data.get('total'),
3529
- completed=status_data.get('completed'),
3530
- creditsUsed=status_data.get('creditsUsed'),
3531
- expiresAt=status_data.get('expiresAt'),
3532
- data=status_data.get('data'),
3533
- success=False if 'error' in status_data else True
3534
- )
3535
-
3536
- if 'error' in status_data:
3537
- response.error = status_data.get('error')
3538
-
3539
- if 'next' in status_data:
3540
- response.next = status_data.get('next')
3541
-
3542
- return response
3543
-
3544
- async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
3545
- """
3546
- Monitor the status of an asynchronous job until completion.
3547
-
3548
- Args:
3549
- id (str): The ID of the job to monitor
3550
- headers (Dict[str, str]): Headers to include in status check requests
3551
- poll_interval (int): Seconds between status checks (default: 2)
3552
-
3553
- Returns:
3554
- CrawlStatusResponse: The job results if completed successfully
3555
-
3556
- Raises:
3557
- Exception: If the job fails or an error occurs during status checks
3558
- """
3559
- while True:
3560
- status_data = await self._async_get_request(
3561
- f'{self.api_url}/v1/crawl/{id}',
3562
- headers
3563
- )
3564
-
3565
- if status_data.get('status') == 'completed':
3566
- if 'data' in status_data:
3567
- data = status_data['data']
3568
- while 'next' in status_data:
3569
- if len(status_data['data']) == 0:
3570
- break
3571
- next_url = status_data.get('next')
3572
- if not next_url:
3573
- logger.warning("Expected 'next' URL is missing.")
3574
- break
3575
- next_data = await self._async_get_request(next_url, headers)
3576
- data.extend(next_data.get('data', []))
3577
- status_data = next_data
3578
- status_data['data'] = data
3579
- return CrawlStatusResponse(**status_data)
3580
- else:
3581
- raise Exception('Job completed but no data was returned')
3582
- elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
3583
- await asyncio.sleep(max(poll_interval, 2))
3584
- else:
3585
- raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
3586
-
3587
- async def map_url(
3588
- self,
3589
- url: str,
3590
- *,
3591
- search: Optional[str] = None,
3592
- ignore_sitemap: Optional[bool] = None,
3593
- include_subdomains: Optional[bool] = None,
3594
- sitemap_only: Optional[bool] = None,
3595
- limit: Optional[int] = None,
3596
- timeout: Optional[int] = None,
3597
- params: Optional[MapParams] = None) -> MapResponse:
3598
- """
3599
- Asynchronously map and discover links from a URL.
3600
-
3601
- Args:
3602
- url (str): Target URL to map
3603
- params (Optional[MapParams]): See MapParams model:
3604
- Discovery Options:
3605
- * search - Filter pattern for URLs
3606
- * ignoreSitemap - Skip sitemap.xml
3607
- * includeSubdomains - Include subdomain links
3608
- * sitemapOnly - Only use sitemap.xml
3609
-
3610
- Limits:
3611
- * limit - Max URLs to return
3612
- * timeout - Request timeout (ms)
3613
-
3614
- Returns:
3615
- MapResponse with:
3616
- * Discovered URLs
3617
- * Success/error status
3618
-
3619
- Raises:
3620
- Exception: If mapping fails
3621
- """
3622
- map_params = {}
3623
- if params:
3624
- map_params.update(params.dict(exclude_none=True))
3625
-
3626
- # Add individual parameters
3627
- if search is not None:
3628
- map_params['search'] = search
3629
- if ignore_sitemap is not None:
3630
- map_params['ignoreSitemap'] = ignore_sitemap
3631
- if include_subdomains is not None:
3632
- map_params['includeSubdomains'] = include_subdomains
3633
- if sitemap_only is not None:
3634
- map_params['sitemapOnly'] = sitemap_only
3635
- if limit is not None:
3636
- map_params['limit'] = limit
3637
- if timeout is not None:
3638
- map_params['timeout'] = timeout
3639
-
3640
- # Create final params object
3641
- final_params = MapParams(**map_params)
3642
- params_dict = final_params.dict(exclude_none=True)
3643
- params_dict['url'] = url
3644
- params_dict['origin'] = f"python-sdk@{version}"
3645
-
3646
- # Make request
3647
- endpoint = f'/v1/map'
3648
- response = await self._async_post_request(
3649
- f'{self.api_url}{endpoint}',
3650
- params_dict,
3651
- headers={"Authorization": f"Bearer {self.api_key}"}
3652
- )
3653
-
3654
- if response.get('success') and 'links' in response:
3655
- return MapResponse(**response)
3656
- elif 'error' in response:
3657
- raise Exception(f'Failed to map URL. Error: {response["error"]}')
3658
- else:
3659
- raise Exception(f'Failed to map URL. Error: {response}')
3660
-
3661
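A quick sketch of the mapper above (`app` as before; MapResponse exposes the discovered links):

mapped = await app.map_url("https://example.com", search="docs", limit=50)
for link in (mapped.links or []):
    print(link)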
- async def extract(
3662
- self,
3663
- urls: Optional[List[str]] = None,
3664
- *,
3665
- prompt: Optional[str] = None,
3666
- schema: Optional[Any] = None,
3667
- system_prompt: Optional[str] = None,
3668
- allow_external_links: Optional[bool] = False,
3669
- enable_web_search: Optional[bool] = False,
3670
- show_sources: Optional[bool] = False,
3671
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3672
-
3673
- """
3674
- Asynchronously extract structured information from URLs.
3675
-
3676
- Args:
3677
- urls (Optional[List[str]]): URLs to extract from
3678
- prompt (Optional[str]): Custom extraction prompt
3679
- schema (Optional[Any]): JSON schema/Pydantic model
3680
- system_prompt (Optional[str]): System context
3681
- allow_external_links (Optional[bool]): Follow external links
3682
- enable_web_search (Optional[bool]): Enable web search
3683
- show_sources (Optional[bool]): Include source URLs
3684
- agent (Optional[Dict[str, Any]]): Agent configuration
3685
-
3686
- Returns:
3687
- ExtractResponse with:
3688
- * Structured data matching schema
3689
- * Source information if requested
3690
- * Success/error status
3691
-
3692
- Raises:
3693
- ValueError: If prompt/schema missing or extraction fails
3694
- """
3695
- headers = self._prepare_headers()
3696
-
3697
- if not prompt and not schema:
3698
- raise ValueError("Either prompt or schema is required")
3699
-
3700
- if not urls and not prompt:
3701
- raise ValueError("Either urls or prompt is required")
3702
-
3703
- if schema:
3704
- schema = self._ensure_schema_dict(schema)
3705
-
3706
- request_data = {
3707
- 'urls': urls or [],
3708
- 'allowExternalLinks': allow_external_links,
3709
- 'enableWebSearch': enable_web_search,
3710
- 'showSources': show_sources,
3711
- 'schema': schema,
3712
- 'origin': f'python-sdk@{get_version()}'
3713
- }
3714
-
3715
- # Only add prompt and systemPrompt if they exist
3716
- if prompt:
3717
- request_data['prompt'] = prompt
3718
- if system_prompt:
3719
- request_data['systemPrompt'] = system_prompt
3720
-
3721
- if agent:
3722
- request_data['agent'] = agent
3723
-
3724
- response = await self._async_post_request(
3725
- f'{self.api_url}/v1/extract',
3726
- request_data,
3727
- headers
3728
- )
3729
-
3730
- if response.get('success'):
3731
- job_id = response.get('id')
3732
- if not job_id:
3733
- raise Exception('Job ID not returned from extract request.')
3734
-
3735
- while True:
3736
- status_data = await self._async_get_request(
3737
- f'{self.api_url}/v1/extract/{job_id}',
3738
- headers
3739
- )
3740
-
3741
- if status_data['status'] == 'completed':
3742
- return ExtractResponse(**status_data)
3743
- elif status_data['status'] in ['failed', 'cancelled']:
3744
- raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
3745
-
3746
- await asyncio.sleep(2)
3747
- else:
3748
- raise Exception(f'Failed to extract. Error: {response.get("error")}')
3749
-
3750
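A hedged sketch of structured extraction with the method above. The Pydantic model is an arbitrary illustration; per the docstring, schema may be either a JSON schema or a Pydantic model, and `app` is the assumed client instance:

from pydantic import BaseModel

class PageSummary(BaseModel):
    title: str
    summary: str

result = await app.extract(
    urls=["https://example.com"],
    prompt="Summarize the page",
    schema=PageSummary,
)
print(result.data)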
- async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
3751
- """
3752
- Check the status of an asynchronous batch scrape job.
3753
-
3754
- Args:
3755
- id (str): The ID of the batch scrape job
3756
-
3757
- Returns:
3758
- BatchScrapeStatusResponse containing:
3759
- Status Information:
3760
- * status - Current state (scraping/completed/failed/cancelled)
3761
- * completed - Number of URLs scraped
3762
- * total - Total URLs to scrape
3763
- * creditsUsed - API credits consumed
3764
- * expiresAt - Data expiration timestamp
3765
-
3766
- Results:
3767
- * data - List of scraped documents
3768
- * next - URL for next page of results (if paginated)
3769
- * success - Whether status check succeeded
3770
- * error - Error message if failed
3771
-
3772
- Raises:
3773
- Exception: If status check fails
3774
- """
3775
- headers = self._prepare_headers()
3776
- endpoint = f'/v1/batch/scrape/{id}'
3777
-
3778
- status_data = await self._async_get_request(
3779
- f'{self.api_url}{endpoint}',
3780
- headers
3781
- )
3782
-
3783
- if status_data['status'] == 'completed':
3784
- if 'data' in status_data:
3785
- data = status_data['data']
3786
- while 'next' in status_data:
3787
- if len(status_data['data']) == 0:
3788
- break
3789
- next_url = status_data.get('next')
3790
- if not next_url:
3791
- logger.warning("Expected 'next' URL is missing.")
3792
- break
3793
- next_data = await self._async_get_request(next_url, headers)
3794
- data.extend(next_data.get('data', []))
3795
- status_data = next_data
3796
- status_data['data'] = data
3797
-
3798
-         # Build a plain dict so the item assignments and ** unpacking below work
-         # (a BatchScrapeStatusResponse model supports neither).
-         response = {
-             'status': status_data.get('status'),
-             'total': status_data.get('total'),
-             'completed': status_data.get('completed'),
-             'creditsUsed': status_data.get('creditsUsed'),
-             'expiresAt': status_data.get('expiresAt'),
-             'data': status_data.get('data')
-         }
3806
-
3807
- if 'error' in status_data:
3808
- response['error'] = status_data['error']
3809
-
3810
- if 'next' in status_data:
3811
- response['next'] = status_data['next']
3812
-
3813
- return {
3814
- 'success': False if 'error' in status_data else True,
3815
- **response
3816
- }
3817
-
3818
- async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
3819
- """
3820
- Get information about errors from an asynchronous batch scrape job.
3821
-
3822
- Args:
3823
- id (str): The ID of the batch scrape job
3824
-
3825
- Returns:
3826
- CrawlErrorsResponse containing:
3827
- errors (List[Dict[str, str]]): List of errors with fields:
3828
- * id (str): Error ID
3829
- * timestamp (str): When the error occurred
3830
- * url (str): URL that caused the error
3831
- * error (str): Error message
3832
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3833
-
3834
- Raises:
3835
- Exception: If error check fails
3836
- """
3837
- headers = self._prepare_headers()
3838
- return await self._async_get_request(
3839
- f'{self.api_url}/v1/batch/scrape/{id}/errors',
3840
- headers
3841
- )
3842
-
3843
- async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
3844
- """
3845
- Get information about errors from an asynchronous crawl job.
3846
-
3847
- Args:
3848
- id (str): The ID of the crawl job
3849
-
3850
- Returns:
3851
- CrawlErrorsResponse containing:
3852
- * errors (List[Dict[str, str]]): List of errors with fields:
3853
- - id (str): Error ID
3854
- - timestamp (str): When the error occurred
3855
- - url (str): URL that caused the error
3856
- - error (str): Error message
3857
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3858
-
3859
- Raises:
3860
- Exception: If error check fails
3861
- """
3862
- headers = self._prepare_headers()
3863
- return await self._async_get_request(
3864
- f'{self.api_url}/v1/crawl/{id}/errors',
3865
- headers
3866
- )
3867
-
3868
- async def cancel_crawl(self, id: str) -> Dict[str, Any]:
3869
- """
3870
- Cancel an asynchronous crawl job.
3871
-
3872
- Args:
3873
- id (str): The ID of the crawl job to cancel
3874
-
3875
- Returns:
3876
- Dict[str, Any] containing:
3877
- * success (bool): Whether cancellation was successful
3878
- * error (str, optional): Error message if cancellation failed
3879
-
3880
- Raises:
3881
- Exception: If cancellation fails
3882
- """
3883
- headers = self._prepare_headers()
3884
- async with aiohttp.ClientSession() as session:
3885
- async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
3886
- return await response.json()
3887
-
3888
- async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
3889
- """
3890
- Check the status of an asynchronous extraction job.
3891
-
3892
- Args:
3893
- job_id (str): The ID of the extraction job
3894
-
3895
- Returns:
3896
- ExtractResponse[Any] with:
3897
- * success (bool): Whether request succeeded
3898
- * data (Optional[Any]): Extracted data matching schema
3899
- * error (Optional[str]): Error message if any
3900
- * warning (Optional[str]): Warning message if any
3901
- * sources (Optional[List[str]]): Source URLs if requested
3902
-
3903
- Raises:
3904
- ValueError: If status check fails
3905
- """
3906
- headers = self._prepare_headers()
3907
- try:
3908
- return await self._async_get_request(
3909
- f'{self.api_url}/v1/extract/{job_id}',
3910
- headers
3911
- )
3912
- except Exception as e:
3913
- raise ValueError(str(e))
3914
-
3915
- async def async_extract(
3916
- self,
3917
- urls: Optional[List[str]] = None,
3918
- *,
3919
- prompt: Optional[str] = None,
3920
- schema: Optional[Any] = None,
3921
- system_prompt: Optional[str] = None,
3922
- allow_external_links: Optional[bool] = False,
3923
- enable_web_search: Optional[bool] = False,
3924
- show_sources: Optional[bool] = False,
3925
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3926
- """
3927
- Initiate an asynchronous extraction job without waiting for completion.
3928
-
3929
- Args:
3930
- urls (Optional[List[str]]): URLs to extract from
3931
- prompt (Optional[str]): Custom extraction prompt
3932
- schema (Optional[Any]): JSON schema/Pydantic model
3933
- system_prompt (Optional[str]): System context
3934
- allow_external_links (Optional[bool]): Follow external links
3935
- enable_web_search (Optional[bool]): Enable web search
3936
- show_sources (Optional[bool]): Include source URLs
3937
- agent (Optional[Dict[str, Any]]): Agent configuration
3938
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3939
-
3940
- Returns:
3941
- ExtractResponse[Any] with:
3942
- * success (bool): Whether request succeeded
3943
- * data (Optional[Any]): Extracted data matching schema
3944
- * error (Optional[str]): Error message if any
3945
-
3946
- Raises:
3947
- ValueError: If job initiation fails
3948
- """
3949
- headers = self._prepare_headers()
3950
-
3951
- if not prompt and not schema:
3952
- raise ValueError("Either prompt or schema is required")
3953
-
3954
- if not urls and not prompt:
3955
- raise ValueError("Either urls or prompt is required")
3956
-
3957
- if schema:
3958
- schema = self._ensure_schema_dict(schema)
3959
-
3960
-         # Build the request payload as a plain dict (mirroring extract() above);
-         # constructing an ExtractResponse model here breaks the item assignments below.
-         request_data = {
-             'urls': urls or [],
-             'allowExternalLinks': allow_external_links,
-             'enableWebSearch': enable_web_search,
-             'showSources': show_sources,
-             'schema': schema,
-             'origin': f'python-sdk@{version}'
-         }
3968
-
3969
- if prompt:
3970
- request_data['prompt'] = prompt
3971
- if system_prompt:
3972
- request_data['systemPrompt'] = system_prompt
3973
- if agent:
3974
- request_data['agent'] = agent
3975
-
3976
- try:
3977
- return await self._async_post_request(
3978
- f'{self.api_url}/v1/extract',
3979
- request_data,
3980
- headers
3981
- )
3982
- except Exception as e:
3983
- raise ValueError(str(e))
3984
-
3985
- async def generate_llms_text(
3986
- self,
3987
- url: str,
3988
- *,
3989
- max_urls: Optional[int] = None,
3990
- show_full_text: Optional[bool] = None,
3991
-         cache: Optional[bool] = None,
-         experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
3992
- """
3993
- Generate LLMs.txt for a given URL and monitor until completion.
3994
-
3995
- Args:
3996
- url (str): Target URL to generate LLMs.txt from
3997
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
3998
-             show_full_text (Optional[bool]): Include full text in output (default: False)
-             cache (Optional[bool]): Whether to use cached content if available (default: True)
3999
- experimental_stream (Optional[bool]): Enable experimental streaming
4000
-
4001
- Returns:
4002
- GenerateLLMsTextStatusResponse containing:
4003
- * success (bool): Whether generation completed successfully
4004
- * status (str): Status of generation (processing/completed/failed)
4005
- * data (Dict[str, str], optional): Generated text with fields:
4006
- - llmstxt (str): Generated LLMs.txt content
4007
- - llmsfulltxt (str, optional): Full version if requested
4008
- * error (str, optional): Error message if generation failed
4009
- * expiresAt (str): When the generated data expires
4010
-
4011
- Raises:
4012
- Exception: If generation fails
4013
- """
4014
- params = {}
4015
- if max_urls is not None:
4016
- params['maxUrls'] = max_urls
4017
- if show_full_text is not None:
4018
- params['showFullText'] = show_full_text
4019
- if experimental_stream is not None:
4020
- params['__experimental_stream'] = experimental_stream
4021
-
4022
- response = await self.async_generate_llms_text(
4023
- url,
4024
- max_urls=max_urls,
4025
- show_full_text=show_full_text,
4026
- cache=cache,
4027
- experimental_stream=experimental_stream
4028
- )
4029
- if not response.get('success') or 'id' not in response:
4030
- return response
4031
-
4032
- job_id = response['id']
4033
- while True:
4034
- status = await self.check_generate_llms_text_status(job_id)
4035
-
4036
- if status['status'] == 'completed':
4037
- return status
4038
- elif status['status'] == 'failed':
4039
- raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
4040
- elif status['status'] != 'processing':
4041
- break
4042
-
4043
- await asyncio.sleep(2)
4044
-
4045
- return GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly')
4046
-
4047
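A rough sketch of the LLMs.txt helper above. In this implementation the completed-status payload is returned as the raw dict from the status endpoint, so it is accessed with dict methods here (`app` as before):

result = await app.generate_llms_text("https://example.com", max_urls=5, show_full_text=True)
data = result.get("data", {}) if isinstance(result, dict) else {}
print(data.get("llmstxt", "")[:500])  # first 500 characters of the generated llms.txt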
- async def async_generate_llms_text(
4048
- self,
4049
- url: str,
4050
- *,
4051
- max_urls: Optional[int] = None,
4052
- show_full_text: Optional[bool] = None,
4053
- cache: Optional[bool] = None,
4054
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
4055
- """
4056
- Initiate an asynchronous LLMs.txt generation job without waiting for completion.
4057
-
4058
- Args:
4059
- url (str): Target URL to generate LLMs.txt from
4060
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
4061
- show_full_text (Optional[bool]): Include full text in output (default: False)
4062
- cache (Optional[bool]): Whether to use cached content if available (default: True)
4063
- experimental_stream (Optional[bool]): Enable experimental streaming
4064
-
4065
- Returns:
4066
- GenerateLLMsTextResponse containing:
4067
- * success (bool): Whether job started successfully
4068
- * id (str): Unique identifier for the job
4069
- * error (str, optional): Error message if start failed
4070
-
4071
- Raises:
4072
- ValueError: If job initiation fails
4073
- """
4074
- params = {}
4075
- if max_urls is not None:
4076
- params['maxUrls'] = max_urls
4077
- if show_full_text is not None:
4078
- params['showFullText'] = show_full_text
4079
- if experimental_stream is not None:
4080
- params['__experimental_stream'] = experimental_stream
4081
-
4082
- params = GenerateLLMsTextParams(
4083
- maxUrls=max_urls,
4084
- showFullText=show_full_text,
4085
- cache=cache,
4086
- __experimental_stream=experimental_stream
4087
- )
4088
-
4089
- headers = self._prepare_headers()
4090
- json_data = {'url': url, **params.dict(exclude_none=True)}
4091
- json_data['origin'] = f"python-sdk@{version}"
4092
-
4093
- try:
4094
- return await self._async_post_request(
4095
- f'{self.api_url}/v1/llmstxt',
4096
- json_data,
4097
- headers
4098
- )
4099
- except Exception as e:
4100
- raise ValueError(str(e))
4101
-
4102
- async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
4103
- """
4104
- Check the status of an asynchronous LLMs.txt generation job.
4105
-
4106
- Args:
4107
- id (str): The ID of the generation job
4108
-
4109
- Returns:
4110
- GenerateLLMsTextStatusResponse containing:
4111
- * success (bool): Whether generation completed successfully
4112
- * status (str): Status of generation (processing/completed/failed)
4113
- * data (Dict[str, str], optional): Generated text with fields:
4114
- - llmstxt (str): Generated LLMs.txt content
4115
- - llmsfulltxt (str, optional): Full version if requested
4116
- * error (str, optional): Error message if generation failed
4117
- * expiresAt (str): When the generated data expires
4118
-
4119
- Raises:
4120
- ValueError: If status check fails
4121
- """
4122
- headers = self._prepare_headers()
4123
- try:
4124
- return await self._async_get_request(
4125
- f'{self.api_url}/v1/llmstxt/{id}',
4126
- headers
4127
- )
4128
- except Exception as e:
4129
- raise ValueError(str(e))
4130
-
4131
- async def deep_research(
- self,
- query: str,
- *,
- max_depth: Optional[int] = None,
- time_limit: Optional[int] = None,
- max_urls: Optional[int] = None,
- analysis_prompt: Optional[str] = None,
- system_prompt: Optional[str] = None,
- __experimental_stream_steps: Optional[bool] = None,
- on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
- on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
- """
- Initiates a deep research operation on a given query and polls until completion.
-
- Args:
- query (str): Research query or topic to investigate
- max_depth (Optional[int]): Maximum depth of research exploration
- time_limit (Optional[int]): Time limit in seconds for research
- max_urls (Optional[int]): Maximum number of URLs to process
- analysis_prompt (Optional[str]): Custom prompt for analysis
- system_prompt (Optional[str]): Custom system prompt
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
- on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
- on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
-
- Returns:
- DeepResearchStatusResponse containing:
- * success (bool): Whether research completed successfully
- * status (str): Current state (processing/completed/failed)
- * error (Optional[str]): Error message if failed
- * id (str): Unique identifier for the research job
- * data (Any): Research findings and analysis
- * sources (List[Dict]): List of discovered sources
- * activities (List[Dict]): Research progress log
- * summaries (List[str]): Generated research summaries
-
- Raises:
- Exception: If research fails
- """
- research_params = {}
- if max_depth is not None:
- research_params['maxDepth'] = max_depth
- if time_limit is not None:
- research_params['timeLimit'] = time_limit
- if max_urls is not None:
- research_params['maxUrls'] = max_urls
- if analysis_prompt is not None:
- research_params['analysisPrompt'] = analysis_prompt
- if system_prompt is not None:
- research_params['systemPrompt'] = system_prompt
- if __experimental_stream_steps is not None:
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
- research_params = DeepResearchParams(**research_params)
-
- response = await self.async_deep_research(
- query,
- max_depth=max_depth,
- time_limit=time_limit,
- max_urls=max_urls,
- analysis_prompt=analysis_prompt,
- system_prompt=system_prompt
- )
- if not response.get('success') or 'id' not in response:
- return response
-
- job_id = response['id']
- last_activity_count = 0
- last_source_count = 0
-
- while True:
- status = await self.check_deep_research_status(job_id)
-
- if on_activity and 'activities' in status:
- new_activities = status['activities'][last_activity_count:]
- for activity in new_activities:
- on_activity(activity)
- last_activity_count = len(status['activities'])
-
- if on_source and 'sources' in status:
- new_sources = status['sources'][last_source_count:]
- for source in new_sources:
- on_source(source)
- last_source_count = len(status['sources'])
-
- if status['status'] == 'completed':
- return status
- elif status['status'] == 'failed':
- raise Exception(f'Deep research failed. Error: {status.get("error")}')
- elif status['status'] != 'processing':
- break
-
- await asyncio.sleep(2)
-
- return DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')
-
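A usage sketch for the deep_research waiter above, which starts a job and polls until completion while streaming progress through the callbacks. Assumptions: the import path, the api_key constructor argument, and dict-style access on the result; the callback payload keys follow the docstring above.

import asyncio
from firecrawl import AsyncFirecrawlApp  # import path assumed

def print_activity(activity: dict) -> None:
    # Per the docstring, activities carry {type, status, message, timestamp, depth}.
    print(f"[depth {activity.get('depth')}] {activity.get('message')}")

async def run_research() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # constructor shape assumed
    result = await app.deep_research(
        "How do coral reefs recover after bleaching events?",
        max_depth=3,
        time_limit=120,
        max_urls=15,
        on_activity=print_activity,
    )
    if result.get('status') == 'completed':
        print(result.get('data'))

# asyncio.run(run_research())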
- async def async_deep_research(
- self,
- query: str,
- *,
- max_depth: Optional[int] = None,
- time_limit: Optional[int] = None,
- max_urls: Optional[int] = None,
- analysis_prompt: Optional[str] = None,
- system_prompt: Optional[str] = None,
- __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
- """
- Initiates an asynchronous deep research operation.
-
- Args:
- query (str): Research query or topic to investigate
- max_depth (Optional[int]): Maximum depth of research exploration
- time_limit (Optional[int]): Time limit in seconds for research
- max_urls (Optional[int]): Maximum number of URLs to process
- analysis_prompt (Optional[str]): Custom prompt for analysis
- system_prompt (Optional[str]): Custom system prompt
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
-
- Returns:
- Dict[str, Any]: A response containing:
- * success (bool): Whether the research initiation was successful
- * id (str): The unique identifier for the research job
- * error (str, optional): Error message if initiation failed
-
- Raises:
- Exception: If the research initiation fails.
- """
- research_params = {}
- if max_depth is not None:
- research_params['maxDepth'] = max_depth
- if time_limit is not None:
- research_params['timeLimit'] = time_limit
- if max_urls is not None:
- research_params['maxUrls'] = max_urls
- if analysis_prompt is not None:
- research_params['analysisPrompt'] = analysis_prompt
- if system_prompt is not None:
- research_params['systemPrompt'] = system_prompt
- if __experimental_stream_steps is not None:
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
- research_params = DeepResearchParams(**research_params)
-
- headers = self._prepare_headers()
-
- json_data = {'query': query, **research_params.dict(exclude_none=True)}
- json_data['origin'] = f"python-sdk@{version}"
-
- try:
- return await self._async_post_request(
- f'{self.api_url}/v1/deep-research',
- json_data,
- headers
- )
- except Exception as e:
- raise ValueError(str(e))
-
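The non-blocking variant above only submits the job and returns its id. A short kickoff sketch, under the same import and constructor assumptions as before:

import asyncio
from firecrawl import AsyncFirecrawlApp  # import path assumed

async def start_research(query: str) -> str:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # constructor shape assumed
    response = await app.async_deep_research(query, max_depth=2, time_limit=90)
    if not response.get('success'):
        raise RuntimeError(response.get('error', 'failed to start deep research'))
    return response['id']

# job_id = asyncio.run(start_research("state of solid-state batteries"))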
- async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
- """
- Check the status of a deep research operation.
-
- Args:
- id (str): The ID of the deep research operation.
-
- Returns:
- DeepResearchResponse containing:
-
- Status:
- * success - Whether research completed successfully
- * status - Current state (processing/completed/failed)
- * error - Error message if failed
-
- Results:
- * id - Unique identifier for the research job
- * data - Research findings and analysis
- * sources - List of discovered sources
- * activities - Research progress log
- * summaries - Generated research summaries
-
- Raises:
- Exception: If the status check fails.
- """
- headers = self._prepare_headers()
- try:
- return await self._async_get_request(
- f'{self.api_url}/v1/deep-research/{id}',
- headers
- )
- except Exception as e:
- raise ValueError(str(e))
-
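Together with async_deep_research, the status check above supports a manual polling loop, the same pattern the deep_research waiter uses internally. The job id is assumed to come from a prior async_deep_research call, and dict-style access on the status mirrors the code above:

import asyncio
from firecrawl import AsyncFirecrawlApp  # import path assumed

async def poll_research(app: AsyncFirecrawlApp, job_id: str) -> dict:
    # Check every 2 seconds until the job reaches a terminal state.
    while True:
        status = await app.check_deep_research_status(job_id)
        if status['status'] == 'completed':
            return status
        if status['status'] == 'failed':
            raise RuntimeError(status.get('error', 'deep research failed'))
        await asyncio.sleep(2)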
- async def search(
- self,
- query: str,
- *,
- limit: Optional[int] = None,
- tbs: Optional[str] = None,
- filter: Optional[str] = None,
- lang: Optional[str] = None,
- country: Optional[str] = None,
- location: Optional[str] = None,
- timeout: Optional[int] = None,
- scrape_options: Optional[ScrapeOptions] = None,
- params: Optional[Union[Dict[str, Any], SearchParams]] = None,
- **kwargs) -> SearchResponse:
- """
- Asynchronously search for content using Firecrawl.
-
- Args:
- query (str): Search query string
- limit (Optional[int]): Max results (default: 5)
- tbs (Optional[str]): Time filter (e.g. "qdr:d")
- filter (Optional[str]): Custom result filter
- lang (Optional[str]): Language code (default: "en")
- country (Optional[str]): Country code (default: "us")
- location (Optional[str]): Geo-targeting
- timeout (Optional[int]): Request timeout in milliseconds
- scrape_options (Optional[ScrapeOptions]): Result scraping configuration
- params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
- **kwargs: Additional keyword arguments for future compatibility
-
- Returns:
- SearchResponse: Response containing:
- * success (bool): Whether request succeeded
- * data (List[FirecrawlDocument]): Search results
- * warning (Optional[str]): Warning message if any
- * error (Optional[str]): Error message if any
-
- Raises:
- Exception: If search fails or response cannot be parsed
- """
- # Build search parameters
- search_params = {}
- if params:
- if isinstance(params, dict):
- search_params.update(params)
- else:
- search_params.update(params.dict(exclude_none=True))
-
- # Add individual parameters
- if limit is not None:
- search_params['limit'] = limit
- if tbs is not None:
- search_params['tbs'] = tbs
- if filter is not None:
- search_params['filter'] = filter
- if lang is not None:
- search_params['lang'] = lang
- if country is not None:
- search_params['country'] = country
- if location is not None:
- search_params['location'] = location
- if timeout is not None:
- search_params['timeout'] = timeout
- if scrape_options is not None:
- search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
-
- # Add any additional kwargs
- search_params.update(kwargs)
-
- # Create final params object
- final_params = SearchParams(query=query, **search_params)
- params_dict = final_params.dict(exclude_none=True)
- params_dict['origin'] = f"python-sdk@{version}"
-
- return await self._async_post_request(
- f"{self.api_url}/v1/search",
- params_dict,
- {"Authorization": f"Bearer {self.api_key}"}
- )
-
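A usage sketch for the async search method above. Assumptions: the import path, the api_key constructor argument, dict-style access on the response, and the field names on each returned document, which are not spelled out in this diff:

import asyncio
from firecrawl import AsyncFirecrawlApp  # import path assumed

async def run_search() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # constructor shape assumed
    results = await app.search("firecrawl python sdk", limit=3, lang="en", country="us")
    for doc in results.get('data', []):
        # 'url' and 'title' keys on FirecrawlDocument entries are assumed here.
        print(doc.get('url'), doc.get('title'))

# asyncio.run(run_search())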
- class AsyncCrawlWatcher(CrawlWatcher):
- """
- Async version of CrawlWatcher that properly handles async operations.
- """
- def __init__(self, id: str, app: AsyncFirecrawlApp):
- super().__init__(id, app)
-
- async def connect(self) -> None:
- """
- Establishes async WebSocket connection and starts listening for messages.
- """
- async with websockets.connect(
- self.ws_url,
- additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
- ) as websocket:
- await self._listen(websocket)
-
- async def _listen(self, websocket) -> None:
- """
- Listens for incoming WebSocket messages and handles them asynchronously.
-
- Args:
- websocket: The WebSocket connection object
- """
- async for message in websocket:
- msg = json.loads(message)
- await self._handle_message(msg)
-
- async def _handle_message(self, msg: Dict[str, Any]) -> None:
- """
- Handles incoming WebSocket messages based on their type asynchronously.
-
- Args:
- msg (Dict[str, Any]): The message to handle
- """
- if msg['type'] == 'done':
- self.status = 'completed'
- self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
- elif msg['type'] == 'error':
- self.status = 'failed'
- self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
- elif msg['type'] == 'catchup':
- self.status = msg['data']['status']
- self.data.extend(msg['data'].get('data', []))
- for doc in self.data:
- self.dispatch_event('document', {'data': doc, 'id': self.id})
- elif msg['type'] == 'document':
- self.data.append(msg['data'])
- self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
-
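A sketch of driving the AsyncCrawlWatcher above for an existing crawl job. Assumptions: the module path used for the import, the event-listener registration method (add_event_listener) and the handler signature, both inherited from CrawlWatcher and not shown in this diff, and the api_key constructor argument.

import asyncio
from firecrawl import AsyncFirecrawlApp            # import path assumed
from firecrawl.firecrawl import AsyncCrawlWatcher  # module path assumed

async def watch_crawl(crawl_id: str) -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # constructor shape assumed
    watcher = AsyncCrawlWatcher(crawl_id, app)

    # add_event_listener is assumed to exist on the CrawlWatcher base class and to
    # invoke handlers with the detail dicts passed to dispatch_event above.
    watcher.add_event_listener('document', lambda detail: print(detail['data']))
    watcher.add_event_listener('done', lambda detail: print('crawl finished:', detail['status']))

    await watcher.connect()  # runs until the WebSocket closes

# asyncio.run(watch_crawl("your-crawl-id"))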
- async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
- """
- Handle errors from async API responses.
- """
- try:
- error_data = await response.json()
- error_message = error_data.get('error', 'No error message provided.')
- error_details = error_data.get('details', 'No additional error details provided.')
- except:
- raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
-
- # Use the app's method to get the error message
- message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
-
- raise aiohttp.ClientError(message)
-
- async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
- """
- Generate a standardized error message based on HTTP status code for async operations.
-
- Args:
- status_code (int): The HTTP status code from the response
- action (str): Description of the action that was being performed
- error_message (str): The error message from the API response
- error_details (str): Additional error details from the API response
-
- Returns:
- str: A formatted error message
- """
- return self._get_error_message(status_code, action, error_message, error_details)
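Because _handle_error wraps API failures in aiohttp.ClientError (with a message standardized by _get_async_error_message / _get_error_message) while several helpers in this file re-raise failures as ValueError, a caller-side sketch would catch both. Which exception a given call surfaces depends on the helper it goes through, so this is an assumption; the import path and constructor argument are assumed as before.

import asyncio
import aiohttp
from firecrawl import AsyncFirecrawlApp  # import path assumed

async def safe_search(query: str) -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # constructor shape assumed
    try:
        results = await app.search(query, limit=3)
        print(results.get('data', []))
    except aiohttp.ClientError as exc:
        # Raised when an API error response is translated by _handle_error above.
        print(f"Firecrawl request failed: {exc}")
    except ValueError as exc:
        # Raised by helpers such as async_deep_research that wrap exceptions in ValueError.
        print(f"Firecrawl request failed: {exc}")

# asyncio.run(safe_search("firecrawl"))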