firecrawl 1.17.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of firecrawl might be problematic.

firecrawl/firecrawl.py CHANGED
@@ -12,15 +12,293 @@ Classes:
12
12
  import logging
13
13
  import os
14
14
  import time
15
- from typing import Any, Dict, Optional, List, Union, Callable
15
+ from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
16
16
  import json
17
-
17
+ from datetime import datetime
18
+ import re
19
+ import warnings
18
20
  import requests
19
21
  import pydantic
20
22
  import websockets
23
+ import aiohttp
24
+ import asyncio
25
+ from pydantic import Field
26
+
27
+ # Suppress Pydantic warnings about attribute shadowing
28
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
29
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
30
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractConfig\" shadows an attribute in parent \"BaseModel\"")
31
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
32
+
33
+
34
+ def get_version():
35
+ try:
36
+ from pathlib import Path
37
+ package_path = os.path.dirname(__file__)
38
+ version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
39
+ version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
40
+ if version_match:
41
+ return version_match.group(1).strip()
42
+ except Exception:
43
+ print("Failed to get version from __init__.py")
44
+ return None
45
+
46
+ version = get_version()
21
47
 
22
48
  logger : logging.Logger = logging.getLogger("firecrawl")
23
49
 
50
+ T = TypeVar('T')
51
+
52
+ # class FirecrawlDocumentMetadata(pydantic.BaseModel):
53
+ # """Metadata for a Firecrawl document."""
54
+ # title: Optional[str] = None
55
+ # description: Optional[str] = None
56
+ # language: Optional[str] = None
57
+ # keywords: Optional[str] = None
58
+ # robots: Optional[str] = None
59
+ # ogTitle: Optional[str] = None
60
+ # ogDescription: Optional[str] = None
61
+ # ogUrl: Optional[str] = None
62
+ # ogImage: Optional[str] = None
63
+ # ogAudio: Optional[str] = None
64
+ # ogDeterminer: Optional[str] = None
65
+ # ogLocale: Optional[str] = None
66
+ # ogLocaleAlternate: Optional[List[str]] = None
67
+ # ogSiteName: Optional[str] = None
68
+ # ogVideo: Optional[str] = None
69
+ # dctermsCreated: Optional[str] = None
70
+ # dcDateCreated: Optional[str] = None
71
+ # dcDate: Optional[str] = None
72
+ # dctermsType: Optional[str] = None
73
+ # dcType: Optional[str] = None
74
+ # dctermsAudience: Optional[str] = None
75
+ # dctermsSubject: Optional[str] = None
76
+ # dcSubject: Optional[str] = None
77
+ # dcDescription: Optional[str] = None
78
+ # dctermsKeywords: Optional[str] = None
79
+ # modifiedTime: Optional[str] = None
80
+ # publishedTime: Optional[str] = None
81
+ # articleTag: Optional[str] = None
82
+ # articleSection: Optional[str] = None
83
+ # sourceURL: Optional[str] = None
84
+ # statusCode: Optional[int] = None
85
+ # error: Optional[str] = None
86
+
87
+ class AgentOptions(pydantic.BaseModel):
88
+ """Configuration for the agent."""
89
+ model: Literal["FIRE-1"] = "FIRE-1"
90
+ prompt: Optional[str] = None
91
+
92
+ class AgentOptionsExtract(pydantic.BaseModel):
93
+ """Configuration for the agent in extract operations."""
94
+ model: Literal["FIRE-1"] = "FIRE-1"
95
+
96
+ class ActionsResult(pydantic.BaseModel):
97
+ """Result of actions performed during scraping."""
98
+ screenshots: List[str]
99
+
100
+ class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
101
+ """Document retrieved or processed by Firecrawl."""
102
+ url: Optional[str] = None
103
+ markdown: Optional[str] = None
104
+ html: Optional[str] = None
105
+ rawHtml: Optional[str] = None
106
+ links: Optional[List[str]] = None
107
+ extract: Optional[T] = None
108
+ json: Optional[T] = None
109
+ screenshot: Optional[str] = None
110
+ metadata: Optional[Any] = None
111
+ actions: Optional[ActionsResult] = None
112
+ title: Optional[str] = None # v1 search only
113
+ description: Optional[str] = None # v1 search only
114
+
115
+ class LocationConfig(pydantic.BaseModel):
116
+ """Location configuration for scraping."""
117
+ country: Optional[str] = None
118
+ languages: Optional[List[str]] = None
119
+
120
+ class WebhookConfig(pydantic.BaseModel):
121
+ """Configuration for webhooks."""
122
+ url: str
123
+ headers: Optional[Dict[str, str]] = None
124
+ metadata: Optional[Dict[str, str]] = None
125
+ events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
126
+
127
+ class CommonOptions(pydantic.BaseModel):
128
+ """Parameters for scraping operations."""
129
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None
130
+ headers: Optional[Dict[str, str]] = None
131
+ includeTags: Optional[List[str]] = None
132
+ excludeTags: Optional[List[str]] = None
133
+ onlyMainContent: Optional[bool] = None
134
+ waitFor: Optional[int] = None
135
+ timeout: Optional[int] = None
136
+ location: Optional[LocationConfig] = None
137
+ mobile: Optional[bool] = None
138
+ skipTlsVerification: Optional[bool] = None
139
+ removeBase64Images: Optional[bool] = None
140
+ blockAds: Optional[bool] = None
141
+ proxy: Optional[Literal["basic", "stealth"]] = None
142
+
143
+ class WaitAction(pydantic.BaseModel):
144
+ """Wait action to perform during scraping."""
145
+ type: Literal["wait"]
146
+ milliseconds: int
147
+ selector: Optional[str] = None
148
+
149
+ class ScreenshotAction(pydantic.BaseModel):
150
+ """Screenshot action to perform during scraping."""
151
+ type: Literal["screenshot"]
152
+ fullPage: Optional[bool] = None
153
+
154
+ class ClickAction(pydantic.BaseModel):
155
+ """Click action to perform during scraping."""
156
+ type: Literal["click"]
157
+ selector: str
158
+
159
+ class WriteAction(pydantic.BaseModel):
160
+ """Write action to perform during scraping."""
161
+ type: Literal["write"]
162
+ text: str
163
+
164
+ class PressAction(pydantic.BaseModel):
165
+ """Press action to perform during scraping."""
166
+ type: Literal["press"]
167
+ key: str
168
+
169
+ class ScrollAction(pydantic.BaseModel):
170
+ """Scroll action to perform during scraping."""
171
+ type: Literal["scroll"]
172
+ direction: Literal["up", "down"]
173
+ selector: Optional[str] = None
174
+
175
+ class ScrapeAction(pydantic.BaseModel):
176
+ """Scrape action to perform during scraping."""
177
+ type: Literal["scrape"]
178
+
179
+ class ExecuteJavascriptAction(pydantic.BaseModel):
180
+ """Execute javascript action to perform during scraping."""
181
+ type: Literal["executeJavascript"]
182
+ script: str
183
+
184
+
185
+ class ExtractAgent(pydantic.BaseModel):
186
+ """Configuration for the agent in extract operations."""
187
+ model: Literal["FIRE-1"] = "FIRE-1"
188
+
189
+ class ExtractConfig(pydantic.BaseModel):
190
+ """Configuration for extraction."""
191
+ prompt: Optional[str] = None
192
+ schema: Optional[Any] = None
193
+ systemPrompt: Optional[str] = None
194
+ agent: Optional[ExtractAgent] = None
195
+
196
+ class ScrapeParams(CommonOptions):
197
+ """Parameters for scraping operations."""
198
+ extract: Optional[ExtractConfig] = None
199
+ jsonOptions: Optional[ExtractConfig] = None
200
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
201
+ agent: Optional[AgentOptions] = None
202
+
203
+ class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
204
+ """Response from scraping operations."""
205
+ success: bool = True
206
+ warning: Optional[str] = None
207
+ error: Optional[str] = None
208
+
209
+ class BatchScrapeResponse(pydantic.BaseModel):
210
+ """Response from batch scrape operations."""
211
+ id: Optional[str] = None
212
+ url: Optional[str] = None
213
+ success: bool = True
214
+ error: Optional[str] = None
215
+ invalidURLs: Optional[List[str]] = None
216
+
217
+ class BatchScrapeStatusResponse(pydantic.BaseModel):
218
+ """Response from batch scrape status checks."""
219
+ success: bool = True
220
+ status: Literal["scraping", "completed", "failed", "cancelled"]
221
+ completed: int
222
+ total: int
223
+ creditsUsed: int
224
+ expiresAt: datetime
225
+ next: Optional[str] = None
226
+ data: List[FirecrawlDocument]
227
+
228
+ class CrawlParams(pydantic.BaseModel):
229
+ """Parameters for crawling operations."""
230
+ includePaths: Optional[List[str]] = None
231
+ excludePaths: Optional[List[str]] = None
232
+ maxDepth: Optional[int] = None
233
+ maxDiscoveryDepth: Optional[int] = None
234
+ limit: Optional[int] = None
235
+ allowBackwardLinks: Optional[bool] = None
236
+ allowExternalLinks: Optional[bool] = None
237
+ ignoreSitemap: Optional[bool] = None
238
+ scrapeOptions: Optional[CommonOptions] = None
239
+ webhook: Optional[Union[str, WebhookConfig]] = None
240
+ deduplicateSimilarURLs: Optional[bool] = None
241
+ ignoreQueryParameters: Optional[bool] = None
242
+ regexOnFullURL: Optional[bool] = None
243
+
244
+ class CrawlResponse(pydantic.BaseModel):
245
+ """Response from crawling operations."""
246
+ id: Optional[str] = None
247
+ url: Optional[str] = None
248
+ success: bool = True
249
+ error: Optional[str] = None
250
+
251
+ class CrawlStatusResponse(pydantic.BaseModel):
252
+ """Response from crawl status checks."""
253
+ success: bool = True
254
+ status: Literal["scraping", "completed", "failed", "cancelled"]
255
+ completed: int
256
+ total: int
257
+ creditsUsed: int
258
+ expiresAt: datetime
259
+ next: Optional[str] = None
260
+ data: List[FirecrawlDocument]
261
+
262
+ class CrawlErrorsResponse(pydantic.BaseModel):
263
+ """Response from crawl/batch scrape error monitoring."""
264
+ errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str}
265
+ robotsBlocked: List[str]
266
+
267
+ class MapParams(pydantic.BaseModel):
268
+ """Parameters for mapping operations."""
269
+ search: Optional[str] = None
270
+ ignoreSitemap: Optional[bool] = None
271
+ includeSubdomains: Optional[bool] = None
272
+ sitemapOnly: Optional[bool] = None
273
+ limit: Optional[int] = None
274
+ timeout: Optional[int] = None
275
+
276
+ class MapResponse(pydantic.BaseModel):
277
+ """Response from mapping operations."""
278
+ success: bool = True
279
+ links: Optional[List[str]] = None
280
+ error: Optional[str] = None
281
+
282
+ class ExtractParams(pydantic.BaseModel):
283
+ """Parameters for extracting information from URLs."""
284
+ prompt: Optional[str] = None
285
+ schema: Optional[Any] = None
286
+ systemPrompt: Optional[str] = None
287
+ allowExternalLinks: Optional[bool] = None
288
+ enableWebSearch: Optional[bool] = None
289
+ includeSubdomains: Optional[bool] = None
290
+ origin: Optional[str] = None
291
+ showSources: Optional[bool] = None
292
+ scrapeOptions: Optional[CommonOptions] = None
293
+
294
+ class ExtractResponse(pydantic.BaseModel, Generic[T]):
295
+ """Response from extract operations."""
296
+ success: bool = True
297
+ data: Optional[T] = None
298
+ error: Optional[str] = None
299
+ warning: Optional[str] = None
300
+ sources: Optional[List[str]] = None
301
+
24
302
  class SearchParams(pydantic.BaseModel):
25
303
  query: str
26
304
  limit: Optional[int] = 5
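The models added in this hunk replace the loose params dicts used in 1.17.0. Below is a minimal sketch of how the typed option objects might be constructed, based only on the fields visible above; the schema class, prompt, and selector values are invented for illustration.

```python
# Sketch only: building the new typed option models from this diff.
# ArticleSchema and the selector/prompt values are hypothetical.
from pydantic import BaseModel
from firecrawl.firecrawl import (
    CommonOptions, ExtractConfig, LocationConfig,
    WaitAction, ClickAction, ScreenshotAction,
)

class ArticleSchema(BaseModel):          # hypothetical extraction schema
    title: str
    author: str

scrape_options = CommonOptions(
    formats=["markdown", "links"],
    onlyMainContent=True,
    location=LocationConfig(country="us", languages=["en"]),
)

extract_config = ExtractConfig(
    prompt="Extract the article title and author",
    schema=ArticleSchema,                # the SDK converts this via .schema(), see scrape_url below
)

actions = [
    WaitAction(type="wait", milliseconds=1000),
    ClickAction(type="click", selector="#load-more"),
    ScreenshotAction(type="screenshot", fullPage=True),
]
```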
@@ -31,7 +309,14 @@ class SearchParams(pydantic.BaseModel):
31
309
  location: Optional[str] = None
32
310
  origin: Optional[str] = "api"
33
311
  timeout: Optional[int] = 60000
34
- scrapeOptions: Optional[Dict[str, Any]] = None
312
+ scrapeOptions: Optional[CommonOptions] = None
313
+
314
+ class SearchResponse(pydantic.BaseModel):
315
+ """Response from search operations."""
316
+ success: bool = True
317
+ data: List[FirecrawlDocument]
318
+ warning: Optional[str] = None
319
+ error: Optional[str] = None
35
320
 
36
321
  class GenerateLLMsTextParams(pydantic.BaseModel):
37
322
  """
@@ -75,6 +360,24 @@ class DeepResearchStatusResponse(pydantic.BaseModel):
75
360
  sources: List[Dict[str, Any]]
76
361
  summaries: List[str]
77
362
 
363
+ class GenerateLLMsTextResponse(pydantic.BaseModel):
364
+ """Response from LLMs.txt generation operations."""
365
+ success: bool = True
366
+ id: str
367
+ error: Optional[str] = None
368
+
369
+ class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
370
+ llmstxt: str
371
+ llmsfulltxt: Optional[str] = None
372
+
373
+ class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
374
+ """Status response from LLMs.txt generation operations."""
375
+ success: bool = True
376
+ data: Optional[GenerateLLMsTextStatusResponseData] = None
377
+ status: Literal["processing", "completed", "failed"]
378
+ error: Optional[str] = None
379
+ expiresAt: str
380
+
78
381
  class ChangeTrackingData(pydantic.BaseModel):
79
382
  """
80
383
  Data for the change tracking format.
@@ -84,42 +387,39 @@ class ChangeTrackingData(pydantic.BaseModel):
84
387
  visibility: str # "visible" | "hidden"
85
388
  diff: Optional[Dict[str, Any]] = None
86
389
  json: Optional[Any] = None
390
+
391
+ class SearchResponse(pydantic.BaseModel):
392
+ """
393
+ Response from the search operation.
394
+ """
395
+ success: bool
396
+ data: List[Dict[str, Any]]
397
+ warning: Optional[str] = None
398
+ error: Optional[str] = None
87
399
 
88
- class FirecrawlApp:
89
- class SearchResponse(pydantic.BaseModel):
90
- """
91
- Response from the search operation.
92
- """
93
- success: bool
94
- data: List[Dict[str, Any]]
95
- warning: Optional[str] = None
96
- error: Optional[str] = None
97
-
98
- class ExtractParams(pydantic.BaseModel):
99
- """
100
- Parameters for the extract operation.
101
- """
102
- prompt: Optional[str] = None
103
- schema_: Optional[Any] = pydantic.Field(None, alias='schema')
104
- system_prompt: Optional[str] = None
105
- allow_external_links: Optional[bool] = False
106
- enable_web_search: Optional[bool] = False
107
- # Just for backwards compatibility
108
- enableWebSearch: Optional[bool] = False
109
- show_sources: Optional[bool] = False
110
- agent: Optional[Dict[str, Any]] = None
111
-
112
-
113
-
114
-
115
- class ExtractResponse(pydantic.BaseModel):
116
- """
117
- Response from the extract operation.
118
- """
119
- success: bool
120
- data: Optional[Any] = None
121
- error: Optional[str] = None
400
+ class ExtractParams(pydantic.BaseModel):
401
+ """
402
+ Parameters for the extract operation.
403
+ """
404
+ prompt: Optional[str] = None
405
+ schema: Optional[Any] = pydantic.Field(None, alias='schema')
406
+ system_prompt: Optional[str] = None
407
+ allow_external_links: Optional[bool] = False
408
+ enable_web_search: Optional[bool] = False
409
+ # Just for backwards compatibility
410
+ enableWebSearch: Optional[bool] = False
411
+ show_sources: Optional[bool] = False
412
+ agent: Optional[Dict[str, Any]] = None
413
+
414
+ class ExtractResponse(pydantic.BaseModel, Generic[T]):
415
+ """
416
+ Response from the extract operation.
417
+ """
418
+ success: bool
419
+ data: Optional[T] = None
420
+ error: Optional[str] = None
122
421
 
422
+ class FirecrawlApp:
123
423
  def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
124
424
  """
125
425
  Initialize the FirecrawlApp instance with API key, API URL.
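This hunk moves SearchResponse, ExtractParams, and ExtractResponse from nested classes on FirecrawlApp to module level. A hedged sketch of the resulting import-path change; the prompt value is illustrative.

```python
# 1.17.0: these were nested, e.g. FirecrawlApp.ExtractParams(...)
# 2.0.0 (this diff): they are module-level definitions.
from firecrawl.firecrawl import ExtractParams, ExtractResponse, SearchResponse

params = ExtractParams(
    prompt="Summarize the pricing page",   # illustrative prompt
    enable_web_search=False,
)
```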
@@ -138,200 +438,451 @@ class FirecrawlApp:
138
438
 
139
439
  logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
140
440
 
141
- def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
441
+ def scrape_url(
442
+ self,
443
+ url: str,
444
+ *,
445
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
446
+ include_tags: Optional[List[str]] = None,
447
+ exclude_tags: Optional[List[str]] = None,
448
+ only_main_content: Optional[bool] = None,
449
+ wait_for: Optional[int] = None,
450
+ timeout: Optional[int] = None,
451
+ location: Optional[LocationConfig] = None,
452
+ mobile: Optional[bool] = None,
453
+ skip_tls_verification: Optional[bool] = None,
454
+ remove_base64_images: Optional[bool] = None,
455
+ block_ads: Optional[bool] = None,
456
+ proxy: Optional[Literal["basic", "stealth"]] = None,
457
+ extract: Optional[ExtractConfig] = None,
458
+ json_options: Optional[ExtractConfig] = None,
459
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
460
+ **kwargs) -> ScrapeResponse[Any]:
142
461
  """
143
- Scrape the specified URL using the Firecrawl API.
462
+ Scrape and extract content from a URL.
144
463
 
145
464
  Args:
146
- url (str): The URL to scrape.
147
- params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
465
+ url (str): Target URL to scrape
466
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
467
+ include_tags (Optional[List[str]]): HTML tags to include
468
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
469
+ only_main_content (Optional[bool]): Extract main content only
470
+ wait_for (Optional[int]): Wait for a specific element to appear
471
+ timeout (Optional[int]): Request timeout (ms)
472
+ location (Optional[LocationConfig]): Location configuration
473
+ mobile (Optional[bool]): Use mobile user agent
474
+ skip_tls_verification (Optional[bool]): Skip TLS verification
475
+ remove_base64_images (Optional[bool]): Remove base64 images
476
+ block_ads (Optional[bool]): Block ads
477
+ proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
478
+ extract (Optional[ExtractConfig]): Content extraction settings
479
+ json_options (Optional[ExtractConfig]): JSON extraction settings
480
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
481
+
148
482
 
149
483
  Returns:
150
- Any: The scraped data if the request is successful.
484
+ ScrapeResponse with:
485
+ * Requested content formats
486
+ * Page metadata
487
+ * Extraction results
488
+ * Success/error status
151
489
 
152
490
  Raises:
153
- Exception: If the scrape request fails.
491
+ Exception: If scraping fails
154
492
  """
155
-
156
493
  headers = self._prepare_headers()
157
494
 
158
- # Prepare the base scrape parameters with the URL
159
- scrape_params = {'url': url}
160
-
161
- # If there are additional params, process them
162
- if params:
163
- # Handle extract (for v1)
164
- extract = params.get('extract', {})
165
- if extract:
166
- if 'schema' in extract and hasattr(extract['schema'], 'schema'):
167
- extract['schema'] = extract['schema'].schema()
168
- scrape_params['extract'] = extract
169
-
170
- # Include any other params directly at the top level of scrape_params
171
- for key, value in params.items():
172
- if key not in ['extract']:
173
- scrape_params[key] = value
174
-
175
- json = params.get("jsonOptions", {})
176
- if json:
177
- if 'schema' in json and hasattr(json['schema'], 'schema'):
178
- json['schema'] = json['schema'].schema()
179
- scrape_params['jsonOptions'] = json
180
-
181
- change_tracking = params.get("changeTrackingOptions", {})
182
- if change_tracking:
183
- scrape_params['changeTrackingOptions'] = change_tracking
184
-
185
- # Include any other params directly at the top level of scrape_params
186
- for key, value in params.items():
187
- if key not in ['jsonOptions', 'changeTrackingOptions', 'agent']:
188
- scrape_params[key] = value
189
-
190
- agent = params.get('agent')
191
- if agent:
192
- scrape_params['agent'] = agent
193
-
495
+ # Build scrape parameters
496
+ scrape_params = {
497
+ 'url': url,
498
+ 'origin': f"python-sdk@{version}"
499
+ }
194
500
 
195
- endpoint = f'/v1/scrape'
196
- # Make the POST request with the prepared headers and JSON data
501
+ # Add optional parameters if provided
502
+ if formats:
503
+ scrape_params['formats'] = formats
504
+ if include_tags:
505
+ scrape_params['includeTags'] = include_tags
506
+ if exclude_tags:
507
+ scrape_params['excludeTags'] = exclude_tags
508
+ if only_main_content is not None:
509
+ scrape_params['onlyMainContent'] = only_main_content
510
+ if wait_for:
511
+ scrape_params['waitFor'] = wait_for
512
+ if timeout:
513
+ scrape_params['timeout'] = timeout
514
+ if location:
515
+ scrape_params['location'] = location.dict(exclude_none=True)
516
+ if mobile is not None:
517
+ scrape_params['mobile'] = mobile
518
+ if skip_tls_verification is not None:
519
+ scrape_params['skipTlsVerification'] = skip_tls_verification
520
+ if remove_base64_images is not None:
521
+ scrape_params['removeBase64Images'] = remove_base64_images
522
+ if block_ads is not None:
523
+ scrape_params['blockAds'] = block_ads
524
+ if proxy:
525
+ scrape_params['proxy'] = proxy
526
+ if extract:
527
+ if hasattr(extract.schema, 'schema'):
528
+ extract.schema = extract.schema.schema()
529
+ scrape_params['extract'] = extract.dict(exclude_none=True)
530
+ if json_options:
531
+ if hasattr(json_options.schema, 'schema'):
532
+ json_options.schema = json_options.schema.schema()
533
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
534
+ if actions:
535
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
536
+ scrape_params.update(kwargs)
537
+
538
+ # Make request
197
539
  response = requests.post(
198
- f'{self.api_url}{endpoint}',
540
+ f'{self.api_url}/v1/scrape',
199
541
  headers=headers,
200
542
  json=scrape_params,
201
- timeout=(scrape_params["timeout"] + 5000 if "timeout" in scrape_params else None),
543
+ timeout=(timeout + 5000 if timeout else None)
202
544
  )
545
+
203
546
  if response.status_code == 200:
204
547
  try:
205
- response = response.json()
206
- except:
207
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
208
- if response['success'] and 'data' in response:
209
- return response['data']
210
- elif "error" in response:
211
- raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
212
- else:
213
- raise Exception(f'Failed to scrape URL. Error: {response}')
548
+ response_json = response.json()
549
+ if response_json.get('success') and 'data' in response_json:
550
+ return ScrapeResponse(**response_json['data'])
551
+ elif "error" in response_json:
552
+ raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
553
+ else:
554
+ raise Exception(f'Failed to scrape URL. Error: {response_json}')
555
+ except ValueError:
556
+ raise Exception('Failed to parse Firecrawl response as JSON.')
214
557
  else:
215
558
  self._handle_error(response, 'scrape URL')
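A minimal usage sketch of the keyword-only scrape_url() shown above; the API key and URL are placeholders, and the printed fields come from the ScrapeResponse/FirecrawlDocument models earlier in this diff.

```python
from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")           # placeholder key

# 1.17.0: app.scrape_url(url, params={"formats": ["markdown"]}) returned a dict
# 2.0.0:  keyword-only arguments, typed ScrapeResponse return value
result = app.scrape_url(
    "https://example.com",
    formats=["markdown", "links"],
    only_main_content=True,
    timeout=30000,                                   # milliseconds
)
print(result.markdown)
print(result.metadata)
```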
216
559
 
217
- def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> Dict[str, Any]:
560
+ def search(
561
+ self,
562
+ query: str,
563
+ *,
564
+ limit: Optional[int] = None,
565
+ tbs: Optional[str] = None,
566
+ filter: Optional[str] = None,
567
+ lang: Optional[str] = None,
568
+ country: Optional[str] = None,
569
+ location: Optional[str] = None,
570
+ timeout: Optional[int] = None,
571
+ scrape_options: Optional[CommonOptions] = None,
572
+ params: Optional[Union[Dict[str, Any], SearchParams]] = None,
573
+ **kwargs) -> SearchResponse:
218
574
  """
219
- Search for content using the Firecrawl API.
575
+ Search for content using Firecrawl.
220
576
 
221
577
  Args:
222
- query (str): The search query string.
223
- params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters.
578
+ query (str): Search query string
579
+ limit (Optional[int]): Max results (default: 5)
580
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
581
+ filter (Optional[str]): Custom result filter
582
+ lang (Optional[str]): Language code (default: "en")
583
+ country (Optional[str]): Country code (default: "us")
584
+ location (Optional[str]): Geo-targeting
585
+ timeout (Optional[int]): Request timeout in milliseconds
586
+ scrape_options (Optional[CommonOptions]): Result scraping configuration
587
+ params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
588
+ **kwargs: Additional keyword arguments for future compatibility
224
589
 
225
590
  Returns:
226
- Dict[str, Any]: The search response containing success status and search results.
591
+ SearchResponse: Response containing:
592
+ * success (bool): Whether request succeeded
593
+ * data (List[FirecrawlDocument]): Search results
594
+ * warning (Optional[str]): Warning message if any
595
+ * error (Optional[str]): Error message if any
596
+
597
+ Raises:
598
+ Exception: If search fails or response cannot be parsed
227
599
  """
228
- if params is None:
229
- params = {}
600
+ # Build search parameters
601
+ search_params = {}
602
+ if params:
603
+ if isinstance(params, dict):
604
+ search_params.update(params)
605
+ else:
606
+ search_params.update(params.dict(exclude_none=True))
607
+
608
+ # Add individual parameters
609
+ if limit is not None:
610
+ search_params['limit'] = limit
611
+ if tbs is not None:
612
+ search_params['tbs'] = tbs
613
+ if filter is not None:
614
+ search_params['filter'] = filter
615
+ if lang is not None:
616
+ search_params['lang'] = lang
617
+ if country is not None:
618
+ search_params['country'] = country
619
+ if location is not None:
620
+ search_params['location'] = location
621
+ if timeout is not None:
622
+ search_params['timeout'] = timeout
623
+ if scrape_options is not None:
624
+ search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
625
+
626
+ # Add any additional kwargs
627
+ search_params.update(kwargs)
230
628
 
231
- if isinstance(params, dict):
232
- search_params = SearchParams(query=query, **params)
233
- else:
234
- search_params = params
235
- search_params.query = query
629
+ # Create final params object
630
+ final_params = SearchParams(query=query, **search_params)
631
+ params_dict = final_params.dict(exclude_none=True)
632
+ params_dict['origin'] = f"python-sdk@{version}"
236
633
 
634
+ # Make request
237
635
  response = requests.post(
238
636
  f"{self.api_url}/v1/search",
239
637
  headers={"Authorization": f"Bearer {self.api_key}"},
240
- json=search_params.dict(exclude_none=True)
638
+ json=params_dict
241
639
  )
242
640
 
243
- if response.status_code != 200:
244
- raise Exception(f"Request failed with status code {response.status_code}")
245
-
246
- try:
247
- return response.json()
248
- except:
249
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
250
-
251
- def crawl_url(self, url: str,
252
- params: Optional[Dict[str, Any]] = None,
253
- poll_interval: Optional[int] = 2,
254
- idempotency_key: Optional[str] = None) -> Any:
641
+ if response.status_code == 200:
642
+ try:
643
+ response_json = response.json()
644
+ if response_json.get('success') and 'data' in response_json:
645
+ return SearchResponse(**response_json)
646
+ elif "error" in response_json:
647
+ raise Exception(f'Search failed. Error: {response_json["error"]}')
648
+ else:
649
+ raise Exception(f'Search failed. Error: {response_json}')
650
+ except ValueError:
651
+ raise Exception('Failed to parse Firecrawl response as JSON.')
652
+ else:
653
+ self._handle_error(response, 'search')
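A hedged sketch of the reworked search() call. Note that this diff adds two module-level SearchResponse definitions (one typing data as List[FirecrawlDocument], a later one as List[Dict[str, Any]]); the later definition shadows the first, so items in .data may arrive as plain dicts.

```python
from firecrawl.firecrawl import FirecrawlApp, CommonOptions

app = FirecrawlApp(api_key="fc-YOUR-KEY")            # placeholder key
results = app.search(
    "firecrawl python sdk",                          # illustrative query
    limit=3,
    scrape_options=CommonOptions(formats=["markdown"]),
)
print(results.success, len(results.data))
for item in results.data:
    # FirecrawlDocument or plain dict, depending on which SearchResponse definition wins
    print(item)
```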
654
+
655
+ def crawl_url(
656
+ self,
657
+ url: str,
658
+ *,
659
+ include_paths: Optional[List[str]] = None,
660
+ exclude_paths: Optional[List[str]] = None,
661
+ max_depth: Optional[int] = None,
662
+ max_discovery_depth: Optional[int] = None,
663
+ limit: Optional[int] = None,
664
+ allow_backward_links: Optional[bool] = None,
665
+ allow_external_links: Optional[bool] = None,
666
+ ignore_sitemap: Optional[bool] = None,
667
+ scrape_options: Optional[CommonOptions] = None,
668
+ webhook: Optional[Union[str, WebhookConfig]] = None,
669
+ deduplicate_similar_urls: Optional[bool] = None,
670
+ ignore_query_parameters: Optional[bool] = None,
671
+ regex_on_full_url: Optional[bool] = None,
672
+ poll_interval: Optional[int] = 2,
673
+ idempotency_key: Optional[str] = None,
674
+ **kwargs
675
+ ) -> CrawlStatusResponse:
255
676
  """
256
- Initiate a crawl job for the specified URL using the Firecrawl API.
677
+ Crawl a website starting from a URL.
257
678
 
258
679
  Args:
259
- url (str): The URL to crawl.
260
- params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
261
- poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds.
262
- idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
680
+ url (str): Target URL to start crawling from
681
+ include_paths (Optional[List[str]]): Patterns of URLs to include
682
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
683
+ max_depth (Optional[int]): Maximum crawl depth
684
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
685
+ limit (Optional[int]): Maximum pages to crawl
686
+ allow_backward_links (Optional[bool]): Follow parent directory links
687
+ allow_external_links (Optional[bool]): Follow external domain links
688
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
689
+ scrape_options (Optional[CommonOptions]): Page scraping configuration
690
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
691
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
692
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
693
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
694
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
695
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
696
+ **kwargs: Additional parameters to pass to the API
263
697
 
264
698
  Returns:
265
- Dict[str, Any]: A dictionary containing the crawl results. The structure includes:
266
- - 'success' (bool): Indicates if the crawl was successful.
267
- - 'status' (str): The final status of the crawl job (e.g., 'completed').
268
- - 'completed' (int): Number of scraped pages that completed.
269
- - 'total' (int): Total number of scraped pages.
270
- - 'creditsUsed' (int): Estimated number of API credits used for this crawl.
271
- - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the crawl data expires.
272
- - 'data' (List[Dict]): List of all the scraped pages.
699
+ CrawlStatusResponse with:
700
+ * Crawling status and progress
701
+ * Crawled page contents
702
+ * Success/error information
273
703
 
274
704
  Raises:
275
- Exception: If the crawl job initiation or monitoring fails.
705
+ Exception: If crawl fails
276
706
  """
277
- endpoint = f'/v1/crawl'
707
+ crawl_params = {}
708
+
709
+ # Add individual parameters
710
+ if include_paths is not None:
711
+ crawl_params['includePaths'] = include_paths
712
+ if exclude_paths is not None:
713
+ crawl_params['excludePaths'] = exclude_paths
714
+ if max_depth is not None:
715
+ crawl_params['maxDepth'] = max_depth
716
+ if max_discovery_depth is not None:
717
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
718
+ if limit is not None:
719
+ crawl_params['limit'] = limit
720
+ if allow_backward_links is not None:
721
+ crawl_params['allowBackwardLinks'] = allow_backward_links
722
+ if allow_external_links is not None:
723
+ crawl_params['allowExternalLinks'] = allow_external_links
724
+ if ignore_sitemap is not None:
725
+ crawl_params['ignoreSitemap'] = ignore_sitemap
726
+ if scrape_options is not None:
727
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
728
+ if webhook is not None:
729
+ crawl_params['webhook'] = webhook
730
+ if deduplicate_similar_urls is not None:
731
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
732
+ if ignore_query_parameters is not None:
733
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
734
+ if regex_on_full_url is not None:
735
+ crawl_params['regexOnFullURL'] = regex_on_full_url
736
+
737
+ # Add any additional kwargs
738
+ crawl_params.update(kwargs)
739
+
740
+ # Create final params object
741
+ final_params = CrawlParams(**crawl_params)
742
+ params_dict = final_params.dict(exclude_none=True)
743
+ params_dict['url'] = url
744
+ params_dict['origin'] = f"python-sdk@{version}"
745
+
746
+ # Make request
278
747
  headers = self._prepare_headers(idempotency_key)
279
- json_data = {'url': url}
280
- if params:
281
- json_data.update(params)
282
- response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
748
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
749
+
283
750
  if response.status_code == 200:
284
751
  try:
285
752
  id = response.json().get('id')
286
753
  except:
287
754
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
288
755
  return self._monitor_job_status(id, headers, poll_interval)
289
-
290
756
  else:
291
757
  self._handle_error(response, 'start crawl job')
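A sketch of the new keyword-only crawl_url(), which polls until the job finishes and, per its annotation, returns a CrawlStatusResponse. All values are illustrative.

```python
from firecrawl.firecrawl import FirecrawlApp, CommonOptions

app = FirecrawlApp(api_key="fc-YOUR-KEY")            # placeholder key
status = app.crawl_url(
    "https://example.com",
    limit=10,
    max_depth=2,
    exclude_paths=["/blog/*"],
    scrape_options=CommonOptions(formats=["markdown"]),
    poll_interval=5,                                  # seconds between status checks
)
print(status.status, f"{status.completed}/{status.total}")
for doc in status.data:
    print(doc.metadata)
```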
292
758
 
293
-
294
- def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
759
+ def async_crawl_url(
760
+ self,
761
+ url: str,
762
+ *,
763
+ include_paths: Optional[List[str]] = None,
764
+ exclude_paths: Optional[List[str]] = None,
765
+ max_depth: Optional[int] = None,
766
+ max_discovery_depth: Optional[int] = None,
767
+ limit: Optional[int] = None,
768
+ allow_backward_links: Optional[bool] = None,
769
+ allow_external_links: Optional[bool] = None,
770
+ ignore_sitemap: Optional[bool] = None,
771
+ scrape_options: Optional[CommonOptions] = None,
772
+ webhook: Optional[Union[str, WebhookConfig]] = None,
773
+ deduplicate_similar_urls: Optional[bool] = None,
774
+ ignore_query_parameters: Optional[bool] = None,
775
+ regex_on_full_url: Optional[bool] = None,
776
+ idempotency_key: Optional[str] = None,
777
+ **kwargs
778
+ ) -> CrawlResponse:
295
779
  """
296
- Initiate a crawl job asynchronously.
780
+ Start an asynchronous crawl job.
297
781
 
298
782
  Args:
299
- url (str): The URL to crawl.
300
- params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
301
- idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
783
+ url (str): Target URL to start crawling from
784
+ include_paths (Optional[List[str]]): Patterns of URLs to include
785
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
786
+ max_depth (Optional[int]): Maximum crawl depth
787
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
788
+ limit (Optional[int]): Maximum pages to crawl
789
+ allow_backward_links (Optional[bool]): Follow parent directory links
790
+ allow_external_links (Optional[bool]): Follow external domain links
791
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
792
+ scrape_options (Optional[CommonOptions]): Page scraping configuration
793
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
794
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
795
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
796
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
797
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
798
+ **kwargs: Additional parameters to pass to the API
302
799
 
303
800
  Returns:
304
- Dict[str, Any]: A dictionary containing the crawl initiation response. The structure includes:
305
- - 'success' (bool): Indicates if the crawl initiation was successful.
306
- - 'id' (str): The unique identifier for the crawl job.
307
- - 'url' (str): The URL to check the status of the crawl job.
801
+ CrawlResponse with:
802
+ * success - Whether crawl started successfully
803
+ * id - Unique identifier for the crawl job
804
+ * url - Status check URL for the crawl
805
+ * error - Error message if start failed
806
+
807
+ Raises:
808
+ Exception: If crawl initiation fails
308
809
  """
309
- endpoint = f'/v1/crawl'
810
+ crawl_params = {}
811
+
812
+ # Add individual parameters
813
+ if include_paths is not None:
814
+ crawl_params['includePaths'] = include_paths
815
+ if exclude_paths is not None:
816
+ crawl_params['excludePaths'] = exclude_paths
817
+ if max_depth is not None:
818
+ crawl_params['maxDepth'] = max_depth
819
+ if max_discovery_depth is not None:
820
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
821
+ if limit is not None:
822
+ crawl_params['limit'] = limit
823
+ if allow_backward_links is not None:
824
+ crawl_params['allowBackwardLinks'] = allow_backward_links
825
+ if allow_external_links is not None:
826
+ crawl_params['allowExternalLinks'] = allow_external_links
827
+ if ignore_sitemap is not None:
828
+ crawl_params['ignoreSitemap'] = ignore_sitemap
829
+ if scrape_options is not None:
830
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
831
+ if webhook is not None:
832
+ crawl_params['webhook'] = webhook
833
+ if deduplicate_similar_urls is not None:
834
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
835
+ if ignore_query_parameters is not None:
836
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
837
+ if regex_on_full_url is not None:
838
+ crawl_params['regexOnFullURL'] = regex_on_full_url
839
+
840
+ # Add any additional kwargs
841
+ crawl_params.update(kwargs)
842
+
843
+ # Create final params object
844
+ final_params = CrawlParams(**crawl_params)
845
+ params_dict = final_params.dict(exclude_none=True)
846
+ params_dict['url'] = url
847
+ params_dict['origin'] = f"python-sdk@{version}"
848
+
849
+ # Make request
310
850
  headers = self._prepare_headers(idempotency_key)
311
- json_data = {'url': url}
312
- if params:
313
- json_data.update(params)
314
- response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
851
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
852
+
315
853
  if response.status_code == 200:
316
854
  try:
317
- return response.json()
855
+ return CrawlResponse(**response.json())
318
856
  except:
319
857
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
320
858
  else:
321
859
  self._handle_error(response, 'start crawl job')
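A sketch of the fire-and-poll pattern using async_crawl_url() together with check_crawl_status() (defined just below); the URL and timing are illustrative.

```python
import time
from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")            # placeholder key
job = app.async_crawl_url("https://example.com", limit=25)
print(job.id, job.url)                               # CrawlResponse: job id + status URL

while True:
    status = app.check_crawl_status(job.id)
    if status.status in ("completed", "failed", "cancelled"):
        break
    time.sleep(5)
print(status.status, len(status.data))
```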
322
860
 
323
- def check_crawl_status(self, id: str) -> Any:
861
+ def check_crawl_status(self, id: str) -> CrawlStatusResponse:
324
862
  """
325
- Check the status of a crawl job using the Firecrawl API.
863
+ Check the status and results of a crawl job.
326
864
 
327
865
  Args:
328
- id (str): The ID of the crawl job.
866
+ id: Unique identifier for the crawl job
329
867
 
330
868
  Returns:
331
- Any: The status of the crawl job.
869
+ CrawlStatusResponse containing:
870
+
871
+ Status Information:
872
+ * status - Current state (scraping/completed/failed/cancelled)
873
+ * completed - Number of pages crawled
874
+ * total - Total pages to crawl
875
+ * creditsUsed - API credits consumed
876
+ * expiresAt - Data expiration timestamp
877
+
878
+ Results:
879
+ * data - List of crawled documents
880
+ * next - URL for next page of results (if paginated)
881
+ * success - Whether status check succeeded
882
+ * error - Error message if failed
332
883
 
333
884
  Raises:
334
- Exception: If the status check request fails.
885
+ Exception: If status check fails
335
886
  """
336
887
  endpoint = f'/v1/crawl/{id}'
337
888
 
@@ -383,28 +934,37 @@ class FirecrawlApp:
383
934
  if 'next' in status_data:
384
935
  response['next'] = status_data['next']
385
936
 
386
- return {
387
- 'success': False if 'error' in status_data else True,
937
+ return CrawlStatusResponse(
938
+ success=False if 'error' in status_data else True,
388
939
  **response
389
- }
940
+ )
390
941
  else:
391
942
  self._handle_error(response, 'check crawl status')
392
943
 
393
- def check_crawl_errors(self, id: str) -> Dict[str, Any]:
944
+ def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
394
945
  """
395
946
  Returns information about crawl errors.
396
947
 
397
948
  Args:
398
- id (str): The ID of the crawl job.
949
+ id (str): The ID of the crawl job
399
950
 
400
951
  Returns:
401
- Dict[str, Any]: Information about crawl errors.
952
+ CrawlErrorsResponse containing:
953
+ * errors (List[Dict[str, str]]): List of errors with fields:
954
+ - id (str): Error ID
955
+ - timestamp (str): When the error occurred
956
+ - url (str): URL that caused the error
957
+ - error (str): Error message
958
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
959
+
960
+ Raises:
961
+ Exception: If error check fails
402
962
  """
403
963
  headers = self._prepare_headers()
404
964
  response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
405
965
  if response.status_code == 200:
406
966
  try:
407
- return response.json()
967
+ return CrawlErrorsResponse(**response.json())
408
968
  except:
409
969
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
410
970
  else:
@@ -412,13 +972,18 @@ class FirecrawlApp:
412
972
 
413
973
  def cancel_crawl(self, id: str) -> Dict[str, Any]:
414
974
  """
415
- Cancel an asynchronous crawl job using the Firecrawl API.
975
+ Cancel an asynchronous crawl job.
416
976
 
417
977
  Args:
418
- id (str): The ID of the crawl job to cancel.
978
+ id (str): The ID of the crawl job to cancel
419
979
 
420
980
  Returns:
421
- Dict[str, Any]: The response from the cancel crawl request.
981
+ Dict[str, Any] containing:
982
+ * success (bool): Whether cancellation was successful
983
+ * error (str, optional): Error message if cancellation failed
984
+
985
+ Raises:
986
+ Exception: If cancellation fails
422
987
  """
423
988
  headers = self._prepare_headers()
424
989
  response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
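A hedged sketch combining check_crawl_errors() and cancel_crawl() from this and the previous hunk; the job id is a placeholder, and cancel_crawl() still returns a plain dict per its annotation.

```python
from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")            # placeholder key
job_id = "00000000-0000-0000-0000-000000000000"      # placeholder crawl id

errors = app.check_crawl_errors(job_id)
for err in errors.errors:                            # entries: {id, timestamp, url, error}
    print(err["url"], err["error"])
print("robots.txt blocked:", errors.robotsBlocked)

cancelled = app.cancel_crawl(job_id)
print(cancelled.get("success"))
```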
@@ -430,154 +995,524 @@ class FirecrawlApp:
430
995
  else:
431
996
  self._handle_error(response, "cancel crawl job")
432
997
 
433
- def crawl_url_and_watch(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
998
+ def crawl_url_and_watch(
999
+ self,
1000
+ url: str,
1001
+ *,
1002
+ include_paths: Optional[List[str]] = None,
1003
+ exclude_paths: Optional[List[str]] = None,
1004
+ max_depth: Optional[int] = None,
1005
+ max_discovery_depth: Optional[int] = None,
1006
+ limit: Optional[int] = None,
1007
+ allow_backward_links: Optional[bool] = None,
1008
+ allow_external_links: Optional[bool] = None,
1009
+ ignore_sitemap: Optional[bool] = None,
1010
+ scrape_options: Optional[CommonOptions] = None,
1011
+ webhook: Optional[Union[str, WebhookConfig]] = None,
1012
+ deduplicate_similar_urls: Optional[bool] = None,
1013
+ ignore_query_parameters: Optional[bool] = None,
1014
+ regex_on_full_url: Optional[bool] = None,
1015
+ idempotency_key: Optional[str] = None,
1016
+ **kwargs
1017
+ ) -> 'CrawlWatcher':
434
1018
  """
435
1019
  Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
436
1020
 
437
1021
  Args:
438
- url (str): The URL to crawl.
439
- params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
440
- idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
1022
+ url (str): Target URL to start crawling from
1023
+ include_paths (Optional[List[str]]): Patterns of URLs to include
1024
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
1025
+ max_depth (Optional[int]): Maximum crawl depth
1026
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
1027
+ limit (Optional[int]): Maximum pages to crawl
1028
+ allow_backward_links (Optional[bool]): Follow parent directory links
1029
+ allow_external_links (Optional[bool]): Follow external domain links
1030
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1031
+ scrape_options (Optional[CommonOptions]): Page scraping configuration
1032
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
1033
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
1034
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
1035
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
1036
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1037
+ **kwargs: Additional parameters to pass to the API
441
1038
 
442
1039
  Returns:
443
- CrawlWatcher: An instance of CrawlWatcher to monitor the crawl job.
1040
+ CrawlWatcher: An instance to monitor the crawl job via WebSocket
1041
+
1042
+ Raises:
1043
+ Exception: If crawl job fails to start
444
1044
  """
445
- crawl_response = self.async_crawl_url(url, params, idempotency_key)
446
- if crawl_response['success'] and 'id' in crawl_response:
447
- return CrawlWatcher(crawl_response['id'], self)
1045
+ crawl_response = self.async_crawl_url(
1046
+ url,
1047
+ include_paths=include_paths,
1048
+ exclude_paths=exclude_paths,
1049
+ max_depth=max_depth,
1050
+ max_discovery_depth=max_discovery_depth,
1051
+ limit=limit,
1052
+ allow_backward_links=allow_backward_links,
1053
+ allow_external_links=allow_external_links,
1054
+ ignore_sitemap=ignore_sitemap,
1055
+ scrape_options=scrape_options,
1056
+ webhook=webhook,
1057
+ deduplicate_similar_urls=deduplicate_similar_urls,
1058
+ ignore_query_parameters=ignore_query_parameters,
1059
+ regex_on_full_url=regex_on_full_url,
1060
+ idempotency_key=idempotency_key,
1061
+ **kwargs
1062
+ )
1063
+ if crawl_response.success and crawl_response.id:
1064
+ return CrawlWatcher(crawl_response.id, self)
448
1065
  else:
449
1066
  raise Exception("Crawl job failed to start")
450
1067
 
451
- def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
1068
+ def map_url(
1069
+ self,
1070
+ url: str,
1071
+ *,
1072
+ search: Optional[str] = None,
1073
+ ignore_sitemap: Optional[bool] = None,
1074
+ include_subdomains: Optional[bool] = None,
1075
+ sitemap_only: Optional[bool] = None,
1076
+ limit: Optional[int] = None,
1077
+ timeout: Optional[int] = None,
1078
+ params: Optional[MapParams] = None) -> MapResponse:
452
1079
  """
453
- Perform a map search using the Firecrawl API.
1080
+ Map and discover links from a URL.
454
1081
 
455
1082
  Args:
456
- url (str): The URL to perform the map search on.
457
- params (Optional[Dict[str, Any]]): Additional parameters for the map search.
1083
+ url (str): Target URL to map
1084
+ search (Optional[str]): Filter pattern for URLs
1085
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1086
+ include_subdomains (Optional[bool]): Include subdomain links
1087
+ sitemap_only (Optional[bool]): Only use sitemap.xml
1088
+ limit (Optional[int]): Maximum URLs to return
1089
+ timeout (Optional[int]): Request timeout in milliseconds
1090
+ params (Optional[MapParams]): Additional mapping parameters
458
1091
 
459
1092
  Returns:
460
- List[str]: A list of URLs discovered during the map search.
461
- """
462
- endpoint = f'/v1/map'
463
- headers = self._prepare_headers()
1093
+ MapResponse: Response containing:
1094
+ * success (bool): Whether request succeeded
1095
+ * links (List[str]): Discovered URLs
1096
+ * error (Optional[str]): Error message if any
464
1097
 
465
- # Prepare the base scrape parameters with the URL
466
- json_data = {'url': url}
1098
+ Raises:
1099
+ Exception: If mapping fails or response cannot be parsed
1100
+ """
1101
+ # Build map parameters
1102
+ map_params = {}
467
1103
  if params:
468
- json_data.update(params)
469
-
470
- # Make the POST request with the prepared headers and JSON data
1104
+ map_params.update(params.dict(exclude_none=True))
1105
+
1106
+ # Add individual parameters
1107
+ if search is not None:
1108
+ map_params['search'] = search
1109
+ if ignore_sitemap is not None:
1110
+ map_params['ignoreSitemap'] = ignore_sitemap
1111
+ if include_subdomains is not None:
1112
+ map_params['includeSubdomains'] = include_subdomains
1113
+ if sitemap_only is not None:
1114
+ map_params['sitemapOnly'] = sitemap_only
1115
+ if limit is not None:
1116
+ map_params['limit'] = limit
1117
+ if timeout is not None:
1118
+ map_params['timeout'] = timeout
1119
+
1120
+ # Create final params object
1121
+ final_params = MapParams(**map_params)
1122
+ params_dict = final_params.dict(exclude_none=True)
1123
+ params_dict['url'] = url
1124
+ params_dict['origin'] = f"python-sdk@{version}"
1125
+
1126
+ # Make request
471
1127
  response = requests.post(
472
- f'{self.api_url}{endpoint}',
473
- headers=headers,
474
- json=json_data,
1128
+ f"{self.api_url}/v1/map",
1129
+ headers={"Authorization": f"Bearer {self.api_key}"},
1130
+ json=params_dict
475
1131
  )
1132
+
476
1133
  if response.status_code == 200:
477
1134
  try:
478
- response = response.json()
479
- except:
480
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
481
- if response['success'] and 'links' in response:
482
- return response
483
- elif 'error' in response:
484
- raise Exception(f'Failed to map URL. Error: {response["error"]}')
485
- else:
486
- raise Exception(f'Failed to map URL. Error: {response}')
1135
+ response_json = response.json()
1136
+ if response_json.get('success') and 'links' in response_json:
1137
+ return MapResponse(**response_json)
1138
+ elif "error" in response_json:
1139
+ raise Exception(f'Map failed. Error: {response_json["error"]}')
1140
+ else:
1141
+ raise Exception(f'Map failed. Error: {response_json}')
1142
+ except ValueError:
1143
+ raise Exception('Failed to parse Firecrawl response as JSON.')
487
1144
  else:
488
1145
  self._handle_error(response, 'map')
489
1146
 
490
- def batch_scrape_urls(self, urls: List[str],
491
- params: Optional[Dict[str, Any]] = None,
492
- poll_interval: Optional[int] = 2,
493
- idempotency_key: Optional[str] = None) -> Any:
1147
+ def batch_scrape_urls(
1148
+ self,
1149
+ urls: List[str],
1150
+ *,
1151
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1152
+ headers: Optional[Dict[str, str]] = None,
1153
+ include_tags: Optional[List[str]] = None,
1154
+ exclude_tags: Optional[List[str]] = None,
1155
+ only_main_content: Optional[bool] = None,
1156
+ wait_for: Optional[int] = None,
1157
+ timeout: Optional[int] = None,
1158
+ location: Optional[LocationConfig] = None,
1159
+ mobile: Optional[bool] = None,
1160
+ skip_tls_verification: Optional[bool] = None,
1161
+ remove_base64_images: Optional[bool] = None,
1162
+ block_ads: Optional[bool] = None,
1163
+ proxy: Optional[Literal["basic", "stealth"]] = None,
1164
+ extract: Optional[ExtractConfig] = None,
1165
+ json_options: Optional[ExtractConfig] = None,
1166
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1167
+ agent: Optional[AgentOptions] = None,
1168
+ poll_interval: Optional[int] = 2,
1169
+ idempotency_key: Optional[str] = None,
1170
+ **kwargs
1171
+ ) -> BatchScrapeStatusResponse:
494
1172
  """
495
- Initiate a batch scrape job for the specified URLs using the Firecrawl API.
1173
+ Batch scrape multiple URLs and monitor until completion.
496
1174
 
497
1175
  Args:
498
- urls (List[str]): The URLs to scrape.
499
- params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
500
- poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds.
501
- idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
1176
+ urls (List[str]): URLs to scrape
1177
+ formats (Optional[List[Literal]]): Content formats to retrieve
1178
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1179
+ include_tags (Optional[List[str]]): HTML tags to include
1180
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1181
+ only_main_content (Optional[bool]): Extract main content only
1182
+ wait_for (Optional[int]): Wait time in milliseconds
1183
+ timeout (Optional[int]): Request timeout in milliseconds
1184
+ location (Optional[LocationConfig]): Location configuration
1185
+ mobile (Optional[bool]): Use mobile user agent
1186
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1187
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1188
+ block_ads (Optional[bool]): Block advertisements
1189
+ proxy (Optional[Literal]): Proxy type to use
1190
+ extract (Optional[ExtractConfig]): Content extraction config
1191
+ json_options (Optional[ExtractConfig]): JSON extraction config
1192
+ actions (Optional[List[Union]]): Actions to perform
1193
+ agent (Optional[AgentOptions]): Agent configuration
1194
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
1195
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1196
+ **kwargs: Additional parameters to pass to the API
502
1197
 
503
1198
  Returns:
504
- Dict[str, Any]: A dictionary containing the scrape results. The structure includes:
505
- - 'success' (bool): Indicates if the batch scrape was successful.
506
- - 'status' (str): The final status of the batch scrape job (e.g., 'completed').
507
- - 'completed' (int): Number of scraped pages that completed.
508
- - 'total' (int): Total number of scraped pages.
509
- - 'creditsUsed' (int): Estimated number of API credits used for this batch scrape.
510
- - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the batch scrape data expires.
511
- - 'data' (List[Dict]): List of all the scraped pages.
1199
+ BatchScrapeStatusResponse with:
1200
+ * Scraping status and progress
1201
+ * Scraped content for each URL
1202
+ * Success/error information
512
1203
 
513
1204
  Raises:
514
- Exception: If the batch scrape job initiation or monitoring fails.
1205
+ Exception: If batch scrape fails
515
1206
  """
516
- endpoint = f'/v1/batch/scrape'
1207
+ scrape_params = {}
1208
+
1209
+ # Add individual parameters
1210
+ if formats is not None:
1211
+ scrape_params['formats'] = formats
1212
+ if headers is not None:
1213
+ scrape_params['headers'] = headers
1214
+ if include_tags is not None:
1215
+ scrape_params['includeTags'] = include_tags
1216
+ if exclude_tags is not None:
1217
+ scrape_params['excludeTags'] = exclude_tags
1218
+ if only_main_content is not None:
1219
+ scrape_params['onlyMainContent'] = only_main_content
1220
+ if wait_for is not None:
1221
+ scrape_params['waitFor'] = wait_for
1222
+ if timeout is not None:
1223
+ scrape_params['timeout'] = timeout
1224
+ if location is not None:
1225
+ scrape_params['location'] = location.dict(exclude_none=True)
1226
+ if mobile is not None:
1227
+ scrape_params['mobile'] = mobile
1228
+ if skip_tls_verification is not None:
1229
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1230
+ if remove_base64_images is not None:
1231
+ scrape_params['removeBase64Images'] = remove_base64_images
1232
+ if block_ads is not None:
1233
+ scrape_params['blockAds'] = block_ads
1234
+ if proxy is not None:
1235
+ scrape_params['proxy'] = proxy
1236
+ if extract is not None:
1237
+ if hasattr(extract.schema, 'schema'):
1238
+ extract.schema = extract.schema.schema()
1239
+ scrape_params['extract'] = extract.dict(exclude_none=True)
1240
+ if json_options is not None:
1241
+ if hasattr(json_options.schema, 'schema'):
1242
+ json_options.schema = json_options.schema.schema()
1243
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
1244
+ if actions is not None:
1245
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1246
+ if agent is not None:
1247
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1248
+
1249
+ # Add any additional kwargs
1250
+ scrape_params.update(kwargs)
1251
+
1252
+ # Create final params object
1253
+ final_params = ScrapeParams(**scrape_params)
1254
+ params_dict = final_params.dict(exclude_none=True)
1255
+ params_dict['urls'] = urls
1256
+ params_dict['origin'] = f"python-sdk@{version}"
1257
+
1258
+ # Make request
517
1259
  headers = self._prepare_headers(idempotency_key)
518
- json_data = {'urls': urls}
519
- if params:
520
- json_data.update(params)
521
- response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
1260
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1261
+
522
1262
  if response.status_code == 200:
523
1263
  try:
524
1264
  id = response.json().get('id')
525
1265
  except:
526
1266
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
527
1267
  return self._monitor_job_status(id, headers, poll_interval)
528
-
529
1268
  else:
530
1269
  self._handle_error(response, 'start batch scrape job')
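As a quick orientation for the new keyword-only signature, here is a minimal usage sketch of `batch_scrape_urls`. The API key, URLs, and printed fields are illustrative placeholders; it assumes the client is constructed with `FirecrawlApp(api_key=...)` and that the returned status object exposes the documented `status`, `completed`, `total`, and `data` fields.

```python
import os

from firecrawl import FirecrawlApp

# Placeholder setup: the key and URLs are illustrative, not part of the diff.
app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])

job = app.batch_scrape_urls(
    ["https://example.com", "https://example.org"],
    formats=["markdown", "links"],
    only_main_content=True,
    poll_interval=2,  # seconds between status checks while the SDK polls
)

# Fields documented for BatchScrapeStatusResponse above.
print(job.status, f"{job.completed}/{job.total} pages")
print(len(job.data or []), "documents returned")
```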
531
1270
 
532
-
533
- def async_batch_scrape_urls(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
1271
+ def async_batch_scrape_urls(
1272
+ self,
1273
+ urls: List[str],
1274
+ *,
1275
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1276
+ headers: Optional[Dict[str, str]] = None,
1277
+ include_tags: Optional[List[str]] = None,
1278
+ exclude_tags: Optional[List[str]] = None,
1279
+ only_main_content: Optional[bool] = None,
1280
+ wait_for: Optional[int] = None,
1281
+ timeout: Optional[int] = None,
1282
+ location: Optional[LocationConfig] = None,
1283
+ mobile: Optional[bool] = None,
1284
+ skip_tls_verification: Optional[bool] = None,
1285
+ remove_base64_images: Optional[bool] = None,
1286
+ block_ads: Optional[bool] = None,
1287
+ proxy: Optional[Literal["basic", "stealth"]] = None,
1288
+ extract: Optional[ExtractConfig] = None,
1289
+ json_options: Optional[ExtractConfig] = None,
1290
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1291
+ agent: Optional[AgentOptions] = None,
1292
+ idempotency_key: Optional[str] = None,
1293
+ **kwargs
1294
+ ) -> BatchScrapeResponse:
534
1295
  """
535
- Initiate a crawl job asynchronously.
1296
+ Initiate a batch scrape job asynchronously.
536
1297
 
537
1298
  Args:
538
- urls (List[str]): The URLs to scrape.
539
- params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
540
- idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
1299
+ urls (List[str]): URLs to scrape
1300
+ formats (Optional[List[Literal]]): Content formats to retrieve
1301
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1302
+ include_tags (Optional[List[str]]): HTML tags to include
1303
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1304
+ only_main_content (Optional[bool]): Extract main content only
1305
+ wait_for (Optional[int]): Wait time in milliseconds
1306
+ timeout (Optional[int]): Request timeout in milliseconds
1307
+ location (Optional[LocationConfig]): Location configuration
1308
+ mobile (Optional[bool]): Use mobile user agent
1309
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1310
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1311
+ block_ads (Optional[bool]): Block advertisements
1312
+ proxy (Optional[Literal]): Proxy type to use
1313
+ extract (Optional[ExtractConfig]): Content extraction config
1314
+ json_options (Optional[ExtractConfig]): JSON extraction config
1315
+ actions (Optional[List[Union]]): Actions to perform
1316
+ agent (Optional[AgentOptions]): Agent configuration
1317
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1318
+ **kwargs: Additional parameters to pass to the API
541
1319
 
542
1320
  Returns:
543
- Dict[str, Any]: A dictionary containing the batch scrape initiation response. The structure includes:
544
- - 'success' (bool): Indicates if the batch scrape initiation was successful.
545
- - 'id' (str): The unique identifier for the batch scrape job.
546
- - 'url' (str): The URL to check the status of the batch scrape job.
1321
+ BatchScrapeResponse with:
1322
+ * success - Whether job started successfully
1323
+ * id - Unique identifier for the job
1324
+ * url - Status check URL
1325
+ * error - Error message if start failed
1326
+
1327
+ Raises:
1328
+ Exception: If job initiation fails
547
1329
  """
548
- endpoint = f'/v1/batch/scrape'
1330
+ scrape_params = {}
1331
+
1332
+ # Add individual parameters
1333
+ if formats is not None:
1334
+ scrape_params['formats'] = formats
1335
+ if headers is not None:
1336
+ scrape_params['headers'] = headers
1337
+ if include_tags is not None:
1338
+ scrape_params['includeTags'] = include_tags
1339
+ if exclude_tags is not None:
1340
+ scrape_params['excludeTags'] = exclude_tags
1341
+ if only_main_content is not None:
1342
+ scrape_params['onlyMainContent'] = only_main_content
1343
+ if wait_for is not None:
1344
+ scrape_params['waitFor'] = wait_for
1345
+ if timeout is not None:
1346
+ scrape_params['timeout'] = timeout
1347
+ if location is not None:
1348
+ scrape_params['location'] = location.dict(exclude_none=True)
1349
+ if mobile is not None:
1350
+ scrape_params['mobile'] = mobile
1351
+ if skip_tls_verification is not None:
1352
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1353
+ if remove_base64_images is not None:
1354
+ scrape_params['removeBase64Images'] = remove_base64_images
1355
+ if block_ads is not None:
1356
+ scrape_params['blockAds'] = block_ads
1357
+ if proxy is not None:
1358
+ scrape_params['proxy'] = proxy
1359
+ if extract is not None:
1360
+ if hasattr(extract.schema, 'schema'):
1361
+ extract.schema = extract.schema.schema()
1362
+ scrape_params['extract'] = extract.dict(exclude_none=True)
1363
+ if json_options is not None:
1364
+ if hasattr(json_options.schema, 'schema'):
1365
+ json_options.schema = json_options.schema.schema()
1366
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
1367
+ if actions is not None:
1368
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1369
+ if agent is not None:
1370
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1371
+
1372
+ # Add any additional kwargs
1373
+ scrape_params.update(kwargs)
1374
+
1375
+ # Create final params object
1376
+ final_params = ScrapeParams(**scrape_params)
1377
+ params_dict = final_params.dict(exclude_none=True)
1378
+ params_dict['urls'] = urls
1379
+ params_dict['origin'] = f"python-sdk@{version}"
1380
+
1381
+ # Make request
549
1382
  headers = self._prepare_headers(idempotency_key)
550
- json_data = {'urls': urls}
551
- if params:
552
- json_data.update(params)
553
- response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
1383
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1384
+
554
1385
  if response.status_code == 200:
555
1386
  try:
556
- return response.json()
1387
+ return BatchScrapeResponse(**response.json())
557
1388
  except:
558
1389
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
559
1390
  else:
560
1391
  self._handle_error(response, 'start batch scrape job')
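A hedged sketch of the fire-and-forget variant: start the job with `async_batch_scrape_urls` and poll `check_batch_scrape_status` yourself. The key, URLs, and polling cadence are placeholders.

```python
import os
import time

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])  # placeholder key

# Returns immediately with a BatchScrapeResponse carrying the job id.
started = app.async_batch_scrape_urls(
    ["https://example.com/a", "https://example.com/b"],
    formats=["markdown"],
)

# Poll manually instead of using the blocking batch_scrape_urls variant.
for _ in range(30):  # give up after roughly a minute in this sketch
    status = app.check_batch_scrape_status(started.id)
    if status.status == "completed":
        print(status.completed, "of", status.total, "pages scraped")
        break
    time.sleep(2)
```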
561
1392
 
562
- def batch_scrape_urls_and_watch(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
1393
+ def batch_scrape_urls_and_watch(
1394
+ self,
1395
+ urls: List[str],
1396
+ *,
1397
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1398
+ headers: Optional[Dict[str, str]] = None,
1399
+ include_tags: Optional[List[str]] = None,
1400
+ exclude_tags: Optional[List[str]] = None,
1401
+ only_main_content: Optional[bool] = None,
1402
+ wait_for: Optional[int] = None,
1403
+ timeout: Optional[int] = None,
1404
+ location: Optional[LocationConfig] = None,
1405
+ mobile: Optional[bool] = None,
1406
+ skip_tls_verification: Optional[bool] = None,
1407
+ remove_base64_images: Optional[bool] = None,
1408
+ block_ads: Optional[bool] = None,
1409
+ proxy: Optional[Literal["basic", "stealth"]] = None,
1410
+ extract: Optional[ExtractConfig] = None,
1411
+ json_options: Optional[ExtractConfig] = None,
1412
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1413
+ agent: Optional[AgentOptions] = None,
1414
+ idempotency_key: Optional[str] = None,
1415
+ **kwargs
1416
+ ) -> 'CrawlWatcher':
563
1417
  """
564
1418
  Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
565
1419
 
566
1420
  Args:
567
- urls (List[str]): The URLs to scrape.
568
- params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
569
- idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
1421
+ urls (List[str]): URLs to scrape
1422
+ formats (Optional[List[Literal]]): Content formats to retrieve
1423
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1424
+ include_tags (Optional[List[str]]): HTML tags to include
1425
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1426
+ only_main_content (Optional[bool]): Extract main content only
1427
+ wait_for (Optional[int]): Wait time in milliseconds
1428
+ timeout (Optional[int]): Request timeout in milliseconds
1429
+ location (Optional[LocationConfig]): Location configuration
1430
+ mobile (Optional[bool]): Use mobile user agent
1431
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1432
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1433
+ block_ads (Optional[bool]): Block advertisements
1434
+ proxy (Optional[Literal]): Proxy type to use
1435
+ extract (Optional[ExtractConfig]): Content extraction config
1436
+ json_options (Optional[ExtractConfig]): JSON extraction config
1437
+ actions (Optional[List[Union]]): Actions to perform
1438
+ agent (Optional[AgentOptions]): Agent configuration
1439
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1440
+ **kwargs: Additional parameters to pass to the API
570
1441
 
571
1442
  Returns:
572
- CrawlWatcher: An instance of CrawlWatcher to monitor the batch scrape job.
1443
+ CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
1444
+
1445
+ Raises:
1446
+ Exception: If batch scrape job fails to start
573
1447
  """
574
- crawl_response = self.async_batch_scrape_urls(urls, params, idempotency_key)
575
- if crawl_response['success'] and 'id' in crawl_response:
576
- return CrawlWatcher(crawl_response['id'], self)
1448
+ scrape_params = {}
1449
+
1450
+ # Add individual parameters
1451
+ if formats is not None:
1452
+ scrape_params['formats'] = formats
1453
+ if headers is not None:
1454
+ scrape_params['headers'] = headers
1455
+ if include_tags is not None:
1456
+ scrape_params['includeTags'] = include_tags
1457
+ if exclude_tags is not None:
1458
+ scrape_params['excludeTags'] = exclude_tags
1459
+ if only_main_content is not None:
1460
+ scrape_params['onlyMainContent'] = only_main_content
1461
+ if wait_for is not None:
1462
+ scrape_params['waitFor'] = wait_for
1463
+ if timeout is not None:
1464
+ scrape_params['timeout'] = timeout
1465
+ if location is not None:
1466
+ scrape_params['location'] = location.dict(exclude_none=True)
1467
+ if mobile is not None:
1468
+ scrape_params['mobile'] = mobile
1469
+ if skip_tls_verification is not None:
1470
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1471
+ if remove_base64_images is not None:
1472
+ scrape_params['removeBase64Images'] = remove_base64_images
1473
+ if block_ads is not None:
1474
+ scrape_params['blockAds'] = block_ads
1475
+ if proxy is not None:
1476
+ scrape_params['proxy'] = proxy
1477
+ if extract is not None:
1478
+ if hasattr(extract.schema, 'schema'):
1479
+ extract.schema = extract.schema.schema()
1480
+ scrape_params['extract'] = extract.dict(exclude_none=True)
1481
+ if json_options is not None:
1482
+ if hasattr(json_options.schema, 'schema'):
1483
+ json_options.schema = json_options.schema.schema()
1484
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
1485
+ if actions is not None:
1486
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1487
+ if agent is not None:
1488
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1489
+
1490
+ # Add any additional kwargs
1491
+ scrape_params.update(kwargs)
1492
+
1493
+ # Create final params object
1494
+ final_params = ScrapeParams(**scrape_params)
1495
+ params_dict = final_params.dict(exclude_none=True)
1496
+ params_dict['urls'] = urls
1497
+ params_dict['origin'] = f"python-sdk@{version}"
1498
+
1499
+ # Make request
1500
+ headers = self._prepare_headers(idempotency_key)
1501
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1502
+
1503
+ if response.status_code == 200:
1504
+ try:
1505
+ crawl_response = BatchScrapeResponse(**response.json())
+ except:
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
+ if crawl_response.success and crawl_response.id:
+ return CrawlWatcher(crawl_response.id, self)
+ else:
+ raise Exception("Batch scrape job failed to start")
577
1512
  else:
578
- raise Exception("Batch scrape job failed to start")
1513
+ self._handle_error(response, 'start batch scrape job')
579
1514
 
580
- def check_batch_scrape_status(self, id: str) -> Any:
1515
+ def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
581
1516
  """
582
1517
  Check the status of a batch scrape job using the Firecrawl API.
583
1518
 
@@ -585,7 +1520,7 @@ class FirecrawlApp:
585
1520
  id (str): The ID of the batch scrape job.
586
1521
 
587
1522
  Returns:
588
- Any: The status of the batch scrape job.
1523
+ BatchScrapeStatusResponse: The status of the batch scrape job.
589
1524
 
590
1525
  Raises:
591
1526
  Exception: If the status check request fails.
@@ -625,29 +1560,21 @@ class FirecrawlApp:
625
1560
  break
626
1561
  status_data['data'] = data
627
1562
 
628
- response = {
1563
+ return BatchScrapeStatusResponse(**{
1564
+ 'success': False if 'error' in status_data else True,
629
1565
  'status': status_data.get('status'),
630
1566
  'total': status_data.get('total'),
631
1567
  'completed': status_data.get('completed'),
632
1568
  'creditsUsed': status_data.get('creditsUsed'),
633
1569
  'expiresAt': status_data.get('expiresAt'),
634
- 'data': status_data.get('data')
635
- }
636
-
637
- if 'error' in status_data:
638
- response['error'] = status_data['error']
639
-
640
- if 'next' in status_data:
641
- response['next'] = status_data['next']
642
-
643
- return {
644
- 'success': False if 'error' in status_data else True,
645
- **response
646
- }
1570
+ 'data': status_data.get('data'),
1571
+ 'next': status_data.get('next'),
1572
+ 'error': status_data.get('error')
1573
+ })
647
1574
  else:
648
1575
  self._handle_error(response, 'check batch scrape status')
649
1576
 
650
- def check_batch_scrape_errors(self, id: str) -> Dict[str, Any]:
1577
+ def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
651
1578
  """
652
1579
  Returns information about batch scrape errors.
653
1580
 
@@ -655,38 +1582,68 @@ class FirecrawlApp:
655
1582
  id (str): The ID of the crawl job.
656
1583
 
657
1584
  Returns:
658
- Dict[str, Any]: Information about crawl errors.
1585
+ CrawlErrorsResponse: A response containing:
1586
+ * errors (List[Dict[str, str]]): List of errors with fields:
1587
+ * id (str): Error ID
1588
+ * timestamp (str): When the error occurred
1589
+ * url (str): URL that caused the error
1590
+ * error (str): Error message
1591
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1592
+
1593
+ Raises:
1594
+ Exception: If the error check request fails
659
1595
  """
660
1596
  headers = self._prepare_headers()
661
1597
  response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
662
1598
  if response.status_code == 200:
663
1599
  try:
664
- return response.json()
1600
+ return CrawlErrorsResponse(**response.json())
665
1601
  except:
666
1602
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
667
1603
  else:
668
1604
  self._handle_error(response, "check batch scrape errors")
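A small sketch combining the two inspection helpers for an already-started batch job; the job id is a placeholder, and the per-error fields follow the `CrawlErrorsResponse` docstring above.

```python
import os

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])  # placeholder key
job_id = "00000000-0000-0000-0000-000000000000"              # placeholder job id

print("status:", app.check_batch_scrape_status(job_id).status)

# Per the CrawlErrorsResponse docstring: each entry has id/timestamp/url/error.
errors = app.check_batch_scrape_errors(job_id)
for err in errors.errors:
    print(err.get("url"), "->", err.get("error"))
```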
669
1605
 
670
- def extract(self, urls: Optional[List[str]] = None, params: Optional[ExtractParams] = None) -> Any:
1606
+ def extract(
1607
+ self,
1608
+ urls: Optional[List[str]] = None,
1609
+ *,
1610
+ prompt: Optional[str] = None,
1611
+ schema: Optional[Any] = None,
1612
+ system_prompt: Optional[str] = None,
1613
+ allow_external_links: Optional[bool] = False,
1614
+ enable_web_search: Optional[bool] = False,
1615
+ show_sources: Optional[bool] = False,
1616
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
671
1617
  """
672
- Extracts information from a URL using the Firecrawl API.
1618
+ Extract structured information from URLs.
673
1619
 
674
1620
  Args:
675
- urls (Optional[List[str]]): The URLs to extract information from.
676
- params (Optional[ExtractParams]): Additional parameters for the extract request.
1621
+ urls (Optional[List[str]]): URLs to extract from
1622
+ prompt (Optional[str]): Custom extraction prompt
1623
+ schema (Optional[Any]): JSON schema/Pydantic model
1624
+ system_prompt (Optional[str]): System context
1625
+ allow_external_links (Optional[bool]): Follow external links
1626
+ enable_web_search (Optional[bool]): Enable web search
1627
+ show_sources (Optional[bool]): Include source URLs
1628
+ agent (Optional[Dict[str, Any]]): Agent configuration
677
1629
 
678
1630
  Returns:
679
- Union[ExtractResponse, ErrorResponse]: The response from the extract operation.
1631
+ ExtractResponse[Any] with:
1632
+ * success (bool): Whether request succeeded
1633
+ * data (Optional[Any]): Extracted data matching schema
1634
+ * error (Optional[str]): Error message if any
1635
+
1636
+ Raises:
1637
+ ValueError: If prompt/schema missing or extraction fails
680
1638
  """
681
1639
  headers = self._prepare_headers()
682
1640
 
683
- if not params or (not params.get('prompt') and not params.get('schema')):
1641
+ if not prompt and not schema:
684
1642
  raise ValueError("Either prompt or schema is required")
685
1643
 
686
- if not urls and not params.get('prompt'):
1644
+ if not urls and not prompt:
687
1645
  raise ValueError("Either urls or prompt is required")
688
1646
 
689
- schema = params.get('schema')
690
1647
  if schema:
691
1648
  if hasattr(schema, 'model_json_schema'):
692
1649
  # Convert Pydantic model to JSON schema
@@ -694,26 +1651,22 @@ class FirecrawlApp:
694
1651
  # Otherwise assume it's already a JSON schema dict
695
1652
 
696
1653
  request_data = {
697
- 'urls': urls,
698
- 'allowExternalLinks': params.get('allow_external_links', params.get('allowExternalLinks', False)),
699
- 'enableWebSearch': params.get('enable_web_search', params.get('enableWebSearch', False)),
700
- 'showSources': params.get('show_sources', params.get('showSources', False)),
1654
+ 'urls': urls or [],
1655
+ 'allowExternalLinks': allow_external_links,
1656
+ 'enableWebSearch': enable_web_search,
1657
+ 'showSources': show_sources,
701
1658
  'schema': schema,
702
- 'origin': 'api-sdk'
1659
+ 'origin': f'python-sdk@{get_version()}'
703
1660
  }
704
1661
 
705
- if not request_data['urls']:
706
- request_data['urls'] = []
707
1662
  # Only add prompt and systemPrompt if they exist
708
- if params.get('prompt'):
709
- request_data['prompt'] = params['prompt']
710
- if params.get('system_prompt'):
711
- request_data['systemPrompt'] = params['system_prompt']
712
- elif params.get('systemPrompt'): # Check legacy field name
713
- request_data['systemPrompt'] = params['systemPrompt']
1663
+ if prompt:
1664
+ request_data['prompt'] = prompt
1665
+ if system_prompt:
1666
+ request_data['systemPrompt'] = system_prompt
714
1667
 
715
- if params.get('agent'):
716
- request_data['agent'] = params['agent']
1668
+ if agent:
1669
+ request_data['agent'] = agent
717
1670
 
718
1671
  try:
719
1672
  # Send the initial extract request
@@ -744,10 +1697,7 @@ class FirecrawlApp:
744
1697
  except:
745
1698
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
746
1699
  if status_data['status'] == 'completed':
747
- if status_data['success']:
748
- return status_data
749
- else:
750
- raise Exception(f'Failed to extract. Error: {status_data["error"]}')
1700
+ return ExtractResponse(**status_data)
751
1701
  elif status_data['status'] in ['failed', 'cancelled']:
752
1702
  raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
753
1703
  else:
@@ -761,9 +1711,9 @@ class FirecrawlApp:
761
1711
  except Exception as e:
762
1712
  raise ValueError(str(e), 500)
763
1713
 
764
- return {'success': False, 'error': "Internal server error."}
1714
+ return ExtractResponse(success=False, error="Internal server error.")
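A minimal sketch of the new keyword-only `extract` signature with a Pydantic schema; the schema, URL, and API key are illustrative and not part of the SDK.

```python
import os

from pydantic import BaseModel

from firecrawl import FirecrawlApp


class PricingInfo(BaseModel):  # illustrative schema, not part of the SDK
    plan: str
    monthly_price: float


app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])  # placeholder key

# Either prompt or schema is required; a Pydantic model is converted with
# model_json_schema() inside extract(), as shown above.
result = app.extract(
    ["https://example.com/pricing"],
    prompt="Extract the cheapest paid plan and its monthly price.",
    schema=PricingInfo,
)
print(result.data if result.success else f"extract failed: {result.error}")
```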
765
1715
 
766
- def get_extract_status(self, job_id: str) -> Dict[str, Any]:
1716
+ def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
767
1717
  """
768
1718
  Retrieve the status of an extract job.
769
1719
 
@@ -771,7 +1721,7 @@ class FirecrawlApp:
771
1721
  job_id (str): The ID of the extract job.
772
1722
 
773
1723
  Returns:
774
- Dict[str, Any]: The status of the extract job.
1724
+ ExtractResponse[Any]: The status of the extract job.
775
1725
 
776
1726
  Raises:
777
1727
  ValueError: If there is an error retrieving the status.
@@ -781,7 +1731,7 @@ class FirecrawlApp:
781
1731
  response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
782
1732
  if response.status_code == 200:
783
1733
  try:
784
- return response.json()
1734
+ return ExtractResponse(**response.json())
785
1735
  except:
786
1736
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
787
1737
  else:
@@ -789,43 +1739,71 @@ class FirecrawlApp:
789
1739
  except Exception as e:
790
1740
  raise ValueError(str(e), 500)
791
1741
 
792
- def async_extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
1742
+ def async_extract(
1743
+ self,
1744
+ urls: List[str],
1745
+ *,
1746
+ prompt: Optional[str] = None,
1747
+ schema: Optional[Any] = None,
1748
+ system_prompt: Optional[str] = None,
1749
+ allow_external_links: Optional[bool] = False,
1750
+ enable_web_search: Optional[bool] = False,
1751
+ show_sources: Optional[bool] = False,
1752
+ agent: Optional[Dict[str, Any]] = None,
1753
+ idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
793
1754
  """
794
1755
  Initiate an asynchronous extract job.
795
1756
 
796
1757
  Args:
797
- urls (List[str]): The URLs to extract data from.
798
- params (Optional[Dict[str, Any]]): Additional parameters for the extract request.
799
- idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
1758
+ urls (List[str]): URLs to extract information from
1759
+ prompt (Optional[str]): Custom extraction prompt
1760
+ schema (Optional[Any]): JSON schema/Pydantic model
1761
+ system_prompt (Optional[str]): System context
1762
+ allow_external_links (Optional[bool]): Follow external links
1763
+ enable_web_search (Optional[bool]): Enable web search
1764
+ show_sources (Optional[bool]): Include source URLs
1765
+ agent (Optional[Dict[str, Any]]): Agent configuration
1766
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
800
1767
 
801
1768
  Returns:
802
- Dict[str, Any]: The response from the extract operation.
1769
+ ExtractResponse[Any] with:
1770
+ * success (bool): Whether request succeeded
1771
+ * data (Optional[Any]): Extracted data matching schema
1772
+ * error (Optional[str]): Error message if any
803
1773
 
804
1774
  Raises:
805
- ValueError: If there is an error initiating the extract job.
1775
+ ValueError: If job initiation fails
806
1776
  """
807
1777
  headers = self._prepare_headers(idempotency_key)
808
1778
 
809
- schema = params.get('schema') if params else None
1779
+ schema = schema
810
1780
  if schema:
811
1781
  if hasattr(schema, 'model_json_schema'):
812
1782
  # Convert Pydantic model to JSON schema
813
1783
  schema = schema.model_json_schema()
814
1784
  # Otherwise assume it's already a JSON schema dict
815
1785
 
816
- jsonData = {'urls': urls, **(params or {})}
817
1786
  request_data = {
818
- **jsonData,
819
- 'allowExternalLinks': params.get('allow_external_links', False) if params else False,
1787
+ 'urls': urls,
1788
+ 'allowExternalLinks': allow_external_links,
1789
+ 'enableWebSearch': enable_web_search,
1790
+ 'showSources': show_sources,
820
1791
  'schema': schema,
821
- 'origin': 'api-sdk'
1792
+ 'origin': f'python-sdk@{version}'
822
1793
  }
823
1794
 
1795
+ if prompt:
1796
+ request_data['prompt'] = prompt
1797
+ if system_prompt:
1798
+ request_data['systemPrompt'] = system_prompt
1799
+ if agent:
1800
+ request_data['agent'] = agent
1801
+
824
1802
  try:
825
1803
  response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
826
1804
  if response.status_code == 200:
827
1805
  try:
828
- return response.json()
1806
+ return ExtractResponse(**response.json())
829
1807
  except:
830
1808
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
831
1809
  else:
@@ -833,34 +1811,44 @@ class FirecrawlApp:
833
1811
  except Exception as e:
834
1812
  raise ValueError(str(e), 500)
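A sketch of pairing `async_extract` with `get_extract_status`; since the exact field layout of `ExtractResponse` is defined elsewhere in the module, the example reads the job id and status defensively.

```python
import os

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])  # placeholder key

started = app.async_extract(
    ["https://example.com/team"],
    prompt="List the names and roles of the people on this page.",
)

# The initiation response carries the job id; read it defensively since the
# ExtractResponse model is defined elsewhere in this module.
job_id = getattr(started, "id", None)
if job_id:
    status = app.get_extract_status(job_id)
    print("extract job:", getattr(status, "status", status))
```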
835
1813
 
836
- def generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> Dict[str, Any]:
1814
+ def generate_llms_text(
1815
+ self,
1816
+ url: str,
1817
+ *,
1818
+ max_urls: Optional[int] = None,
1819
+ show_full_text: Optional[bool] = None,
1820
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
837
1821
  """
838
1822
  Generate LLMs.txt for a given URL and poll until completion.
839
1823
 
840
1824
  Args:
841
- url (str): The URL to generate LLMs.txt from.
842
- params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Parameters for the LLMs.txt generation.
1825
+ url (str): Target URL to generate LLMs.txt from
1826
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
1827
+ show_full_text (Optional[bool]): Include full text in output (default: False)
1828
+ experimental_stream (Optional[bool]): Enable experimental streaming
843
1829
 
844
1830
  Returns:
845
- Dict[str, Any]: A dictionary containing the generation results. The structure includes:
846
- - 'success' (bool): Indicates if the generation was successful.
847
- - 'status' (str): The final status of the generation job.
848
- - 'data' (Dict): The generated LLMs.txt data.
849
- - 'error' (Optional[str]): Error message if the generation failed.
850
- - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the data expires.
1831
+ GenerateLLMsTextStatusResponse with:
1832
+ * Generated LLMs.txt content
1833
+ * Full version if requested
1834
+ * Generation status
1835
+ * Success/error information
851
1836
 
852
1837
  Raises:
853
- Exception: If the generation job fails or an error occurs during status checks.
1838
+ Exception: If generation fails
854
1839
  """
855
- if params is None:
856
- params = {}
857
-
858
- if isinstance(params, dict):
859
- generation_params = GenerateLLMsTextParams(**params)
860
- else:
861
- generation_params = params
1840
+ params = GenerateLLMsTextParams(
1841
+ maxUrls=max_urls,
1842
+ showFullText=show_full_text,
1843
+ __experimental_stream=experimental_stream
1844
+ )
862
1845
 
863
- response = self.async_generate_llms_text(url, generation_params)
1846
+ response = self.async_generate_llms_text(
1847
+ url,
1848
+ max_urls=max_urls,
1849
+ show_full_text=show_full_text,
1850
+ experimental_stream=experimental_stream
1851
+ )
864
1852
  if not response.get('success') or 'id' not in response:
865
1853
  return response
866
1854
 
@@ -879,32 +1867,40 @@ class FirecrawlApp:
879
1867
 
880
1868
  return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'}
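A minimal sketch of the blocking `generate_llms_text` call with the new keyword-only options; the URL and key are placeholders, and the method polls the job internally until it finishes.

```python
import os

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])  # placeholder key

# Blocks until generation finishes; the SDK polls the job every 2 seconds.
result = app.generate_llms_text(
    "https://example.com",
    max_urls=5,           # cap the number of pages sampled
    show_full_text=True,  # also request the llmsfulltxt variant
)
print(result)
```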
881
1869
 
882
- def async_generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> Dict[str, Any]:
1870
+ def async_generate_llms_text(
1871
+ self,
1872
+ url: str,
1873
+ *,
1874
+ max_urls: Optional[int] = None,
1875
+ show_full_text: Optional[bool] = None,
1876
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
883
1877
  """
884
1878
  Initiate an asynchronous LLMs.txt generation operation.
885
1879
 
886
1880
  Args:
887
- url (str): The URL to generate LLMs.txt from.
888
- params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Parameters for the LLMs.txt generation.
1881
+ url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
1882
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
1883
+ show_full_text (Optional[bool]): Include full text in output (default: False)
1884
+ experimental_stream (Optional[bool]): Enable experimental streaming
889
1885
 
890
1886
  Returns:
891
- Dict[str, Any]: A dictionary containing the generation initiation response. The structure includes:
892
- - 'success' (bool): Indicates if the generation initiation was successful.
893
- - 'id' (str): The unique identifier for the generation job.
1887
+ GenerateLLMsTextResponse: A response containing:
1888
+ * success (bool): Whether the generation initiation was successful
1889
+ * id (str): The unique identifier for the generation job
1890
+ * error (str, optional): Error message if initiation failed
894
1891
 
895
1892
  Raises:
896
1893
  Exception: If the generation job initiation fails.
897
1894
  """
898
- if params is None:
899
- params = {}
900
-
901
- if isinstance(params, dict):
902
- generation_params = GenerateLLMsTextParams(**params)
903
- else:
904
- generation_params = params
1895
+ params = GenerateLLMsTextParams(
1896
+ maxUrls=max_urls,
1897
+ showFullText=show_full_text,
1898
+ __experimental_stream=experimental_stream
1899
+ )
905
1900
 
906
1901
  headers = self._prepare_headers()
907
- json_data = {'url': url, **generation_params.dict(exclude_none=True)}
1902
+ json_data = {'url': url, **params.dict(exclude_none=True)}
1903
+ json_data['origin'] = f"python-sdk@{version}"
908
1904
 
909
1905
  try:
910
1906
  response = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
@@ -920,15 +1916,22 @@ class FirecrawlApp:
920
1916
 
921
1917
  return {'success': False, 'error': 'Internal server error'}
922
1918
 
923
- def check_generate_llms_text_status(self, id: str) -> Dict[str, Any]:
1919
+ def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
924
1920
  """
925
1921
  Check the status of a LLMs.txt generation operation.
926
1922
 
927
1923
  Args:
928
- id (str): The ID of the LLMs.txt generation operation.
1924
+ id (str): The unique identifier of the LLMs.txt generation job to check status for.
929
1925
 
930
1926
  Returns:
931
- Dict[str, Any]: The current status and results of the generation operation.
1927
+ GenerateLLMsTextStatusResponse: A response containing:
1928
+ * success (bool): Whether the generation was successful
1929
+ * status (str): Status of generation ("processing", "completed", "failed")
1930
+ * data (Dict[str, str], optional): Generated text with fields:
1931
+ * llmstxt (str): Generated LLMs.txt content
1932
+ * llmsfulltxt (str, optional): Full version if requested
1933
+ * error (str, optional): Error message if generation failed
1934
+ * expiresAt (str): When the generated data expires
932
1935
 
933
1936
  Raises:
934
1937
  Exception: If the status check fails.
@@ -950,7 +1953,9 @@ class FirecrawlApp:
950
1953
 
951
1954
  return {'success': False, 'error': 'Internal server error'}
952
1955
 
953
- def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
1956
+ def _prepare_headers(
1957
+ self,
1958
+ idempotency_key: Optional[str] = None) -> Dict[str, str]:
954
1959
  """
955
1960
  Prepare the headers for API requests.
956
1961
 
@@ -972,11 +1977,13 @@ class FirecrawlApp:
972
1977
  'Authorization': f'Bearer {self.api_key}',
973
1978
  }
974
1979
 
975
- def _post_request(self, url: str,
976
- data: Dict[str, Any],
977
- headers: Dict[str, str],
978
- retries: int = 3,
979
- backoff_factor: float = 0.5) -> requests.Response:
1980
+ def _post_request(
1981
+ self,
1982
+ url: str,
1983
+ data: Dict[str, Any],
1984
+ headers: Dict[str, str],
1985
+ retries: int = 3,
1986
+ backoff_factor: float = 0.5) -> requests.Response:
980
1987
  """
981
1988
  Make a POST request with retries.
982
1989
 
@@ -1001,10 +2008,12 @@ class FirecrawlApp:
1001
2008
  return response
1002
2009
  return response
1003
2010
 
1004
- def _get_request(self, url: str,
1005
- headers: Dict[str, str],
1006
- retries: int = 3,
1007
- backoff_factor: float = 0.5) -> requests.Response:
2011
+ def _get_request(
2012
+ self,
2013
+ url: str,
2014
+ headers: Dict[str, str],
2015
+ retries: int = 3,
2016
+ backoff_factor: float = 0.5) -> requests.Response:
1008
2017
  """
1009
2018
  Make a GET request with retries.
1010
2019
 
@@ -1028,10 +2037,12 @@ class FirecrawlApp:
1028
2037
  return response
1029
2038
  return response
1030
2039
 
1031
- def _delete_request(self, url: str,
1032
- headers: Dict[str, str],
1033
- retries: int = 3,
1034
- backoff_factor: float = 0.5) -> requests.Response:
2040
+ def _delete_request(
2041
+ self,
2042
+ url: str,
2043
+ headers: Dict[str, str],
2044
+ retries: int = 3,
2045
+ backoff_factor: float = 0.5) -> requests.Response:
1035
2046
  """
1036
2047
  Make a DELETE request with retries.
1037
2048
 
@@ -1055,16 +2066,21 @@ class FirecrawlApp:
1055
2066
  return response
1056
2067
  return response
1057
2068
 
1058
- def _monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int) -> Any:
2069
+ def _monitor_job_status(
2070
+ self,
2071
+ id: str,
2072
+ headers: Dict[str, str],
2073
+ poll_interval: int) -> CrawlStatusResponse:
1059
2074
  """
1060
2075
  Monitor the status of a crawl job until completion.
1061
2076
 
1062
2077
  Args:
1063
2078
  id (str): The ID of the crawl job.
1064
2079
  headers (Dict[str, str]): The headers to include in the status check requests.
1065
- poll_interval (int): Secounds between status checks.
2080
+ poll_interval (int): Seconds between status checks.
2081
+
1066
2082
  Returns:
1067
- Any: The crawl results if the job is completed successfully.
2083
+ CrawlStatusResponse: The crawl results if the job is completed successfully.
1068
2084
 
1069
2085
  Raises:
1070
2086
  Exception: If the job fails or an error occurs during status checks.
@@ -1091,7 +2107,7 @@ class FirecrawlApp:
1091
2107
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
1092
2108
  data.extend(status_data.get('data', []))
1093
2109
  status_data['data'] = data
1094
- return status_data
2110
+ return CrawlStatusResponse(**status_data)
1095
2111
  else:
1096
2112
  raise Exception('Crawl job completed but no data was returned')
1097
2113
  elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
@@ -1102,7 +2118,10 @@ class FirecrawlApp:
1102
2118
  else:
1103
2119
  self._handle_error(status_response, 'check crawl status')
1104
2120
 
1105
- def _handle_error(self, response: requests.Response, action: str) -> None:
2121
+ def _handle_error(
2122
+ self,
2123
+ response: requests.Response,
2124
+ action: str) -> None:
1106
2125
  """
1107
2126
  Handle errors from API responses.
1108
2127
 
@@ -1119,49 +2138,100 @@ class FirecrawlApp:
1119
2138
  except:
1120
2139
  raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
1121
2140
 
1122
-
1123
- if response.status_code == 402:
1124
- message = f"Payment Required: Failed to {action}. {error_message} - {error_details}"
1125
- elif response.status_code == 403:
1126
- message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
1127
- elif response.status_code == 408:
1128
- message = f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
1129
- elif response.status_code == 409:
1130
- message = f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
1131
- elif response.status_code == 500:
1132
- message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
1133
- else:
1134
- message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message} - {error_details}"
2141
+ message = self._get_error_message(response.status_code, action, error_message, error_details)
1135
2142
 
1136
2143
  # Raise an HTTPError with the custom message and attach the response
1137
2144
  raise requests.exceptions.HTTPError(message, response=response)
1138
2145
 
1139
- def deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None,
1140
- on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
1141
- on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> Dict[str, Any]:
2146
+ def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2147
+ """
2148
+ Generate a standardized error message based on HTTP status code.
2149
+
2150
+ Args:
2151
+ status_code (int): The HTTP status code from the response
2152
+ action (str): Description of the action that was being performed
2153
+ error_message (str): The error message from the API response
2154
+ error_details (str): Additional error details from the API response
2155
+
2156
+ Returns:
2157
+ str: A formatted error message
2158
+ """
2159
+ if status_code == 402:
2160
+ return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
2161
+ elif status_code == 403:
2162
+ return f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
2163
+ elif status_code == 408:
2164
+ return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
2165
+ elif status_code == 409:
2166
+ return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
2167
+ elif status_code == 500:
2168
+ return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
2169
+ else:
2170
+ return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
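Because `_handle_error` raises `requests.exceptions.HTTPError` with the message built by `_get_error_message` and attaches the response, callers can handle failures like this (the job id and key are placeholders):

```python
import os

import requests

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])  # placeholder key

try:
    app.check_batch_scrape_status("not-a-real-job-id")  # placeholder id
except requests.exceptions.HTTPError as exc:
    # The message comes from _get_error_message(); _handle_error() also
    # attaches the original response for inspection.
    print(exc)
    print("HTTP status:", exc.response.status_code)
```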
2171
+
2172
+ def deep_research(
2173
+ self,
2174
+ query: str,
2175
+ *,
2176
+ max_depth: Optional[int] = None,
2177
+ time_limit: Optional[int] = None,
2178
+ max_urls: Optional[int] = None,
2179
+ analysis_prompt: Optional[str] = None,
2180
+ system_prompt: Optional[str] = None,
2181
+ __experimental_stream_steps: Optional[bool] = None,
2182
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
2183
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
1142
2184
  """
1143
2185
  Initiates a deep research operation on a given query and polls until completion.
1144
2186
 
1145
2187
  Args:
1146
- query (str): The query to research.
1147
- params (Optional[Union[Dict[str, Any], DeepResearchParams]]): Parameters for the deep research operation.
1148
- on_activity (Optional[Callable[[Dict[str, Any]], None]]): Optional callback to receive activity updates in real-time.
2188
+ query (str): Research query or topic to investigate
2189
+ max_depth (Optional[int]): Maximum depth of research exploration
2190
+ time_limit (Optional[int]): Time limit in seconds for research
2191
+ max_urls (Optional[int]): Maximum number of URLs to process
2192
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2193
+ system_prompt (Optional[str]): Custom system prompt
2194
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2195
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
2196
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
1149
2197
 
1150
2198
  Returns:
1151
- Dict[str, Any]: The final research results.
2199
+ DeepResearchStatusResponse containing:
2200
+ * success (bool): Whether research completed successfully
2201
+ * status (str): Current state (processing/completed/failed)
2202
+ * error (Optional[str]): Error message if failed
2203
+ * id (str): Unique identifier for the research job
2204
+ * data (Any): Research findings and analysis
2205
+ * sources (List[Dict]): List of discovered sources
2206
+ * activities (List[Dict]): Research progress log
2207
+ * summaries (List[str]): Generated research summaries
1152
2208
 
1153
2209
  Raises:
1154
- Exception: If the research operation fails.
2210
+ Exception: If research fails
1155
2211
  """
1156
- if params is None:
1157
- params = {}
1158
-
1159
- if isinstance(params, dict):
1160
- research_params = DeepResearchParams(**params)
1161
- else:
1162
- research_params = params
1163
-
1164
- response = self.async_deep_research(query, research_params)
2212
+ research_params = {}
2213
+ if max_depth is not None:
2214
+ research_params['maxDepth'] = max_depth
2215
+ if time_limit is not None:
2216
+ research_params['timeLimit'] = time_limit
2217
+ if max_urls is not None:
2218
+ research_params['maxUrls'] = max_urls
2219
+ if analysis_prompt is not None:
2220
+ research_params['analysisPrompt'] = analysis_prompt
2221
+ if system_prompt is not None:
2222
+ research_params['systemPrompt'] = system_prompt
2223
+ if __experimental_stream_steps is not None:
2224
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2225
+ research_params = DeepResearchParams(**research_params)
2226
+
2227
+ response = self.async_deep_research(
2228
+ query,
2229
+ max_depth=max_depth,
2230
+ time_limit=time_limit,
2231
+ max_urls=max_urls,
2232
+ analysis_prompt=analysis_prompt,
2233
+ system_prompt=system_prompt
2234
+ )
1165
2235
  if not response.get('success') or 'id' not in response:
1166
2236
  return response
1167
2237
 
@@ -1194,31 +2264,57 @@ class FirecrawlApp:
1194
2264
  time.sleep(2) # Polling interval
1195
2265
 
1196
2266
  return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
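A sketch of `deep_research` with a progress callback; the query, limits, and key are placeholders, and the activity keys follow the docstring above.

```python
import os

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])  # placeholder key


def on_activity(activity):
    # Receives {type, status, message, timestamp, depth} per the docstring.
    print(f"[depth {activity.get('depth')}] {activity.get('message')}")


result = app.deep_research(
    "How do large documentation sites structure their sitemaps?",
    max_depth=3,
    time_limit=120,
    max_urls=10,
    on_activity=on_activity,
)
print(result)
```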
1197
- def async_deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> Dict[str, Any]:
2267
+
2268
+ def async_deep_research(
2269
+ self,
2270
+ query: str,
2271
+ *,
2272
+ max_depth: Optional[int] = None,
2273
+ time_limit: Optional[int] = None,
2274
+ max_urls: Optional[int] = None,
2275
+ analysis_prompt: Optional[str] = None,
2276
+ system_prompt: Optional[str] = None,
2277
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
1198
2278
  """
1199
2279
  Initiates an asynchronous deep research operation.
1200
2280
 
1201
2281
  Args:
1202
- query (str): The query to research.
1203
- params (Optional[Union[Dict[str, Any], DeepResearchParams]]): Parameters for the deep research operation.
2282
+ query (str): Research query or topic to investigate
2283
+ max_depth (Optional[int]): Maximum depth of research exploration
2284
+ time_limit (Optional[int]): Time limit in seconds for research
2285
+ max_urls (Optional[int]): Maximum number of URLs to process
2286
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2287
+ system_prompt (Optional[str]): Custom system prompt
2288
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
1204
2289
 
1205
2290
  Returns:
1206
- Dict[str, Any]: The response from the deep research initiation.
2291
+ Dict[str, Any]: A response containing:
2292
+ * success (bool): Whether the research initiation was successful
2293
+ * id (str): The unique identifier for the research job
2294
+ * error (str, optional): Error message if initiation failed
1207
2295
 
1208
2296
  Raises:
1209
2297
  Exception: If the research initiation fails.
1210
2298
  """
1211
- if params is None:
1212
- params = {}
1213
-
1214
- if isinstance(params, dict):
1215
- research_params = DeepResearchParams(**params)
1216
- else:
1217
- research_params = params
2299
+ research_params = {}
2300
+ if max_depth is not None:
2301
+ research_params['maxDepth'] = max_depth
2302
+ if time_limit is not None:
2303
+ research_params['timeLimit'] = time_limit
2304
+ if max_urls is not None:
2305
+ research_params['maxUrls'] = max_urls
2306
+ if analysis_prompt is not None:
2307
+ research_params['analysisPrompt'] = analysis_prompt
2308
+ if system_prompt is not None:
2309
+ research_params['systemPrompt'] = system_prompt
2310
+ if __experimental_stream_steps is not None:
2311
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2312
+ research_params = DeepResearchParams(**research_params)
1218
2313
 
1219
2314
  headers = self._prepare_headers()
1220
2315
 
1221
2316
  json_data = {'query': query, **research_params.dict(exclude_none=True)}
2317
+ json_data['origin'] = f"python-sdk@{version}"
1222
2318
 
1223
2319
  # Handle json options schema if present
1224
2320
  if 'jsonOptions' in json_data:
@@ -1240,7 +2336,7 @@ class FirecrawlApp:
1240
2336
 
1241
2337
  return {'success': False, 'error': 'Internal server error'}
1242
2338
 
1243
- def check_deep_research_status(self, id: str) -> Dict[str, Any]:
2339
+ def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
1244
2340
  """
1245
2341
  Check the status of a deep research operation.
1246
2342
 
@@ -1248,7 +2344,19 @@ class FirecrawlApp:
1248
2344
  id (str): The ID of the deep research operation.
1249
2345
 
1250
2346
  Returns:
1251
- Dict[str, Any]: The current status and results of the research operation.
2347
+ DeepResearchStatusResponse containing:
2348
+
2349
+ Status:
2350
+ * success - Whether research completed successfully
2351
+ * status - Current state (processing/completed/failed)
2352
+ * error - Error message if failed
2353
+
2354
+ Results:
2355
+ * id - Unique identifier for the research job
2356
+ * data - Research findings and analysis
2357
+ * sources - List of discovered sources
2358
+ * activities - Research progress log
2359
+ * summaries - Generated research summaries
1252
2360
 
1253
2361
  Raises:
1254
2362
  Exception: If the status check fails.
@@ -1271,6 +2379,17 @@ class FirecrawlApp:
1271
2379
  return {'success': False, 'error': 'Internal server error'}
1272
2380
 
1273
2381
  class CrawlWatcher:
2382
+ """
2383
+ A class to watch and handle crawl job events via WebSocket connection.
2384
+
2385
+ Attributes:
2386
+ id (str): The ID of the crawl job to watch
2387
+ app (FirecrawlApp): The FirecrawlApp instance
2388
+ data (List[Dict[str, Any]]): List of crawled documents/data
2389
+ status (str): Current status of the crawl job
2390
+ ws_url (str): WebSocket URL for the crawl job
2391
+ event_handlers (dict): Dictionary of event type to list of handler functions
2392
+ """
1274
2393
  def __init__(self, id: str, app: FirecrawlApp):
1275
2394
  self.id = id
1276
2395
  self.app = app
@@ -1283,25 +2402,57 @@ class CrawlWatcher:
1283
2402
  'document': []
1284
2403
  }
1285
2404
 
1286
- async def connect(self):
1287
- async with websockets.connect(self.ws_url, extra_headers={"Authorization": f"Bearer {self.app.api_key}"}) as websocket:
2405
+ async def connect(self) -> None:
2406
+ """
2407
+ Establishes WebSocket connection and starts listening for messages.
2408
+ """
2409
+ async with websockets.connect(
2410
+ self.ws_url,
2411
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
2412
+ ) as websocket:
1288
2413
  await self._listen(websocket)
1289
2414
 
1290
- async def _listen(self, websocket):
2415
+ async def _listen(self, websocket) -> None:
2416
+ """
2417
+ Listens for incoming WebSocket messages and handles them.
2418
+
2419
+ Args:
2420
+ websocket: The WebSocket connection object
2421
+ """
1291
2422
  async for message in websocket:
1292
2423
  msg = json.loads(message)
1293
2424
  await self._handle_message(msg)
1294
2425
 
1295
- def add_event_listener(self, event_type: str, handler):
2426
+ def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
2427
+ """
2428
+ Adds an event handler function for a specific event type.
2429
+
2430
+ Args:
2431
+ event_type (str): Type of event to listen for ('done', 'error', or 'document')
2432
+ handler (Callable): Function to handle the event
2433
+ """
1296
2434
  if event_type in self.event_handlers:
1297
2435
  self.event_handlers[event_type].append(handler)
1298
2436
 
1299
- def dispatch_event(self, event_type: str, detail: Dict[str, Any]):
2437
+ def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
2438
+ """
2439
+ Dispatches an event to all registered handlers for that event type.
2440
+
2441
+ Args:
2442
+ event_type (str): Type of event to dispatch
2443
+ detail (Dict[str, Any]): Event details/data to pass to handlers
2444
+ """
1300
2445
  if event_type in self.event_handlers:
1301
2446
  for handler in self.event_handlers[event_type]:
1302
2447
  handler(detail)
1303
2448
 
1304
- async def _handle_message(self, msg: Dict[str, Any]):
2449
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
2450
+ """
2451
+ Handles incoming WebSocket messages based on their type.
2452
+
2453
+ Args:
2454
+ msg (Dict[str, Any]): The message to handle
2455
+ """
1305
2456
  if msg['type'] == 'done':
1306
2457
  self.status = 'completed'
1307
2458
  self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
@@ -1316,3 +2467,1773 @@ class CrawlWatcher:
1316
2467
  elif msg['type'] == 'document':
1317
2468
  self.data.append(msg['data'])
1318
2469
  self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
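A sketch of wiring `CrawlWatcher` event handlers for a batch scrape job; the URL and key are placeholders, and the event payload shapes follow `_handle_message` above.

```python
import asyncio
import os

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])  # placeholder key

watcher = app.batch_scrape_urls_and_watch(
    ["https://example.com"], formats=["markdown"]
)


def on_document(event):
    # 'document' events carry {'data': <scraped page>, 'id': <job id>}.
    print("received a page for job", event["id"])


def on_done(event):
    print("finished:", event["status"], "-", len(event["data"]), "pages")


watcher.add_event_listener("document", on_document)
watcher.add_event_listener("done", on_done)

# connect() opens the WebSocket and dispatches events until the job ends.
asyncio.run(watcher.connect())
```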
2470
+
2471
+ class AsyncFirecrawlApp(FirecrawlApp):
2472
+ """
2473
+ Asynchronous version of FirecrawlApp that implements async methods using aiohttp.
2474
+ Provides non-blocking alternatives to all FirecrawlApp operations.
2475
+ """
2476
+
2477
+ async def _async_request(
2478
+ self,
2479
+ method: str,
2480
+ url: str,
2481
+ headers: Dict[str, str],
2482
+ data: Optional[Dict[str, Any]] = None,
2483
+ retries: int = 3,
2484
+ backoff_factor: float = 0.5) -> Dict[str, Any]:
2485
+ """
2486
+ Generic async request method with exponential backoff retry logic.
2487
+
2488
+ Args:
2489
+ method (str): The HTTP method to use (e.g., "GET" or "POST").
2490
+ url (str): The URL to send the request to.
2491
+ headers (Dict[str, str]): Headers to include in the request.
2492
+ data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
2493
+ retries (int): Maximum number of retry attempts (default: 3).
2494
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2495
+ Delay will be backoff_factor * (2 ** retry_count).
2496
+
2497
+ Returns:
2498
+ Dict[str, Any]: The parsed JSON response from the server.
2499
+
2500
+ Raises:
2501
+ aiohttp.ClientError: If the request fails after all retries.
2502
+ Exception: If max retries are exceeded or other errors occur.
2503
+ """
2504
+ async with aiohttp.ClientSession() as session:
2505
+ for attempt in range(retries):
2506
+ try:
2507
+ async with session.request(
2508
+ method=method, url=url, headers=headers, json=data
2509
+ ) as response:
2510
+ if response.status == 502:
2511
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2512
+ continue
2513
+ if response.status >= 300:
2514
+ await self._handle_error(response, f"make {method} request")
2515
+ return await response.json()
2516
+ except aiohttp.ClientError as e:
2517
+ if attempt == retries - 1:
2518
+ raise e
2519
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2520
+ raise Exception("Max retries exceeded")
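The retry helper backs off by `backoff_factor * (2 ** attempt)` between attempts (and retries 502 responses); the snippet below only illustrates that delay schedule and is not part of the SDK.

```python
# Delay schedule used between retries in _async_request (backoff_factor=0.5, retries=3)
backoff_factor = 0.5
for attempt in range(3):
    print(f"after failed attempt {attempt + 1}: sleep {backoff_factor * (2 ** attempt):.1f}s")
# after failed attempt 1: sleep 0.5s
# after failed attempt 2: sleep 1.0s
# after failed attempt 3: sleep 2.0s
```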
2521
+
2522
+ async def _async_post_request(
2523
+ self, url: str, data: Dict[str, Any], headers: Dict[str, str],
2524
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2525
+ """
2526
+ Make an async POST request with exponential backoff retry logic.
2527
+
2528
+ Args:
2529
+ url (str): The URL to send the POST request to.
2530
+ data (Dict[str, Any]): The JSON data to include in the request body.
2531
+ headers (Dict[str, str]): Headers to include in the request.
2532
+ retries (int): Maximum number of retry attempts (default: 3).
2533
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2534
+ Delay will be backoff_factor * (2 ** retry_count).
2535
+
2536
+ Returns:
2537
+ Dict[str, Any]: The parsed JSON response from the server.
2538
+
2539
+ Raises:
2540
+ aiohttp.ClientError: If the request fails after all retries.
2541
+ Exception: If max retries are exceeded or other errors occur.
2542
+ """
2543
+ return await self._async_request("POST", url, headers, data, retries, backoff_factor)
2544
+
2545
+ async def _async_get_request(
2546
+ self, url: str, headers: Dict[str, str],
2547
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2548
+ """
2549
+ Make an async GET request with exponential backoff retry logic.
2550
+
2551
+ Args:
2552
+ url (str): The URL to send the GET request to.
2553
+ headers (Dict[str, str]): Headers to include in the request.
2554
+ retries (int): Maximum number of retry attempts (default: 3).
2555
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2556
+ Delay will be backoff_factor * (2 ** retry_count).
2557
+
2558
+ Returns:
2559
+ Dict[str, Any]: The parsed JSON response from the server.
2560
+
2561
+ Raises:
2562
+ aiohttp.ClientError: If the request fails after all retries.
2563
+ Exception: If max retries are exceeded or other errors occur.
2564
+ """
2565
+ return await self._async_request("GET", url, headers, None, retries, backoff_factor)
2566
+
2567
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
2568
+ """
2569
+ Handle errors from async API responses with detailed error messages.
2570
+
2571
+ Args:
2572
+ response (aiohttp.ClientResponse): The response object from the failed request
2573
+ action (str): Description of the action that was being attempted
2574
+
2575
+ Raises:
2576
+ aiohttp.ClientError: With a detailed error message based on the response status:
2577
+ - 402: Payment Required
2578
+ - 408: Request Timeout
2579
+ - 409: Conflict
2580
+ - 500: Internal Server Error
2581
+ - Other: Unexpected error with status code
2582
+ """
2583
+ try:
2584
+ error_data = await response.json()
2585
+ error_message = error_data.get('error', 'No error message provided.')
2586
+ error_details = error_data.get('details', 'No additional error details provided.')
2587
+ except:
2588
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
2589
+
2590
+ message = await self._get_async_error_message(response.status, action, error_message, error_details)
2591
+
2592
+ raise aiohttp.ClientError(message)
2593
+
2594
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2595
+ """
2596
+ Generate a standardized error message based on HTTP status code for async operations.
2597
+
2598
+ Args:
2599
+ status_code (int): The HTTP status code from the response
2600
+ action (str): Description of the action that was being performed
2601
+ error_message (str): The error message from the API response
2602
+ error_details (str): Additional error details from the API response
2603
+
2604
+ Returns:
2605
+ str: A formatted error message
2606
+ """
2607
+ return self._get_error_message(status_code, action, error_message, error_details)
2608
+
2609
+ async def crawl_url_and_watch(
2610
+ self,
2611
+ url: str,
2612
+ params: Optional[CrawlParams] = None,
2613
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2614
+ """
2615
+ Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket.
2616
+
2617
+ Args:
2618
+ url (str): Target URL to start crawling from
2619
+ params (Optional[CrawlParams]): See CrawlParams model for configuration:
2620
+ URL Discovery:
2621
+ * includePaths - Patterns of URLs to include
2622
+ * excludePaths - Patterns of URLs to exclude
2623
+ * maxDepth - Maximum crawl depth
2624
+ * maxDiscoveryDepth - Maximum depth for finding new URLs
2625
+ * limit - Maximum pages to crawl
2626
+
2627
+ Link Following:
2628
+ * allowBackwardLinks - Follow parent directory links
2629
+ * allowExternalLinks - Follow external domain links
2630
+ * ignoreSitemap - Skip sitemap.xml processing
2631
+
2632
+ Advanced:
2633
+ * scrapeOptions - Page scraping configuration
2634
+ * webhook - Notification webhook settings
2635
+ * deduplicateSimilarURLs - Remove similar URLs
2636
+ * ignoreQueryParameters - Ignore URL parameters
2637
+ * regexOnFullURL - Apply regex to full URLs
2638
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2639
+
2640
+ Returns:
2641
+ AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket
2642
+
2643
+ Raises:
2644
+ Exception: If crawl job fails to start
2645
+ """
2646
+ crawl_response = await self.async_crawl_url(url, params, idempotency_key)
2647
+ if crawl_response.get('success') and 'id' in crawl_response:
2648
+ return AsyncCrawlWatcher(crawl_response['id'], self)
2649
+ else:
2650
+ raise Exception("Crawl job failed to start")
2651
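For orientation, a minimal usage sketch of crawl_url_and_watch follows. It assumes AsyncFirecrawlApp can be imported from the firecrawl package, that its constructor takes an api_key argument, and that the watcher exposes the status attribute set by its message handler; none of these are confirmed by this section, so treat it as illustrative only.

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed import path

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor argument
        watcher = await app.crawl_url_and_watch("https://example.com")
        # connect() opens the WebSocket and processes messages until the crawl ends
        await watcher.connect()
        print(watcher.status)  # 'completed' or 'failed' per the message handler shown later

    asyncio.run(main())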
+
2652
+ async def batch_scrape_urls_and_watch(
2653
+ self,
2654
+ urls: List[str],
2655
+ params: Optional[ScrapeParams] = None,
2656
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2657
+ """
2658
+ Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress.
2659
+
2660
+ Args:
2661
+ urls (List[str]): List of URLs to scrape
2662
+ params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
2663
+
2664
+ Content Options:
2665
+ * formats - Content formats to retrieve
2666
+ * includeTags - HTML tags to include
2667
+ * excludeTags - HTML tags to exclude
2668
+ * onlyMainContent - Extract main content only
2669
+
2670
+ Request Options:
2671
+ * headers - Custom HTTP headers
2672
+ * timeout - Request timeout (ms)
2673
+ * mobile - Use mobile user agent
2674
+ * proxy - Proxy type
2675
+
2676
+ Extraction Options:
2677
+ * extract - Content extraction config
2678
+ * jsonOptions - JSON extraction config
2679
+ * actions - Actions to perform
2680
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2681
+
2682
+ Returns:
2683
+ AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket
2684
+
2685
+ Raises:
2686
+ Exception: If batch scrape job fails to start
2687
+ """
2688
+ batch_response = await self.async_batch_scrape_urls(urls, params, idempotency_key)
2689
+ if batch_response.get('success') and 'id' in batch_response:
2690
+ return AsyncCrawlWatcher(batch_response['id'], self)
2691
+ else:
2692
+ raise Exception("Batch scrape job failed to start")
2693
+
2694
+ async def scrape_url(
2695
+ self,
2696
+ url: str,
2697
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
2698
+ include_tags: Optional[List[str]] = None,
2699
+ exclude_tags: Optional[List[str]] = None,
2700
+ only_main_content: Optional[bool] = None,
2701
+ wait_for: Optional[int] = None,
2702
+ timeout: Optional[int] = None,
2703
+ location: Optional[LocationConfig] = None,
2704
+ mobile: Optional[bool] = None,
2705
+ skip_tls_verification: Optional[bool] = None,
2706
+ remove_base64_images: Optional[bool] = None,
2707
+ block_ads: Optional[bool] = None,
2708
+ proxy: Optional[Literal["basic", "stealth"]] = None,
2709
+ extract: Optional[ExtractConfig] = None,
2710
+ json_options: Optional[ExtractConfig] = None,
2711
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None) -> ScrapeResponse[Any]:
2712
+ """
2713
+ Scrape and extract content from a URL asynchronously.
2714
+
2715
+ Args:
2716
+ url (str): Target URL to scrape
2717
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
2718
+ include_tags (Optional[List[str]]): HTML tags to include
2719
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
2720
+ only_main_content (Optional[bool]): Extract main content only
2721
+ wait_for (Optional[int]): Wait time in milliseconds before scraping
2722
+ timeout (Optional[int]): Request timeout (ms)
2723
+ location (Optional[LocationConfig]): Location configuration
2724
+ mobile (Optional[bool]): Use mobile user agent
2725
+ skip_tls_verification (Optional[bool]): Skip TLS verification
2726
+ remove_base64_images (Optional[bool]): Remove base64 images
2727
+ block_ads (Optional[bool]): Block ads
2728
+ proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
2729
+ extract (Optional[ExtractConfig]): Content extraction settings
2730
+ json_options (Optional[ExtractConfig]): JSON extraction settings
2731
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
2732
+
2733
+ Returns:
2734
+ ScrapeResponse with:
2735
+ * Requested content formats
2736
+ * Page metadata
2737
+ * Extraction results
2738
+ * Success/error status
2739
+
2740
+ Raises:
2741
+ Exception: If scraping fails
2742
+ """
2743
+ headers = self._prepare_headers()
2744
+
2745
+ # Build scrape parameters
2746
+ scrape_params = {
2747
+ 'url': url,
2748
+ 'origin': f"python-sdk@{version}"
2749
+ }
2750
+
2751
+ # Add optional parameters if provided and not None
2752
+ if formats:
2753
+ scrape_params['formats'] = formats
2754
+ if include_tags:
2755
+ scrape_params['includeTags'] = include_tags
2756
+ if exclude_tags:
2757
+ scrape_params['excludeTags'] = exclude_tags
2758
+ if only_main_content is not None:
2759
+ scrape_params['onlyMainContent'] = only_main_content
2760
+ if wait_for:
2761
+ scrape_params['waitFor'] = wait_for
2762
+ if timeout:
2763
+ scrape_params['timeout'] = timeout
2764
+ if location:
2765
+ scrape_params['location'] = location.dict(exclude_none=True)
2766
+ if mobile is not None:
2767
+ scrape_params['mobile'] = mobile
2768
+ if skip_tls_verification is not None:
2769
+ scrape_params['skipTlsVerification'] = skip_tls_verification
2770
+ if remove_base64_images is not None:
2771
+ scrape_params['removeBase64Images'] = remove_base64_images
2772
+ if block_ads is not None:
2773
+ scrape_params['blockAds'] = block_ads
2774
+ if proxy:
2775
+ scrape_params['proxy'] = proxy
2776
+ if extract:
2777
+ extract_dict = extract.dict(exclude_none=True)
2778
+ if 'schema' in extract_dict and hasattr(extract.schema, 'schema'):
2779
+ extract_dict['schema'] = extract.schema.schema() # Ensure pydantic model schema is converted
2780
+ scrape_params['extract'] = extract_dict
2781
+ if json_options:
2782
+ json_options_dict = json_options.dict(exclude_none=True)
2783
+ if 'schema' in json_options_dict and hasattr(json_options.schema, 'schema'):
2784
+ json_options_dict['schema'] = json_options.schema.schema() # Ensure pydantic model schema is converted
2785
+ scrape_params['jsonOptions'] = json_options_dict
2786
+ if actions:
2787
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
2788
+
2789
+ # Make async request
2790
+ endpoint = f'/v1/scrape'
2791
+ response = await self._async_post_request(
2792
+ f'{self.api_url}{endpoint}',
2793
+ scrape_params,
2794
+ headers
2795
+ )
2796
+
2797
+ if response.get('success') and 'data' in response:
2798
+ return ScrapeResponse(**response['data'])
2799
+ elif "error" in response:
2800
+ raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
2801
+ else:
2802
+ # Use the response content directly if possible, otherwise a generic message
2803
+ error_content = response.get('error', str(response))
2804
+ raise Exception(f'Failed to scrape URL. Error: {error_content}')
2805
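A hedged example of the async scrape_url call shown above; the import path, the api_key constructor argument, and the markdown/links attributes on ScrapeResponse are assumptions rather than facts taken from this diff.

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed import path

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
        doc = await app.scrape_url(
            "https://example.com",
            formats=["markdown", "links"],
            only_main_content=True,
            timeout=30000,  # milliseconds
        )
        # ScrapeResponse is a pydantic model; these field names are assumptions
        print(doc.markdown)
        print(doc.links)

    asyncio.run(main())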
+
2806
+ async def batch_scrape_urls(
2807
+ self,
2808
+ urls: List[str],
2809
+ *,
2810
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
2811
+ headers: Optional[Dict[str, str]] = None,
2812
+ include_tags: Optional[List[str]] = None,
2813
+ exclude_tags: Optional[List[str]] = None,
2814
+ only_main_content: Optional[bool] = None,
2815
+ wait_for: Optional[int] = None,
2816
+ timeout: Optional[int] = None,
2817
+ location: Optional[LocationConfig] = None,
2818
+ mobile: Optional[bool] = None,
2819
+ skip_tls_verification: Optional[bool] = None,
2820
+ remove_base64_images: Optional[bool] = None,
2821
+ block_ads: Optional[bool] = None,
2822
+ proxy: Optional[Literal["basic", "stealth"]] = None,
2823
+ extract: Optional[ExtractConfig] = None,
2824
+ json_options: Optional[ExtractConfig] = None,
2825
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2826
+ agent: Optional[AgentOptions] = None,
2827
+ poll_interval: Optional[int] = 2,
2828
+ idempotency_key: Optional[str] = None,
2829
+ **kwargs
2830
+ ) -> BatchScrapeStatusResponse:
2831
+ """
2832
+ Asynchronously scrape multiple URLs and monitor until completion.
2833
+
2834
+ Args:
2835
+ urls (List[str]): URLs to scrape
2836
+ formats (Optional[List[Literal]]): Content formats to retrieve
2837
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
2838
+ include_tags (Optional[List[str]]): HTML tags to include
2839
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
2840
+ only_main_content (Optional[bool]): Extract main content only
2841
+ wait_for (Optional[int]): Wait time in milliseconds
2842
+ timeout (Optional[int]): Request timeout in milliseconds
2843
+ location (Optional[LocationConfig]): Location configuration
2844
+ mobile (Optional[bool]): Use mobile user agent
2845
+ skip_tls_verification (Optional[bool]): Skip TLS verification
2846
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
2847
+ block_ads (Optional[bool]): Block advertisements
2848
+ proxy (Optional[Literal]): Proxy type to use
2849
+ extract (Optional[ExtractConfig]): Content extraction config
2850
+ json_options (Optional[ExtractConfig]): JSON extraction config
2851
+ actions (Optional[List[Union]]): Actions to perform
2852
+ agent (Optional[AgentOptions]): Agent configuration
2853
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
2854
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2855
+ **kwargs: Additional parameters to pass to the API
2856
+
2857
+ Returns:
2858
+ BatchScrapeStatusResponse with:
2859
+ * Scraping status and progress
2860
+ * Scraped content for each URL
2861
+ * Success/error information
2862
+
2863
+ Raises:
2864
+ Exception: If batch scrape fails
2865
+ """
2866
+ scrape_params = {}
2867
+
2868
+ # Add individual parameters
2869
+ if formats is not None:
2870
+ scrape_params['formats'] = formats
2871
+ if headers is not None:
2872
+ scrape_params['headers'] = headers
2873
+ if include_tags is not None:
2874
+ scrape_params['includeTags'] = include_tags
2875
+ if exclude_tags is not None:
2876
+ scrape_params['excludeTags'] = exclude_tags
2877
+ if only_main_content is not None:
2878
+ scrape_params['onlyMainContent'] = only_main_content
2879
+ if wait_for is not None:
2880
+ scrape_params['waitFor'] = wait_for
2881
+ if timeout is not None:
2882
+ scrape_params['timeout'] = timeout
2883
+ if location is not None:
2884
+ scrape_params['location'] = location.dict(exclude_none=True)
2885
+ if mobile is not None:
2886
+ scrape_params['mobile'] = mobile
2887
+ if skip_tls_verification is not None:
2888
+ scrape_params['skipTlsVerification'] = skip_tls_verification
2889
+ if remove_base64_images is not None:
2890
+ scrape_params['removeBase64Images'] = remove_base64_images
2891
+ if block_ads is not None:
2892
+ scrape_params['blockAds'] = block_ads
2893
+ if proxy is not None:
2894
+ scrape_params['proxy'] = proxy
2895
+ if extract is not None:
2896
+ if hasattr(extract.schema, 'schema'):
2897
+ extract.schema = extract.schema.schema()
2898
+ scrape_params['extract'] = extract.dict(exclude_none=True)
2899
+ if json_options is not None:
2900
+ if hasattr(json_options.schema, 'schema'):
2901
+ json_options.schema = json_options.schema.schema()
2902
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
2903
+ if actions is not None:
2904
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
2905
+ if agent is not None:
2906
+ scrape_params['agent'] = agent.dict(exclude_none=True)
2907
+
2908
+ # Add any additional kwargs
2909
+ scrape_params.update(kwargs)
2910
+
2911
+ # Create final params object
2912
+ final_params = ScrapeParams(**scrape_params)
2913
+ params_dict = final_params.dict(exclude_none=True)
2914
+ params_dict['urls'] = urls
2915
+ params_dict['origin'] = f"python-sdk@{version}"
2916
+
2917
+ # Make request
2918
+ headers = self._prepare_headers(idempotency_key)
2919
+ response = await self._async_post_request(
2920
+ f'{self.api_url}/v1/batch/scrape',
2921
+ params_dict,
2922
+ headers
2923
+ )
2924
+
2925
+ if response.get('success') and 'id' in response:
+ return await self._async_monitor_job_status(response['id'], headers, poll_interval)
+ else:
+ raise Exception(f'Failed to start batch scrape job. Error: {response.get("error", response)}')
2933
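The waiting variant can be exercised as in the sketch below. The returned object is the dict produced by the status monitor defined later in this file, so dict-style access is used; import path and api_key remain assumptions.

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed import path

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
        job = await app.batch_scrape_urls(
            ["https://example.com", "https://example.org"],
            formats=["markdown"],
            poll_interval=5,  # seconds between status checks
        )
        # Keys mirror the status payload described in the docstring
        print(job["status"], job.get("completed"), "/", job.get("total"))

    asyncio.run(main())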
+
2934
+
2935
+ async def async_batch_scrape_urls(
2936
+ self,
2937
+ urls: List[str],
2938
+ *,
2939
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
2940
+ headers: Optional[Dict[str, str]] = None,
2941
+ include_tags: Optional[List[str]] = None,
2942
+ exclude_tags: Optional[List[str]] = None,
2943
+ only_main_content: Optional[bool] = None,
2944
+ wait_for: Optional[int] = None,
2945
+ timeout: Optional[int] = None,
2946
+ location: Optional[LocationConfig] = None,
2947
+ mobile: Optional[bool] = None,
2948
+ skip_tls_verification: Optional[bool] = None,
2949
+ remove_base64_images: Optional[bool] = None,
2950
+ block_ads: Optional[bool] = None,
2951
+ proxy: Optional[Literal["basic", "stealth"]] = None,
2952
+ extract: Optional[ExtractConfig] = None,
2953
+ json_options: Optional[ExtractConfig] = None,
2954
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2955
+ agent: Optional[AgentOptions] = None,
2956
+ idempotency_key: Optional[str] = None,
2957
+ **kwargs
2958
+ ) -> BatchScrapeResponse:
2959
+ """
2960
+ Initiate a batch scrape job asynchronously.
2961
+
2962
+ Args:
2963
+ urls (List[str]): URLs to scrape
2964
+ formats (Optional[List[Literal]]): Content formats to retrieve
2965
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
2966
+ include_tags (Optional[List[str]]): HTML tags to include
2967
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
2968
+ only_main_content (Optional[bool]): Extract main content only
2969
+ wait_for (Optional[int]): Wait time in milliseconds
2970
+ timeout (Optional[int]): Request timeout in milliseconds
2971
+ location (Optional[LocationConfig]): Location configuration
2972
+ mobile (Optional[bool]): Use mobile user agent
2973
+ skip_tls_verification (Optional[bool]): Skip TLS verification
2974
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
2975
+ block_ads (Optional[bool]): Block advertisements
2976
+ proxy (Optional[Literal]): Proxy type to use
2977
+ extract (Optional[ExtractConfig]): Content extraction config
2978
+ json_options (Optional[ExtractConfig]): JSON extraction config
2979
+ actions (Optional[List[Union]]): Actions to perform
2980
+ agent (Optional[AgentOptions]): Agent configuration
2981
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2982
+ **kwargs: Additional parameters to pass to the API
2983
+
2984
+ Returns:
2985
+ BatchScrapeResponse with:
2986
+ * success - Whether job started successfully
2987
+ * id - Unique identifier for the job
2988
+ * url - Status check URL
2989
+ * error - Error message if start failed
2990
+
2991
+ Raises:
2992
+ Exception: If job initiation fails
2993
+ """
2994
+ scrape_params = {}
2995
+
2996
+ # Add individual parameters
2997
+ if formats is not None:
2998
+ scrape_params['formats'] = formats
2999
+ if headers is not None:
3000
+ scrape_params['headers'] = headers
3001
+ if include_tags is not None:
3002
+ scrape_params['includeTags'] = include_tags
3003
+ if exclude_tags is not None:
3004
+ scrape_params['excludeTags'] = exclude_tags
3005
+ if only_main_content is not None:
3006
+ scrape_params['onlyMainContent'] = only_main_content
3007
+ if wait_for is not None:
3008
+ scrape_params['waitFor'] = wait_for
3009
+ if timeout is not None:
3010
+ scrape_params['timeout'] = timeout
3011
+ if location is not None:
3012
+ scrape_params['location'] = location.dict(exclude_none=True)
3013
+ if mobile is not None:
3014
+ scrape_params['mobile'] = mobile
3015
+ if skip_tls_verification is not None:
3016
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3017
+ if remove_base64_images is not None:
3018
+ scrape_params['removeBase64Images'] = remove_base64_images
3019
+ if block_ads is not None:
3020
+ scrape_params['blockAds'] = block_ads
3021
+ if proxy is not None:
3022
+ scrape_params['proxy'] = proxy
3023
+ if extract is not None:
3024
+ if hasattr(extract.schema, 'schema'):
3025
+ extract.schema = extract.schema.schema()
3026
+ scrape_params['extract'] = extract.dict(exclude_none=True)
3027
+ if json_options is not None:
3028
+ if hasattr(json_options.schema, 'schema'):
3029
+ json_options.schema = json_options.schema.schema()
3030
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
3031
+ if actions is not None:
3032
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3033
+ if agent is not None:
3034
+ scrape_params['agent'] = agent.dict(exclude_none=True)
3035
+
3036
+ # Add any additional kwargs
3037
+ scrape_params.update(kwargs)
3038
+
3039
+ # Create final params object
3040
+ final_params = ScrapeParams(**scrape_params)
3041
+ params_dict = final_params.dict(exclude_none=True)
3042
+ params_dict['urls'] = urls
3043
+ params_dict['origin'] = f"python-sdk@{version}"
3044
+
3045
+ # Make request
3046
+ headers = self._prepare_headers(idempotency_key)
3047
+ response = await self._async_post_request(
3048
+ f'{self.api_url}/v1/batch/scrape',
3049
+ params_dict,
3050
+ headers
3051
+ )
3052
+
3053
+ if response.get('success'):
+ try:
+ return BatchScrapeResponse(**response)
+ except Exception:
+ raise Exception('Failed to parse Firecrawl response.')
+ else:
+ raise Exception(f'Failed to start batch scrape job. Error: {response.get("error", response)}')
3060
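A sketch of the fire-and-poll workflow pairing async_batch_scrape_urls with check_batch_scrape_status, under the same import and api_key assumptions as the earlier examples.

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed import path

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
        started = await app.async_batch_scrape_urls(
            ["https://example.com", "https://example.org"],
            formats=["markdown"],
        )
        # BatchScrapeResponse carries the job id; poll it whenever convenient
        status = await app.check_batch_scrape_status(started.id)
        print(status["status"])

    asyncio.run(main())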
+
3061
+ async def crawl_url(
3062
+ self,
3063
+ url: str,
3064
+ *,
3065
+ include_paths: Optional[List[str]] = None,
3066
+ exclude_paths: Optional[List[str]] = None,
3067
+ max_depth: Optional[int] = None,
3068
+ max_discovery_depth: Optional[int] = None,
3069
+ limit: Optional[int] = None,
3070
+ allow_backward_links: Optional[bool] = None,
3071
+ allow_external_links: Optional[bool] = None,
3072
+ ignore_sitemap: Optional[bool] = None,
3073
+ scrape_options: Optional[CommonOptions] = None,
3074
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3075
+ deduplicate_similar_urls: Optional[bool] = None,
3076
+ ignore_query_parameters: Optional[bool] = None,
3077
+ regex_on_full_url: Optional[bool] = None,
3078
+ poll_interval: Optional[int] = 2,
3079
+ idempotency_key: Optional[str] = None,
3080
+ **kwargs
3081
+ ) -> CrawlStatusResponse:
3082
+ """
3083
+ Crawl a website starting from a URL.
3084
+
3085
+ Args:
3086
+ url (str): Target URL to start crawling from
3087
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3088
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3089
+ max_depth (Optional[int]): Maximum crawl depth
3090
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3091
+ limit (Optional[int]): Maximum pages to crawl
3092
+ allow_backward_links (Optional[bool]): Follow parent directory links
3093
+ allow_external_links (Optional[bool]): Follow external domain links
3094
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3095
+ scrape_options (Optional[CommonOptions]): Page scraping configuration
3096
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3097
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3098
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3099
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3100
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
3101
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3102
+ **kwargs: Additional parameters to pass to the API
3103
+
3104
+ Returns:
3105
+ CrawlStatusResponse with:
3106
+ * Crawling status and progress
3107
+ * Crawled page contents
3108
+ * Success/error information
3109
+
3110
+ Raises:
3111
+ Exception: If crawl fails
3112
+ """
3113
+ crawl_params = {}
3114
+
3115
+ # Add individual parameters
3116
+ if include_paths is not None:
3117
+ crawl_params['includePaths'] = include_paths
3118
+ if exclude_paths is not None:
3119
+ crawl_params['excludePaths'] = exclude_paths
3120
+ if max_depth is not None:
3121
+ crawl_params['maxDepth'] = max_depth
3122
+ if max_discovery_depth is not None:
3123
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3124
+ if limit is not None:
3125
+ crawl_params['limit'] = limit
3126
+ if allow_backward_links is not None:
3127
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3128
+ if allow_external_links is not None:
3129
+ crawl_params['allowExternalLinks'] = allow_external_links
3130
+ if ignore_sitemap is not None:
3131
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3132
+ if scrape_options is not None:
3133
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3134
+ if webhook is not None:
3135
+ crawl_params['webhook'] = webhook
3136
+ if deduplicate_similar_urls is not None:
3137
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3138
+ if ignore_query_parameters is not None:
3139
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3140
+ if regex_on_full_url is not None:
3141
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3142
+
3143
+ # Add any additional kwargs
3144
+ crawl_params.update(kwargs)
3145
+
3146
+ # Create final params object
3147
+ final_params = CrawlParams(**crawl_params)
3148
+ params_dict = final_params.dict(exclude_none=True)
3149
+ params_dict['url'] = url
3150
+ params_dict['origin'] = f"python-sdk@{version}"
3151
+
3152
+ # Make request
3153
+ headers = self._prepare_headers(idempotency_key)
3154
+ response = await self._async_post_request(
3155
+ f'{self.api_url}/v1/crawl', params_dict, headers)
3156
+
3157
+ if response.get('success') and 'id' in response:
+ return await self._async_monitor_job_status(response['id'], headers, poll_interval)
+ else:
+ raise Exception(f'Failed to start crawl job. Error: {response.get("error", response)}')
3165
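crawl_url blocks until the crawl finishes; a usage sketch follows, with the same import/api_key assumptions as above and the metadata.sourceURL field name taken from the Firecrawl API rather than this diff.

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed import path

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
        result = await app.crawl_url(
            "https://example.com",
            limit=10,
            max_depth=2,
            poll_interval=5,
        )
        # The monitor returns a dict whose 'data' list holds the crawled documents
        for page in result.get("data", []):
            print(page.get("metadata", {}).get("sourceURL"))

    asyncio.run(main())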
+
3166
+
3167
+ async def async_crawl_url(
3168
+ self,
3169
+ url: str,
3170
+ *,
3171
+ include_paths: Optional[List[str]] = None,
3172
+ exclude_paths: Optional[List[str]] = None,
3173
+ max_depth: Optional[int] = None,
3174
+ max_discovery_depth: Optional[int] = None,
3175
+ limit: Optional[int] = None,
3176
+ allow_backward_links: Optional[bool] = None,
3177
+ allow_external_links: Optional[bool] = None,
3178
+ ignore_sitemap: Optional[bool] = None,
3179
+ scrape_options: Optional[CommonOptions] = None,
3180
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3181
+ deduplicate_similar_urls: Optional[bool] = None,
3182
+ ignore_query_parameters: Optional[bool] = None,
3183
+ regex_on_full_url: Optional[bool] = None,
3184
+ idempotency_key: Optional[str] = None,
3185
+ **kwargs
3186
+ ) -> CrawlResponse:
3187
+ """
3188
+ Start an asynchronous crawl job.
3189
+
3190
+ Args:
3191
+ url (str): Target URL to start crawling from
3192
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3193
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3194
+ max_depth (Optional[int]): Maximum crawl depth
3195
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3196
+ limit (Optional[int]): Maximum pages to crawl
3197
+ allow_backward_links (Optional[bool]): Follow parent directory links
3198
+ allow_external_links (Optional[bool]): Follow external domain links
3199
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3200
+ scrape_options (Optional[CommonOptions]): Page scraping configuration
3201
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3202
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3203
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3204
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3205
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3206
+ **kwargs: Additional parameters to pass to the API
3207
+
3208
+ Returns:
3209
+ CrawlResponse with:
3210
+ * success - Whether crawl started successfully
3211
+ * id - Unique identifier for the crawl job
3212
+ * url - Status check URL for the crawl
3213
+ * error - Error message if start failed
3214
+
3215
+ Raises:
3216
+ Exception: If crawl initiation fails
3217
+ """
3218
+ crawl_params = {}
3219
+
3220
+ # Add individual parameters
3221
+ if include_paths is not None:
3222
+ crawl_params['includePaths'] = include_paths
3223
+ if exclude_paths is not None:
3224
+ crawl_params['excludePaths'] = exclude_paths
3225
+ if max_depth is not None:
3226
+ crawl_params['maxDepth'] = max_depth
3227
+ if max_discovery_depth is not None:
3228
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3229
+ if limit is not None:
3230
+ crawl_params['limit'] = limit
3231
+ if allow_backward_links is not None:
3232
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3233
+ if allow_external_links is not None:
3234
+ crawl_params['allowExternalLinks'] = allow_external_links
3235
+ if ignore_sitemap is not None:
3236
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3237
+ if scrape_options is not None:
3238
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3239
+ if webhook is not None:
3240
+ crawl_params['webhook'] = webhook
3241
+ if deduplicate_similar_urls is not None:
3242
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3243
+ if ignore_query_parameters is not None:
3244
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3245
+ if regex_on_full_url is not None:
3246
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3247
+
3248
+ # Add any additional kwargs
3249
+ crawl_params.update(kwargs)
3250
+
3251
+ # Create final params object
3252
+ final_params = CrawlParams(**crawl_params)
3253
+ params_dict = final_params.dict(exclude_none=True)
3254
+ params_dict['url'] = url
3255
+ params_dict['origin'] = f"python-sdk@{version}"
3256
+
3257
+ # Make request
3258
+ headers = self._prepare_headers(idempotency_key)
3259
+ response = await self._async_post_request(
3260
+ f'{self.api_url}/v1/crawl',
3261
+ params_dict,
3262
+ headers
3263
+ )
3264
+
3265
+ if response.get('success'):
+ try:
+ return CrawlResponse(**response)
+ except Exception:
+ raise Exception('Failed to parse Firecrawl response.')
+ else:
+ raise Exception(f'Failed to start crawl job. Error: {response.get("error", response)}')
3272
+
3273
+ async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
3274
+ """
3275
+ Check the status and results of an asynchronous crawl job.
3276
+
3277
+ Args:
3278
+ id (str): Unique identifier for the crawl job
3279
+
3280
+ Returns:
3281
+ CrawlStatusResponse containing:
3282
+ Status Information:
3283
+ * status - Current state (scraping/completed/failed/cancelled)
3284
+ * completed - Number of pages crawled
3285
+ * total - Total pages to crawl
3286
+ * creditsUsed - API credits consumed
3287
+ * expiresAt - Data expiration timestamp
3288
+
3289
+ Results:
3290
+ * data - List of crawled documents
3291
+ * next - URL for next page of results (if paginated)
3292
+ * success - Whether status check succeeded
3293
+ * error - Error message if failed
3294
+
3295
+ Raises:
3296
+ Exception: If status check fails
3297
+ """
3298
+ headers = self._prepare_headers()
3299
+ endpoint = f'/v1/crawl/{id}'
3300
+
3301
+ status_data = await self._async_get_request(
3302
+ f'{self.api_url}{endpoint}',
3303
+ headers
3304
+ )
3305
+
3306
+ if status_data['status'] == 'completed':
3307
+ if 'data' in status_data:
3308
+ data = status_data['data']
3309
+ while 'next' in status_data:
3310
+ if len(status_data['data']) == 0:
3311
+ break
3312
+ next_url = status_data.get('next')
3313
+ if not next_url:
3314
+ logger.warning("Expected 'next' URL is missing.")
3315
+ break
3316
+ next_data = await self._async_get_request(next_url, headers)
3317
+ data.extend(next_data.get('data', []))
3318
+ status_data = next_data
3319
+ status_data['data'] = data
3320
+
3321
+ response = {
3322
+ 'status': status_data.get('status'),
3323
+ 'total': status_data.get('total'),
3324
+ 'completed': status_data.get('completed'),
3325
+ 'creditsUsed': status_data.get('creditsUsed'),
3326
+ 'expiresAt': status_data.get('expiresAt'),
3327
+ 'data': status_data.get('data')
3328
+ }
3329
+
3330
+ if 'error' in status_data:
3331
+ response['error'] = status_data['error']
3332
+
3333
+ if 'next' in status_data:
3334
+ response['next'] = status_data['next']
3335
+
3336
+ return {
3337
+ 'success': False if 'error' in status_data else True,
3338
+ **response
3339
+ }
3340
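Pairing async_crawl_url with check_crawl_status allows manual polling instead of blocking on crawl_url; a sketch under the usual import/api_key assumptions.

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed import path

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
        started = await app.async_crawl_url("https://example.com", limit=5)
        while True:
            status = await app.check_crawl_status(started.id)
            if status["status"] in ("completed", "failed", "cancelled"):
                break
            await asyncio.sleep(5)  # simple manual poll loop
        print(status["status"], len(status.get("data") or []))

    asyncio.run(main())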
+
3341
+ async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
3342
+ """
3343
+ Monitor the status of an asynchronous job until completion.
3344
+
3345
+ Args:
3346
+ id (str): The ID of the job to monitor
3347
+ headers (Dict[str, str]): Headers to include in status check requests
3348
+ poll_interval (int): Seconds between status checks (default: 2)
3349
+
3350
+ Returns:
3351
+ CrawlStatusResponse: The job results if completed successfully
3352
+
3353
+ Raises:
3354
+ Exception: If the job fails or an error occurs during status checks
3355
+ """
3356
+ while True:
3357
+ status_data = await self._async_get_request(
3358
+ f'{self.api_url}/v1/crawl/{id}',
3359
+ headers
3360
+ )
3361
+
3362
+ if status_data['status'] == 'completed':
3363
+ if 'data' in status_data:
3364
+ data = status_data['data']
3365
+ while 'next' in status_data:
3366
+ if len(status_data['data']) == 0:
3367
+ break
3368
+ next_url = status_data.get('next')
3369
+ if not next_url:
3370
+ logger.warning("Expected 'next' URL is missing.")
3371
+ break
3372
+ next_data = await self._async_get_request(next_url, headers)
3373
+ data.extend(next_data.get('data', []))
3374
+ status_data = next_data
3375
+ status_data['data'] = data
3376
+ return status_data
3377
+ else:
3378
+ raise Exception('Job completed but no data was returned')
3379
+ elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
3380
+ await asyncio.sleep(max(poll_interval, 2))
3381
+ else:
3382
+ raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
3383
+
3384
+ async def map_url(
3385
+ self,
3386
+ url: str,
3387
+ params: Optional[MapParams] = None) -> MapResponse:
3388
+ """
3389
+ Asynchronously map and discover links from a URL.
3390
+
3391
+ Args:
3392
+ url (str): Target URL to map
3393
+ params (Optional[MapParams]): See MapParams model:
3394
+ Discovery Options:
3395
+ * search - Filter pattern for URLs
3396
+ * ignoreSitemap - Skip sitemap.xml
3397
+ * includeSubdomains - Include subdomain links
3398
+ * sitemapOnly - Only use sitemap.xml
3399
+
3400
+ Limits:
3401
+ * limit - Max URLs to return
3402
+ * timeout - Request timeout (ms)
3403
+
3404
+ Returns:
3405
+ MapResponse with:
3406
+ * Discovered URLs
3407
+ * Success/error status
3408
+
3409
+ Raises:
3410
+ Exception: If mapping fails
3411
+ """
3412
+ headers = self._prepare_headers()
3413
+ json_data = {'url': url}
3414
+ if params:
3415
+ json_data.update(params)
3416
+ json_data['origin'] = f"python-sdk@{version}"
3417
+
3418
+ endpoint = f'/v1/map'
3419
+ response = await self._async_post_request(
3420
+ f'{self.api_url}{endpoint}',
3421
+ json_data,
3422
+ headers
3423
+ )
3424
+
3425
+ if response.get('success') and 'links' in response:
3426
+ return response
3427
+ elif 'error' in response:
3428
+ raise Exception(f'Failed to map URL. Error: {response["error"]}')
3429
+ else:
3430
+ raise Exception(f'Failed to map URL. Error: {response}')
3431
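A short map_url sketch; the params dict mirrors the MapParams fields documented above, and the same import/api_key assumptions apply.

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed import path

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
        result = await app.map_url("https://example.com", params={"limit": 50})
        # The response dict is returned as-is when it contains 'links'
        for link in result.get("links", []):
            print(link)

    asyncio.run(main())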
+
3432
+ async def extract(
3433
+ self,
3434
+ urls: List[str],
3435
+ params: Optional[ExtractParams] = None) -> ExtractResponse[Any]:
3436
+ """
3437
+ Asynchronously extract structured information from URLs.
3438
+
3439
+ Args:
3440
+ urls (List[str]): URLs to extract from
3441
+ params (Optional[ExtractParams]): See ExtractParams model:
3442
+ Extraction Config:
3443
+ * prompt - Custom extraction prompt
3444
+ * schema - JSON schema/Pydantic model
3445
+ * systemPrompt - System context
3446
+
3447
+ Behavior Options:
3448
+ * allowExternalLinks - Follow external links
3449
+ * enableWebSearch - Enable web search
3450
+ * includeSubdomains - Include subdomains
3451
+ * showSources - Include source URLs
3452
+
3453
+ Scraping Options:
3454
+ * scrapeOptions - Page scraping config
3455
+
3456
+ Returns:
3457
+ ExtractResponse with:
3458
+ * Structured data matching schema
3459
+ * Source information if requested
3460
+ * Success/error status
3461
+
3462
+ Raises:
3463
+ ValueError: If prompt/schema missing or extraction fails
3464
+ """
3465
+ headers = self._prepare_headers()
3466
+
3467
+ if not params or (not params.get('prompt') and not params.get('schema')):
3468
+ raise ValueError("Either prompt or schema is required")
3469
+
3470
+ schema = params.get('schema')
3471
+ if schema:
3472
+ if hasattr(schema, 'model_json_schema'):
3473
+ schema = schema.model_json_schema()
3474
+
3475
+ request_data = {
3476
+ 'urls': urls,
3477
+ 'allowExternalLinks': params.get('allow_external_links', params.get('allowExternalLinks', False)),
3478
+ 'enableWebSearch': params.get('enable_web_search', params.get('enableWebSearch', False)),
3479
+ 'showSources': params.get('show_sources', params.get('showSources', False)),
3480
+ 'schema': schema,
3481
+ 'origin': f'python-sdk@{version}'
3482
+ }
3483
+
3484
+ if params.get('prompt'):
3485
+ request_data['prompt'] = params['prompt']
3486
+ if params.get('system_prompt'):
3487
+ request_data['systemPrompt'] = params['system_prompt']
3488
+ elif params.get('systemPrompt'):
3489
+ request_data['systemPrompt'] = params['systemPrompt']
3490
+
3491
+ response = await self._async_post_request(
3492
+ f'{self.api_url}/v1/extract',
3493
+ request_data,
3494
+ headers
3495
+ )
3496
+
3497
+ if response.get('success'):
3498
+ job_id = response.get('id')
3499
+ if not job_id:
3500
+ raise Exception('Job ID not returned from extract request.')
3501
+
3502
+ while True:
3503
+ status_data = await self._async_get_request(
3504
+ f'{self.api_url}/v1/extract/{job_id}',
3505
+ headers
3506
+ )
3507
+
3508
+ if status_data['status'] == 'completed':
3509
+ return status_data
3510
+ elif status_data['status'] in ['failed', 'cancelled']:
3511
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data.get("error")}')
3512
+
3513
+ await asyncio.sleep(2)
3514
+ else:
3515
+ raise Exception(f'Failed to extract. Error: {response.get("error")}')
3516
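An extract sketch using a plain JSON schema (the docstring also allows a Pydantic model); note the implementation above reads params with .get(), so a dict is passed here. Import path and api_key are assumptions.

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed import path

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
        result = await app.extract(
            ["https://example.com"],
            params={
                "prompt": "Summarize this page",
                "schema": {  # plain JSON schema; a Pydantic model is also accepted
                    "type": "object",
                    "properties": {"title": {"type": "string"}, "summary": {"type": "string"}},
                },
            },
        )
        print(result.get("data"))

    asyncio.run(main())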
+
3517
+ async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
3518
+ """
3519
+ Check the status of an asynchronous batch scrape job.
3520
+
3521
+ Args:
3522
+ id (str): The ID of the batch scrape job
3523
+
3524
+ Returns:
3525
+ BatchScrapeStatusResponse containing:
3526
+ Status Information:
3527
+ * status - Current state (scraping/completed/failed/cancelled)
3528
+ * completed - Number of URLs scraped
3529
+ * total - Total URLs to scrape
3530
+ * creditsUsed - API credits consumed
3531
+ * expiresAt - Data expiration timestamp
3532
+
3533
+ Results:
3534
+ * data - List of scraped documents
3535
+ * next - URL for next page of results (if paginated)
3536
+ * success - Whether status check succeeded
3537
+ * error - Error message if failed
3538
+
3539
+ Raises:
3540
+ Exception: If status check fails
3541
+ """
3542
+ headers = self._prepare_headers()
3543
+ endpoint = f'/v1/batch/scrape/{id}'
3544
+
3545
+ status_data = await self._async_get_request(
3546
+ f'{self.api_url}{endpoint}',
3547
+ headers
3548
+ )
3549
+
3550
+ if status_data['status'] == 'completed':
3551
+ if 'data' in status_data:
3552
+ data = status_data['data']
3553
+ while 'next' in status_data:
3554
+ if len(status_data['data']) == 0:
3555
+ break
3556
+ next_url = status_data.get('next')
3557
+ if not next_url:
3558
+ logger.warning("Expected 'next' URL is missing.")
3559
+ break
3560
+ next_data = await self._async_get_request(next_url, headers)
3561
+ data.extend(next_data.get('data', []))
3562
+ status_data = next_data
3563
+ status_data['data'] = data
3564
+
3565
+ response = {
3566
+ 'status': status_data.get('status'),
3567
+ 'total': status_data.get('total'),
3568
+ 'completed': status_data.get('completed'),
3569
+ 'creditsUsed': status_data.get('creditsUsed'),
3570
+ 'expiresAt': status_data.get('expiresAt'),
3571
+ 'data': status_data.get('data')
3572
+ }
3573
+
3574
+ if 'error' in status_data:
3575
+ response['error'] = status_data['error']
3576
+
3577
+ if 'next' in status_data:
3578
+ response['next'] = status_data['next']
3579
+
3580
+ return {
3581
+ 'success': False if 'error' in status_data else True,
3582
+ **response
3583
+ }
3584
+
3585
+ async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
3586
+ """
3587
+ Get information about errors from an asynchronous batch scrape job.
3588
+
3589
+ Args:
3590
+ id (str): The ID of the batch scrape job
3591
+
3592
+ Returns:
3593
+ CrawlErrorsResponse containing:
3594
+ errors (List[Dict[str, str]]): List of errors with fields:
3595
+ * id (str): Error ID
3596
+ * timestamp (str): When the error occurred
3597
+ * url (str): URL that caused the error
3598
+ * error (str): Error message
3599
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3600
+
3601
+ Raises:
3602
+ Exception: If error check fails
3603
+ """
3604
+ headers = self._prepare_headers()
3605
+ return await self._async_get_request(
3606
+ f'{self.api_url}/v1/batch/scrape/{id}/errors',
3607
+ headers
3608
+ )
3609
+
3610
+ async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
3611
+ """
3612
+ Get information about errors from an asynchronous crawl job.
3613
+
3614
+ Args:
3615
+ id (str): The ID of the crawl job
3616
+
3617
+ Returns:
3618
+ CrawlErrorsResponse containing:
3619
+ * errors (List[Dict[str, str]]): List of errors with fields:
3620
+ - id (str): Error ID
3621
+ - timestamp (str): When the error occurred
3622
+ - url (str): URL that caused the error
3623
+ - error (str): Error message
3624
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3625
+
3626
+ Raises:
3627
+ Exception: If error check fails
3628
+ """
3629
+ headers = self._prepare_headers()
3630
+ return await self._async_get_request(
3631
+ f'{self.api_url}/v1/crawl/{id}/errors',
3632
+ headers
3633
+ )
3634
+
3635
+ async def cancel_crawl(self, id: str) -> Dict[str, Any]:
3636
+ """
3637
+ Cancel an asynchronous crawl job.
3638
+
3639
+ Args:
3640
+ id (str): The ID of the crawl job to cancel
3641
+
3642
+ Returns:
3643
+ Dict[str, Any] containing:
3644
+ * success (bool): Whether cancellation was successful
3645
+ * error (str, optional): Error message if cancellation failed
3646
+
3647
+ Raises:
3648
+ Exception: If cancellation fails
3649
+ """
3650
+ headers = self._prepare_headers()
3651
+ async with aiohttp.ClientSession() as session:
3652
+ async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
3653
+ return await response.json()
3654
+
3655
+ async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
3656
+ """
3657
+ Check the status of an asynchronous extraction job.
3658
+
3659
+ Args:
3660
+ job_id (str): The ID of the extraction job
3661
+
3662
+ Returns:
3663
+ ExtractResponse[Any] with:
3664
+ * success (bool): Whether request succeeded
3665
+ * data (Optional[Any]): Extracted data matching schema
3666
+ * error (Optional[str]): Error message if any
3667
+ * warning (Optional[str]): Warning message if any
3668
+ * sources (Optional[List[str]]): Source URLs if requested
3669
+
3670
+ Raises:
3671
+ ValueError: If status check fails
3672
+ """
3673
+ headers = self._prepare_headers()
3674
+ try:
3675
+ return await self._async_get_request(
3676
+ f'{self.api_url}/v1/extract/{job_id}',
3677
+ headers
3678
+ )
3679
+ except Exception as e:
3680
+ raise ValueError(str(e))
3681
+
3682
+ async def async_extract(
3683
+ self,
3684
+ urls: Optional[List[str]] = None,
3685
+ *,
3686
+ prompt: Optional[str] = None,
3687
+ schema: Optional[Any] = None,
3688
+ system_prompt: Optional[str] = None,
3689
+ allow_external_links: Optional[bool] = False,
3690
+ enable_web_search: Optional[bool] = False,
3691
+ show_sources: Optional[bool] = False,
3692
+ agent: Optional[Dict[str, Any]] = None,
3693
+ idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
3694
+ """
3695
+ Initiate an asynchronous extraction job without waiting for completion.
3696
+
3697
+ Args:
3698
+ urls (Optional[List[str]]): URLs to extract from
3699
+ prompt (Optional[str]): Custom extraction prompt
3700
+ schema (Optional[Any]): JSON schema/Pydantic model
3701
+ system_prompt (Optional[str]): System context
3702
+ allow_external_links (Optional[bool]): Follow external links
3703
+ enable_web_search (Optional[bool]): Enable web search
3704
+ show_sources (Optional[bool]): Include source URLs
3705
+ agent (Optional[Dict[str, Any]]): Agent configuration
3706
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3707
+
3708
+ Returns:
3709
+ ExtractResponse[Any] with:
3710
+ * success (bool): Whether request succeeded
3711
+ * data (Optional[Any]): Extracted data matching schema
3712
+ * error (Optional[str]): Error message if any
3713
+
3714
+ Raises:
3715
+ ValueError: If job initiation fails
3716
+ """
3717
+ headers = self._prepare_headers(idempotency_key)
3718
+
3719
+ if not prompt and not schema:
3720
+ raise ValueError("Either prompt or schema is required")
3721
+
3722
+ if not urls and not prompt:
3723
+ raise ValueError("Either urls or prompt is required")
3724
+
3725
+ if schema:
3726
+ if hasattr(schema, 'model_json_schema'):
3727
+ schema = schema.model_json_schema()
3728
+
3729
+ request_data = {
3730
+ 'urls': urls or [],
3731
+ 'allowExternalLinks': allow_external_links,
3732
+ 'enableWebSearch': enable_web_search,
3733
+ 'showSources': show_sources,
3734
+ 'schema': schema,
3735
+ 'origin': f'python-sdk@{version}'
3736
+ }
3737
+
3738
+ if prompt:
3739
+ request_data['prompt'] = prompt
3740
+ if system_prompt:
3741
+ request_data['systemPrompt'] = system_prompt
3742
+ if agent:
3743
+ request_data['agent'] = agent
3744
+
3745
+ try:
3746
+ return await self._async_post_request(
3747
+ f'{self.api_url}/v1/extract',
3748
+ request_data,
3749
+ headers
3750
+ )
3751
+ except Exception as e:
3752
+ raise ValueError(str(e))
3753
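async_extract pairs with get_extract_status for manual polling; a sketch under the usual assumptions, treating both responses as the parsed JSON dicts returned by the helpers above.

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed import path

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
        started = await app.async_extract(
            ["https://example.com"],
            prompt="List the product names mentioned on this page",
        )
        job_id = started.get("id")  # response is the parsed JSON from /v1/extract
        status = await app.get_extract_status(job_id)
        print(status.get("status"), status.get("data"))

    asyncio.run(main())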
+
3754
+ async def generate_llms_text(
3755
+ self,
3756
+ url: str,
3757
+ *,
3758
+ max_urls: Optional[int] = None,
3759
+ show_full_text: Optional[bool] = None,
3760
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
3761
+ """
3762
+ Generate LLMs.txt for a given URL and monitor until completion.
3763
+
3764
+ Args:
3765
+ url (str): Target URL to generate LLMs.txt from
3766
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
3767
+ show_full_text (Optional[bool]): Include full text in output (default: False)
3768
+ experimental_stream (Optional[bool]): Enable experimental streaming
3769
+
3770
+ Returns:
3771
+ GenerateLLMsTextStatusResponse containing:
3772
+ * success (bool): Whether generation completed successfully
3773
+ * status (str): Status of generation (processing/completed/failed)
3774
+ * data (Dict[str, str], optional): Generated text with fields:
3775
+ - llmstxt (str): Generated LLMs.txt content
3776
+ - llmsfulltxt (str, optional): Full version if requested
3777
+ * error (str, optional): Error message if generation failed
3778
+ * expiresAt (str): When the generated data expires
3779
+
3780
+ Raises:
3781
+ Exception: If generation fails
3782
+ """
3783
+ params = {}
3784
+ if max_urls is not None:
3785
+ params['maxUrls'] = max_urls
3786
+ if show_full_text is not None:
3787
+ params['showFullText'] = show_full_text
3788
+ if experimental_stream is not None:
3789
+ params['__experimental_stream'] = experimental_stream
3790
+
3791
+ response = await self.async_generate_llms_text(
3792
+ url,
3793
+ max_urls=max_urls,
3794
+ show_full_text=show_full_text,
3795
+ experimental_stream=experimental_stream
3796
+ )
3797
+ if not response.get('success') or 'id' not in response:
3798
+ return response
3799
+
3800
+ job_id = response['id']
3801
+ while True:
3802
+ status = await self.check_generate_llms_text_status(job_id)
3803
+
3804
+ if status['status'] == 'completed':
3805
+ return status
3806
+ elif status['status'] == 'failed':
3807
+ raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
3808
+ elif status['status'] != 'processing':
3809
+ break
3810
+
3811
+ await asyncio.sleep(2)
3812
+
3813
+ return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'}
3814
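generate_llms_text polls to completion internally; a sketch under the usual import/api_key assumptions, with the llmstxt field name taken from the docstring above.

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed import path

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
        result = await app.generate_llms_text("https://example.com", max_urls=5)
        if result.get("success"):
            print(result["data"]["llmstxt"])

    asyncio.run(main())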
+
3815
+ async def async_generate_llms_text(
3816
+ self,
3817
+ url: str,
3818
+ *,
3819
+ max_urls: Optional[int] = None,
3820
+ show_full_text: Optional[bool] = None,
3821
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
3822
+ """
3823
+ Initiate an asynchronous LLMs.txt generation job without waiting for completion.
3824
+
3825
+ Args:
3826
+ url (str): Target URL to generate LLMs.txt from
3827
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
3828
+ show_full_text (Optional[bool]): Include full text in output (default: False)
3829
+ experimental_stream (Optional[bool]): Enable experimental streaming
3830
+
3831
+ Returns:
3832
+ GenerateLLMsTextResponse containing:
3833
+ * success (bool): Whether job started successfully
3834
+ * id (str): Unique identifier for the job
3835
+ * error (str, optional): Error message if start failed
3836
+
3837
+ Raises:
3838
+ ValueError: If job initiation fails
3839
+ """
3840
+ params = {}
3841
+ if max_urls is not None:
3842
+ params['maxUrls'] = max_urls
3843
+ if show_full_text is not None:
3844
+ params['showFullText'] = show_full_text
3845
+ if experimental_stream is not None:
3846
+ params['__experimental_stream'] = experimental_stream
3847
+
3848
+ headers = self._prepare_headers()
3849
+ json_data = {'url': url, **params}
3850
+ json_data['origin'] = f"python-sdk@{version}"
3851
+
3852
+ try:
3853
+ return await self._async_post_request(
3854
+ f'{self.api_url}/v1/llmstxt',
3855
+ json_data,
3856
+ headers
3857
+ )
3858
+ except Exception as e:
3859
+ raise ValueError(str(e))
3860
+
3861
+ async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
3862
+ """
3863
+ Check the status of an asynchronous LLMs.txt generation job.
3864
+
3865
+ Args:
3866
+ id (str): The ID of the generation job
3867
+
3868
+ Returns:
3869
+ GenerateLLMsTextStatusResponse containing:
3870
+ * success (bool): Whether generation completed successfully
3871
+ * status (str): Status of generation (processing/completed/failed)
3872
+ * data (Dict[str, str], optional): Generated text with fields:
3873
+ - llmstxt (str): Generated LLMs.txt content
3874
+ - llmsfulltxt (str, optional): Full version if requested
3875
+ * error (str, optional): Error message if generation failed
3876
+ * expiresAt (str): When the generated data expires
3877
+
3878
+ Raises:
3879
+ ValueError: If status check fails
3880
+ """
3881
+ headers = self._prepare_headers()
3882
+ try:
3883
+ return await self._async_get_request(
3884
+ f'{self.api_url}/v1/llmstxt/{id}',
3885
+ headers
3886
+ )
3887
+ except Exception as e:
3888
+ raise ValueError(str(e))
3889
+
3890
+ async def deep_research(
3891
+ self,
3892
+ query: str,
3893
+ *,
3894
+ max_depth: Optional[int] = None,
3895
+ time_limit: Optional[int] = None,
3896
+ max_urls: Optional[int] = None,
3897
+ analysis_prompt: Optional[str] = None,
3898
+ system_prompt: Optional[str] = None,
3899
+ __experimental_stream_steps: Optional[bool] = None,
3900
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
3901
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
3902
+ """
3903
+ Initiates a deep research operation on a given query and polls until completion.
3904
+
3905
+ Args:
3906
+ query (str): Research query or topic to investigate
3907
+ max_depth (Optional[int]): Maximum depth of research exploration
3908
+ time_limit (Optional[int]): Time limit in seconds for research
3909
+ max_urls (Optional[int]): Maximum number of URLs to process
3910
+ analysis_prompt (Optional[str]): Custom prompt for analysis
3911
+ system_prompt (Optional[str]): Custom system prompt
3912
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
3913
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
3914
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
3915
+
3916
+ Returns:
3917
+ DeepResearchStatusResponse containing:
3918
+ * success (bool): Whether research completed successfully
3919
+ * status (str): Current state (processing/completed/failed)
3920
+ * error (Optional[str]): Error message if failed
3921
+ * id (str): Unique identifier for the research job
3922
+ * data (Any): Research findings and analysis
3923
+ * sources (List[Dict]): List of discovered sources
3924
+ * activities (List[Dict]): Research progress log
3925
+ * summaries (List[str]): Generated research summaries
3926
+
3927
+ Raises:
3928
+ Exception: If research fails
3929
+ """
3930
+ research_params = {}
3931
+ if max_depth is not None:
3932
+ research_params['maxDepth'] = max_depth
3933
+ if time_limit is not None:
3934
+ research_params['timeLimit'] = time_limit
3935
+ if max_urls is not None:
3936
+ research_params['maxUrls'] = max_urls
3937
+ if analysis_prompt is not None:
3938
+ research_params['analysisPrompt'] = analysis_prompt
3939
+ if system_prompt is not None:
3940
+ research_params['systemPrompt'] = system_prompt
3941
+ if __experimental_stream_steps is not None:
3942
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
3943
+ research_params = DeepResearchParams(**research_params)
3944
+
3945
+ response = await self.async_deep_research(
3946
+ query,
3947
+ max_depth=max_depth,
3948
+ time_limit=time_limit,
3949
+ max_urls=max_urls,
3950
+ analysis_prompt=analysis_prompt,
3951
+ system_prompt=system_prompt
3952
+ )
3953
+ if not response.get('success') or 'id' not in response:
3954
+ return response
3955
+
3956
+ job_id = response['id']
3957
+ last_activity_count = 0
3958
+ last_source_count = 0
3959
+
3960
+ while True:
3961
+ status = await self.check_deep_research_status(job_id)
3962
+
3963
+ if on_activity and 'activities' in status:
3964
+ new_activities = status['activities'][last_activity_count:]
3965
+ for activity in new_activities:
3966
+ on_activity(activity)
3967
+ last_activity_count = len(status['activities'])
3968
+
3969
+ if on_source and 'sources' in status:
3970
+ new_sources = status['sources'][last_source_count:]
3971
+ for source in new_sources:
3972
+ on_source(source)
3973
+ last_source_count = len(status['sources'])
3974
+
3975
+ if status['status'] == 'completed':
3976
+ return status
3977
+ elif status['status'] == 'failed':
3978
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
3979
+ elif status['status'] != 'processing':
3980
+ break
3981
+
3982
+ await asyncio.sleep(2)
3983
+
3984
+ return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
3985
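deep_research accepts progress callbacks; a sketch under the usual assumptions, with the callback payload keys taken from the docstring above.

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed import path

    def on_activity(activity):
        print(f"[{activity.get('type')}] {activity.get('message')}")

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
        result = await app.deep_research(
            "What are the latest trends in web scraping?",
            max_depth=3,
            time_limit=120,  # seconds
            max_urls=10,
            on_activity=on_activity,
        )
        if result.get("success"):
            print(result["data"])

    asyncio.run(main())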
+
3986
+ async def async_deep_research(
3987
+ self,
3988
+ query: str,
3989
+ *,
3990
+ max_depth: Optional[int] = None,
3991
+ time_limit: Optional[int] = None,
3992
+ max_urls: Optional[int] = None,
3993
+ analysis_prompt: Optional[str] = None,
3994
+ system_prompt: Optional[str] = None,
3995
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
3996
+ """
3997
+ Initiates an asynchronous deep research operation.
3998
+
3999
+ Args:
4000
+ query (str): Research query or topic to investigate
4001
+ max_depth (Optional[int]): Maximum depth of research exploration
4002
+ time_limit (Optional[int]): Time limit in seconds for research
4003
+ max_urls (Optional[int]): Maximum number of URLs to process
4004
+ analysis_prompt (Optional[str]): Custom prompt for analysis
4005
+ system_prompt (Optional[str]): Custom system prompt
4006
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4007
+
4008
+ Returns:
4009
+ Dict[str, Any]: A response containing:
4010
+ * success (bool): Whether the research initiation was successful
4011
+ * id (str): The unique identifier for the research job
4012
+ * error (str, optional): Error message if initiation failed
4013
+
4014
+ Raises:
4015
+ Exception: If the research initiation fails.
4016
+ """
4017
+ research_params = {}
4018
+ if max_depth is not None:
4019
+ research_params['maxDepth'] = max_depth
4020
+ if time_limit is not None:
4021
+ research_params['timeLimit'] = time_limit
4022
+ if max_urls is not None:
4023
+ research_params['maxUrls'] = max_urls
4024
+ if analysis_prompt is not None:
4025
+ research_params['analysisPrompt'] = analysis_prompt
4026
+ if system_prompt is not None:
4027
+ research_params['systemPrompt'] = system_prompt
4028
+ if __experimental_stream_steps is not None:
4029
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
4030
+ research_params = DeepResearchParams(**research_params)
4031
+
4032
+ headers = self._prepare_headers()
4033
+
4034
+ json_data = {'query': query, **research_params.dict(exclude_none=True)}
4035
+ json_data['origin'] = f"python-sdk@{version}"
4036
+
4037
+ try:
4038
+ return await self._async_post_request(
4039
+ f'{self.api_url}/v1/deep-research',
4040
+ json_data,
4041
+ headers
4042
+ )
4043
+ except Exception as e:
4044
+ raise ValueError(str(e))
4045
+
4046
+ async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
4047
+ """
4048
+ Check the status of a deep research operation.
4049
+
4050
+ Args:
4051
+ id (str): The ID of the deep research operation.
4052
+
4053
+ Returns:
4054
+ DeepResearchStatusResponse containing:
4055
+
4056
+ Status:
4057
+ * success - Whether research completed successfully
4058
+ * status - Current state (processing/completed/failed)
4059
+ * error - Error message if failed
4060
+
4061
+ Results:
4062
+ * id - Unique identifier for the research job
4063
+ * data - Research findings and analysis
4064
+ * sources - List of discovered sources
4065
+ * activities - Research progress log
4066
+ * summaries - Generated research summaries
4067
+
4068
+ Raises:
4069
+ Exception: If the status check fails.
4070
+ """
4071
+ headers = self._prepare_headers()
4072
+ try:
4073
+ return await self._async_get_request(
4074
+ f'{self.api_url}/v1/deep-research/{id}',
4075
+ headers
4076
+ )
4077
+ except Exception as e:
4078
+ raise ValueError(str(e))
4079
+
4080
+ async def search(
+ self,
+ query: str,
+ *,
+ limit: Optional[int] = None,
+ tbs: Optional[str] = None,
+ filter: Optional[str] = None,
+ lang: Optional[str] = None,
+ country: Optional[str] = None,
+ location: Optional[str] = None,
+ timeout: Optional[int] = None,
+ scrape_options: Optional[CommonOptions] = None,
+ params: Optional[Union[Dict[str, Any], SearchParams]] = None,
+ **kwargs) -> SearchResponse:
+ """
+ Asynchronously search for content using Firecrawl.
+
+ Args:
+ query (str): Search query string
+ limit (Optional[int]): Max results (default: 5)
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
+ filter (Optional[str]): Custom result filter
+ lang (Optional[str]): Language code (default: "en")
+ country (Optional[str]): Country code (default: "us")
+ location (Optional[str]): Location string for geo-targeted results
+ timeout (Optional[int]): Request timeout in milliseconds
+ scrape_options (Optional[CommonOptions]): Result scraping configuration
+ params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
+ **kwargs: Additional keyword arguments for future compatibility
+
+ Returns:
+ SearchResponse: Response containing:
+ * success (bool): Whether request succeeded
+ * data (List[FirecrawlDocument]): Search results
+ * warning (Optional[str]): Warning message if any
+ * error (Optional[str]): Error message if any
+
+ Raises:
+ Exception: If search fails or response cannot be parsed
+ """
+ # Build search parameters
+ search_params = {}
+ if params:
+ if isinstance(params, dict):
+ search_params.update(params)
+ else:
+ search_params.update(params.dict(exclude_none=True))
+
+ # Add individual parameters
+ if limit is not None:
+ search_params['limit'] = limit
+ if tbs is not None:
+ search_params['tbs'] = tbs
+ if filter is not None:
+ search_params['filter'] = filter
+ if lang is not None:
+ search_params['lang'] = lang
+ if country is not None:
+ search_params['country'] = country
+ if location is not None:
+ search_params['location'] = location
+ if timeout is not None:
+ search_params['timeout'] = timeout
+ if scrape_options is not None:
+ search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
+
+ # Add any additional kwargs
+ search_params.update(kwargs)
+
+ # Create final params object
+ final_params = SearchParams(query=query, **search_params)
+ params_dict = final_params.dict(exclude_none=True)
+ params_dict['origin'] = f"python-sdk@{version}"
+
+ return await self._async_post_request(
+ f"{self.api_url}/v1/search",
+ params_dict,
+ {"Authorization": f"Bearer {self.api_key}"}
+ )
+
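A short usage sketch for the async search call above. It assumes app is an AsyncFirecrawlApp with a valid API key, and it treats the return value as the SearchResponse-shaped dict the docstring describes (success / data / warning / error); whether url and title appear on each result entry depends on the API response, so the access below is deliberately defensive.

    import asyncio

    async def demo_search(app):
        # Keyword-only arguments mirror the signature above; omitted ones fall back to API defaults.
        response = await app.search(
            "open source web crawlers",
            limit=3,
            lang="en",
            country="us",
            tbs="qdr:w",  # restrict results to the past week
        )
        for doc in response.get("data", []):
            print(doc.get("url"), "-", doc.get("title"))

    # asyncio.run(demo_search(app))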
+ class AsyncCrawlWatcher(CrawlWatcher):
+ """
+ Async version of CrawlWatcher that streams crawl updates over an async WebSocket connection.
+ """
+ def __init__(self, id: str, app: AsyncFirecrawlApp):
+ super().__init__(id, app)
+
+ async def connect(self) -> None:
+ """
+ Establishes async WebSocket connection and starts listening for messages.
+ """
+ async with websockets.connect(
+ self.ws_url,
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
+ ) as websocket:
+ await self._listen(websocket)
+
+ async def _listen(self, websocket) -> None:
+ """
+ Listens for incoming WebSocket messages and handles them asynchronously.
+
+ Args:
+ websocket: The WebSocket connection object
+ """
+ async for message in websocket:
+ msg = json.loads(message)
+ await self._handle_message(msg)
+
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
+ """
+ Handles incoming WebSocket messages based on their type asynchronously.
+
+ Args:
+ msg (Dict[str, Any]): The message to handle
+ """
+ if msg['type'] == 'done':
+ self.status = 'completed'
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
+ elif msg['type'] == 'error':
+ self.status = 'failed'
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
+ elif msg['type'] == 'catchup':
+ self.status = msg['data']['status']
+ self.data.extend(msg['data'].get('data', []))
+ for doc in self.data:
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
+ elif msg['type'] == 'document':
+ self.data.append(msg['data'])
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
+
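Wiring the watcher up looks roughly like the sketch below. It assumes app is an AsyncFirecrawlApp, crawl_id identifies an existing crawl job, and that the listener-registration helper inherited from CrawlWatcher is named add_event_listener and passes the detail dict built in _handle_message straight to each callback; those last two points are assumptions, since the base class falls outside this excerpt.

    import asyncio

    async def watch_crawl(app, crawl_id):
        watcher = AsyncCrawlWatcher(crawl_id, app)

        # Callbacks fire from _handle_message via dispatch_event (see above).
        watcher.add_event_listener("document", lambda detail: print("doc:", detail["data"]))
        watcher.add_event_listener("done", lambda detail: print("finished:", detail["status"]))
        watcher.add_event_listener("error", lambda detail: print("failed:", detail["error"]))

        await watcher.connect()  # returns once the WebSocket closes

    # asyncio.run(watch_crawl(app, crawl_id))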
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
+ """
+ Handle errors from async API responses.
+ """
+ try:
+ error_data = await response.json()
+ error_message = error_data.get('error', 'No error message provided.')
+ error_details = error_data.get('details', 'No additional error details provided.')
+ except Exception:
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
+
+ # Use the app's method to get the error message
+ message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
+
+ raise aiohttp.ClientError(message)
+
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
+ """
+ Generate a standardized error message based on HTTP status code for async operations.
+
+ Args:
+ status_code (int): The HTTP status code from the response
+ action (str): Description of the action that was being performed
+ error_message (str): The error message from the API response
+ error_details (str): Additional error details from the API response
+
+ Returns:
+ str: A formatted error message
+ """
+ return self._get_error_message(status_code, action, error_message, error_details)
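The error paths above layer two exception types: _handle_error converts HTTP-level failures into aiohttp.ClientError, and the convenience wrappers such as check_deep_research_status then re-raise whatever they catch as ValueError. Catching both, as in the hedged sketch below, covers calls made through the wrappers as well as any lower-level helpers used directly.

    import aiohttp

    async def safe_status_check(app, job_id):
        try:
            return await app.check_deep_research_status(job_id)
        except (aiohttp.ClientError, ValueError) as exc:
            # ClientError comes from _handle_error; ValueError from the wrapper's re-raise.
            print(f"deep research status check failed: {exc}")
            return None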