firecrawl 1.17.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of firecrawl might be problematic. Click here for more details.

firecrawl/firecrawl.py CHANGED
@@ -12,15 +12,294 @@ Classes:
12
12
  import logging
13
13
  import os
14
14
  import time
15
- from typing import Any, Dict, Optional, List, Union, Callable
15
+ from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
16
16
  import json
17
-
17
+ from datetime import datetime
18
+ import re
19
+ import warnings
18
20
  import requests
19
21
  import pydantic
20
22
  import websockets
23
+ import aiohttp
24
+ import asyncio
25
+ from pydantic import Field
26
+
27
+ # Suppress Pydantic warnings about attribute shadowing
28
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
29
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
30
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractConfig\" shadows an attribute in parent \"BaseModel\"")
31
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
32
+
33
+
34
+ def get_version():
35
+ try:
36
+ from pathlib import Path
37
+ package_path = os.path.dirname(__file__)
38
+ version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
39
+ version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
40
+ if version_match:
41
+ return version_match.group(1).strip()
42
+ except Exception:
43
+ print("Failed to get version from __init__.py")
44
+ return None
45
+
46
+ version = get_version()
21
47
 
22
48
  logger : logging.Logger = logging.getLogger("firecrawl")
23
49
 
50
+ T = TypeVar('T')
51
+
52
+ # class FirecrawlDocumentMetadata(pydantic.BaseModel):
53
+ # """Metadata for a Firecrawl document."""
54
+ # title: Optional[str] = None
55
+ # description: Optional[str] = None
56
+ # language: Optional[str] = None
57
+ # keywords: Optional[str] = None
58
+ # robots: Optional[str] = None
59
+ # ogTitle: Optional[str] = None
60
+ # ogDescription: Optional[str] = None
61
+ # ogUrl: Optional[str] = None
62
+ # ogImage: Optional[str] = None
63
+ # ogAudio: Optional[str] = None
64
+ # ogDeterminer: Optional[str] = None
65
+ # ogLocale: Optional[str] = None
66
+ # ogLocaleAlternate: Optional[List[str]] = None
67
+ # ogSiteName: Optional[str] = None
68
+ # ogVideo: Optional[str] = None
69
+ # dctermsCreated: Optional[str] = None
70
+ # dcDateCreated: Optional[str] = None
71
+ # dcDate: Optional[str] = None
72
+ # dctermsType: Optional[str] = None
73
+ # dcType: Optional[str] = None
74
+ # dctermsAudience: Optional[str] = None
75
+ # dctermsSubject: Optional[str] = None
76
+ # dcSubject: Optional[str] = None
77
+ # dcDescription: Optional[str] = None
78
+ # dctermsKeywords: Optional[str] = None
79
+ # modifiedTime: Optional[str] = None
80
+ # publishedTime: Optional[str] = None
81
+ # articleTag: Optional[str] = None
82
+ # articleSection: Optional[str] = None
83
+ # sourceURL: Optional[str] = None
84
+ # statusCode: Optional[int] = None
85
+ # error: Optional[str] = None
86
+
87
+
88
+ class AgentOptions(pydantic.BaseModel):
89
+ """Configuration for the agent."""
90
+ model: Literal["FIRE-1"] = "FIRE-1"
91
+ prompt: Optional[str] = None
92
+
93
+ class AgentOptionsExtract(pydantic.BaseModel):
94
+ """Configuration for the agent in extract operations."""
95
+ model: Literal["FIRE-1"] = "FIRE-1"
96
+
97
+ class ActionsResult(pydantic.BaseModel):
98
+ """Result of actions performed during scraping."""
99
+ screenshots: List[str]
100
+
101
+ class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
102
+ """Document retrieved or processed by Firecrawl."""
103
+ url: Optional[str] = None
104
+ markdown: Optional[str] = None
105
+ html: Optional[str] = None
106
+ rawHtml: Optional[str] = None
107
+ links: Optional[List[str]] = None
108
+ extract: Optional[T] = None
109
+ json: Optional[T] = None
110
+ screenshot: Optional[str] = None
111
+ metadata: Optional[Any] = None
112
+ actions: Optional[ActionsResult] = None
113
+ title: Optional[str] = None # v1 search only
114
+ description: Optional[str] = None # v1 search only
115
+
116
+ class LocationConfig(pydantic.BaseModel):
117
+ """Location configuration for scraping."""
118
+ country: Optional[str] = None
119
+ languages: Optional[List[str]] = None
120
+
121
+ class WebhookConfig(pydantic.BaseModel):
122
+ """Configuration for webhooks."""
123
+ url: str
124
+ headers: Optional[Dict[str, str]] = None
125
+ metadata: Optional[Dict[str, str]] = None
126
+ events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
127
+
128
+ class CommonOptions(pydantic.BaseModel):
129
+ """Parameters for scraping operations."""
130
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None
131
+ headers: Optional[Dict[str, str]] = None
132
+ includeTags: Optional[List[str]] = None
133
+ excludeTags: Optional[List[str]] = None
134
+ onlyMainContent: Optional[bool] = None
135
+ waitFor: Optional[int] = None
136
+ timeout: Optional[int] = None
137
+ location: Optional[LocationConfig] = None
138
+ mobile: Optional[bool] = None
139
+ skipTlsVerification: Optional[bool] = None
140
+ removeBase64Images: Optional[bool] = None
141
+ blockAds: Optional[bool] = None
142
+ proxy: Optional[Literal["basic", "stealth"]] = None
143
+
144
+ class WaitAction(pydantic.BaseModel):
145
+ """Wait action to perform during scraping."""
146
+ type: Literal["wait"]
147
+ milliseconds: int
148
+ selector: Optional[str] = None
149
+
150
+ class ScreenshotAction(pydantic.BaseModel):
151
+ """Screenshot action to perform during scraping."""
152
+ type: Literal["screenshot"]
153
+ fullPage: Optional[bool] = None
154
+
155
+ class ClickAction(pydantic.BaseModel):
156
+ """Click action to perform during scraping."""
157
+ type: Literal["click"]
158
+ selector: str
159
+
160
+ class WriteAction(pydantic.BaseModel):
161
+ """Write action to perform during scraping."""
162
+ type: Literal["write"]
163
+ text: str
164
+
165
+ class PressAction(pydantic.BaseModel):
166
+ """Press action to perform during scraping."""
167
+ type: Literal["press"]
168
+ key: str
169
+
170
+ class ScrollAction(pydantic.BaseModel):
171
+ """Scroll action to perform during scraping."""
172
+ type: Literal["scroll"]
173
+ direction: Literal["up", "down"]
174
+ selector: Optional[str] = None
175
+
176
+ class ScrapeAction(pydantic.BaseModel):
177
+ """Scrape action to perform during scraping."""
178
+ type: Literal["scrape"]
179
+
180
+ class ExecuteJavascriptAction(pydantic.BaseModel):
181
+ """Execute javascript action to perform during scraping."""
182
+ type: Literal["executeJavascript"]
183
+ script: str
184
+
185
+
186
+ class ExtractAgent(pydantic.BaseModel):
187
+ """Configuration for the agent in extract operations."""
188
+ model: Literal["FIRE-1"] = "FIRE-1"
189
+
190
+ class ExtractConfig(pydantic.BaseModel):
191
+ """Configuration for extraction."""
192
+ prompt: Optional[str] = None
193
+ schema: Optional[Any] = None
194
+ systemPrompt: Optional[str] = None
195
+ agent: Optional[ExtractAgent] = None
196
+
197
+ class ScrapeParams(CommonOptions):
198
+ """Parameters for scraping operations."""
199
+ extract: Optional[ExtractConfig] = None
200
+ jsonOptions: Optional[ExtractConfig] = None
201
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
202
+ agent: Optional[AgentOptions] = None
203
+
204
+ class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
205
+ """Response from scraping operations."""
206
+ success: bool = True
207
+ warning: Optional[str] = None
208
+ error: Optional[str] = None
209
+
210
+ class BatchScrapeResponse(pydantic.BaseModel):
211
+ """Response from batch scrape operations."""
212
+ id: Optional[str] = None
213
+ url: Optional[str] = None
214
+ success: bool = True
215
+ error: Optional[str] = None
216
+ invalidURLs: Optional[List[str]] = None
217
+
218
+ class BatchScrapeStatusResponse(pydantic.BaseModel):
219
+ """Response from batch scrape status checks."""
220
+ success: bool = True
221
+ status: Literal["scraping", "completed", "failed", "cancelled"]
222
+ completed: int
223
+ total: int
224
+ creditsUsed: int
225
+ expiresAt: datetime
226
+ next: Optional[str] = None
227
+ data: List[FirecrawlDocument]
228
+
229
+ class CrawlParams(pydantic.BaseModel):
230
+ """Parameters for crawling operations."""
231
+ includePaths: Optional[List[str]] = None
232
+ excludePaths: Optional[List[str]] = None
233
+ maxDepth: Optional[int] = None
234
+ maxDiscoveryDepth: Optional[int] = None
235
+ limit: Optional[int] = None
236
+ allowBackwardLinks: Optional[bool] = None
237
+ allowExternalLinks: Optional[bool] = None
238
+ ignoreSitemap: Optional[bool] = None
239
+ scrapeOptions: Optional[CommonOptions] = None
240
+ webhook: Optional[Union[str, WebhookConfig]] = None
241
+ deduplicateSimilarURLs: Optional[bool] = None
242
+ ignoreQueryParameters: Optional[bool] = None
243
+ regexOnFullURL: Optional[bool] = None
244
+
245
+ class CrawlResponse(pydantic.BaseModel):
246
+ """Response from crawling operations."""
247
+ id: Optional[str] = None
248
+ url: Optional[str] = None
249
+ success: bool = True
250
+ error: Optional[str] = None
251
+
252
+ class CrawlStatusResponse(pydantic.BaseModel):
253
+ """Response from crawl status checks."""
254
+ success: bool = True
255
+ status: Literal["scraping", "completed", "failed", "cancelled"]
256
+ completed: int
257
+ total: int
258
+ creditsUsed: int
259
+ expiresAt: datetime
260
+ next: Optional[str] = None
261
+ data: List[FirecrawlDocument]
262
+
263
+ class CrawlErrorsResponse(pydantic.BaseModel):
264
+ """Response from crawl/batch scrape error monitoring."""
265
+ errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str}
266
+ robotsBlocked: List[str]
267
+
268
+ class MapParams(pydantic.BaseModel):
269
+ """Parameters for mapping operations."""
270
+ search: Optional[str] = None
271
+ ignoreSitemap: Optional[bool] = None
272
+ includeSubdomains: Optional[bool] = None
273
+ sitemapOnly: Optional[bool] = None
274
+ limit: Optional[int] = None
275
+ timeout: Optional[int] = None
276
+
277
+ class MapResponse(pydantic.BaseModel):
278
+ """Response from mapping operations."""
279
+ success: bool = True
280
+ links: Optional[List[str]] = None
281
+ error: Optional[str] = None
282
+
283
+ class ExtractParams(pydantic.BaseModel):
284
+ """Parameters for extracting information from URLs."""
285
+ prompt: Optional[str] = None
286
+ schema: Optional[Any] = None
287
+ systemPrompt: Optional[str] = None
288
+ allowExternalLinks: Optional[bool] = None
289
+ enableWebSearch: Optional[bool] = None
290
+ includeSubdomains: Optional[bool] = None
291
+ origin: Optional[str] = None
292
+ showSources: Optional[bool] = None
293
+ scrapeOptions: Optional[CommonOptions] = None
294
+
295
+ class ExtractResponse(pydantic.BaseModel, Generic[T]):
296
+ """Response from extract operations."""
297
+ success: bool = True
298
+ data: Optional[T] = None
299
+ error: Optional[str] = None
300
+ warning: Optional[str] = None
301
+ sources: Optional[List[str]] = None
302
+
24
303
  class SearchParams(pydantic.BaseModel):
25
304
  query: str
26
305
  limit: Optional[int] = 5
@@ -31,7 +310,14 @@ class SearchParams(pydantic.BaseModel):
31
310
  location: Optional[str] = None
32
311
  origin: Optional[str] = "api"
33
312
  timeout: Optional[int] = 60000
34
- scrapeOptions: Optional[Dict[str, Any]] = None
313
+ scrapeOptions: Optional[CommonOptions] = None
314
+
315
+ class SearchResponse(pydantic.BaseModel):
316
+ """Response from search operations."""
317
+ success: bool = True
318
+ data: List[FirecrawlDocument]
319
+ warning: Optional[str] = None
320
+ error: Optional[str] = None
35
321
 
36
322
  class GenerateLLMsTextParams(pydantic.BaseModel):
37
323
  """
@@ -75,6 +361,24 @@ class DeepResearchStatusResponse(pydantic.BaseModel):
75
361
  sources: List[Dict[str, Any]]
76
362
  summaries: List[str]
77
363
 
364
+ class GenerateLLMsTextResponse(pydantic.BaseModel):
365
+ """Response from LLMs.txt generation operations."""
366
+ success: bool = True
367
+ id: str
368
+ error: Optional[str] = None
369
+
370
+ class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
371
+ llmstxt: str
372
+ llmsfulltxt: Optional[str] = None
373
+
374
+ class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
375
+ """Status response from LLMs.txt generation operations."""
376
+ success: bool = True
377
+ data: Optional[GenerateLLMsTextStatusResponseData] = None
378
+ status: Literal["processing", "completed", "failed"]
379
+ error: Optional[str] = None
380
+ expiresAt: str
381
+
78
382
  class ChangeTrackingData(pydantic.BaseModel):
79
383
  """
80
384
  Data for the change tracking format.
@@ -84,42 +388,39 @@ class ChangeTrackingData(pydantic.BaseModel):
84
388
  visibility: str # "visible" | "hidden"
85
389
  diff: Optional[Dict[str, Any]] = None
86
390
  json: Optional[Any] = None
391
+
392
+ class SearchResponse(pydantic.BaseModel):
393
+ """
394
+ Response from the search operation.
395
+ """
396
+ success: bool
397
+ data: List[Dict[str, Any]]
398
+ warning: Optional[str] = None
399
+ error: Optional[str] = None
87
400
 
88
- class FirecrawlApp:
89
- class SearchResponse(pydantic.BaseModel):
90
- """
91
- Response from the search operation.
92
- """
93
- success: bool
94
- data: List[Dict[str, Any]]
95
- warning: Optional[str] = None
96
- error: Optional[str] = None
97
-
98
- class ExtractParams(pydantic.BaseModel):
99
- """
100
- Parameters for the extract operation.
101
- """
102
- prompt: Optional[str] = None
103
- schema_: Optional[Any] = pydantic.Field(None, alias='schema')
104
- system_prompt: Optional[str] = None
105
- allow_external_links: Optional[bool] = False
106
- enable_web_search: Optional[bool] = False
107
- # Just for backwards compatibility
108
- enableWebSearch: Optional[bool] = False
109
- show_sources: Optional[bool] = False
110
- agent: Optional[Dict[str, Any]] = None
111
-
112
-
113
-
114
-
115
- class ExtractResponse(pydantic.BaseModel):
116
- """
117
- Response from the extract operation.
118
- """
119
- success: bool
120
- data: Optional[Any] = None
121
- error: Optional[str] = None
401
+ class ExtractParams(pydantic.BaseModel):
402
+ """
403
+ Parameters for the extract operation.
404
+ """
405
+ prompt: Optional[str] = None
406
+ schema: Optional[Any] = pydantic.Field(None, alias='schema')
407
+ system_prompt: Optional[str] = None
408
+ allow_external_links: Optional[bool] = False
409
+ enable_web_search: Optional[bool] = False
410
+ # Just for backwards compatibility
411
+ enableWebSearch: Optional[bool] = False
412
+ show_sources: Optional[bool] = False
413
+ agent: Optional[Dict[str, Any]] = None
414
+
415
+ class ExtractResponse(pydantic.BaseModel, Generic[T]):
416
+ """
417
+ Response from the extract operation.
418
+ """
419
+ success: bool
420
+ data: Optional[T] = None
421
+ error: Optional[str] = None
122
422
 
423
+ class FirecrawlApp:
123
424
  def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
124
425
  """
125
426
  Initialize the FirecrawlApp instance with API key, API URL.
@@ -138,200 +439,451 @@ class FirecrawlApp:
138
439
 
139
440
  logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
140
441
 
141
- def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
442
+ def scrape_url(
443
+ self,
444
+ url: str,
445
+ *,
446
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
447
+ include_tags: Optional[List[str]] = None,
448
+ exclude_tags: Optional[List[str]] = None,
449
+ only_main_content: Optional[bool] = None,
450
+ wait_for: Optional[int] = None,
451
+ timeout: Optional[int] = None,
452
+ location: Optional[LocationConfig] = None,
453
+ mobile: Optional[bool] = None,
454
+ skip_tls_verification: Optional[bool] = None,
455
+ remove_base64_images: Optional[bool] = None,
456
+ block_ads: Optional[bool] = None,
457
+ proxy: Optional[Literal["basic", "stealth"]] = None,
458
+ extract: Optional[ExtractConfig] = None,
459
+ json_options: Optional[ExtractConfig] = None,
460
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
461
+ **kwargs) -> ScrapeResponse[Any]:
142
462
  """
143
- Scrape the specified URL using the Firecrawl API.
463
+ Scrape and extract content from a URL.
144
464
 
145
465
  Args:
146
- url (str): The URL to scrape.
147
- params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
466
+ url (str): Target URL to scrape
467
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
468
+ include_tags (Optional[List[str]]): HTML tags to include
469
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
470
+ only_main_content (Optional[bool]): Extract main content only
471
+ wait_for (Optional[int]): Wait for a specific element to appear
472
+ timeout (Optional[int]): Request timeout (ms)
473
+ location (Optional[LocationConfig]): Location configuration
474
+ mobile (Optional[bool]): Use mobile user agent
475
+ skip_tls_verification (Optional[bool]): Skip TLS verification
476
+ remove_base64_images (Optional[bool]): Remove base64 images
477
+ block_ads (Optional[bool]): Block ads
478
+ proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
479
+ extract (Optional[ExtractConfig]): Content extraction settings
480
+ json_options (Optional[ExtractConfig]): JSON extraction settings
481
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
482
+
148
483
 
149
484
  Returns:
150
- Any: The scraped data if the request is successful.
485
+ ScrapeResponse with:
486
+ * Requested content formats
487
+ * Page metadata
488
+ * Extraction results
489
+ * Success/error status
151
490
 
152
491
  Raises:
153
- Exception: If the scrape request fails.
492
+ Exception: If scraping fails
154
493
  """
155
-
156
494
  headers = self._prepare_headers()
157
495
 
158
- # Prepare the base scrape parameters with the URL
159
- scrape_params = {'url': url}
160
-
161
- # If there are additional params, process them
162
- if params:
163
- # Handle extract (for v1)
164
- extract = params.get('extract', {})
165
- if extract:
166
- if 'schema' in extract and hasattr(extract['schema'], 'schema'):
167
- extract['schema'] = extract['schema'].schema()
168
- scrape_params['extract'] = extract
169
-
170
- # Include any other params directly at the top level of scrape_params
171
- for key, value in params.items():
172
- if key not in ['extract']:
173
- scrape_params[key] = value
174
-
175
- json = params.get("jsonOptions", {})
176
- if json:
177
- if 'schema' in json and hasattr(json['schema'], 'schema'):
178
- json['schema'] = json['schema'].schema()
179
- scrape_params['jsonOptions'] = json
180
-
181
- change_tracking = params.get("changeTrackingOptions", {})
182
- if change_tracking:
183
- scrape_params['changeTrackingOptions'] = change_tracking
184
-
185
- # Include any other params directly at the top level of scrape_params
186
- for key, value in params.items():
187
- if key not in ['jsonOptions', 'changeTrackingOptions', 'agent']:
188
- scrape_params[key] = value
189
-
190
- agent = params.get('agent')
191
- if agent:
192
- scrape_params['agent'] = agent
193
-
496
+ # Build scrape parameters
497
+ scrape_params = {
498
+ 'url': url,
499
+ 'origin': f"python-sdk@{version}"
500
+ }
194
501
 
195
- endpoint = f'/v1/scrape'
196
- # Make the POST request with the prepared headers and JSON data
502
+ # Add optional parameters if provided
503
+ if formats:
504
+ scrape_params['formats'] = formats
505
+ if include_tags:
506
+ scrape_params['includeTags'] = include_tags
507
+ if exclude_tags:
508
+ scrape_params['excludeTags'] = exclude_tags
509
+ if only_main_content is not None:
510
+ scrape_params['onlyMainContent'] = only_main_content
511
+ if wait_for:
512
+ scrape_params['waitFor'] = wait_for
513
+ if timeout:
514
+ scrape_params['timeout'] = timeout
515
+ if location:
516
+ scrape_params['location'] = location.dict(exclude_none=True)
517
+ if mobile is not None:
518
+ scrape_params['mobile'] = mobile
519
+ if skip_tls_verification is not None:
520
+ scrape_params['skipTlsVerification'] = skip_tls_verification
521
+ if remove_base64_images is not None:
522
+ scrape_params['removeBase64Images'] = remove_base64_images
523
+ if block_ads is not None:
524
+ scrape_params['blockAds'] = block_ads
525
+ if proxy:
526
+ scrape_params['proxy'] = proxy
527
+ if extract:
528
+ if hasattr(extract.schema, 'schema'):
529
+ extract.schema = extract.schema.schema()
530
+ scrape_params['extract'] = extract.dict(exclude_none=True)
531
+ if json_options:
532
+ if hasattr(json_options.schema, 'schema'):
533
+ json_options.schema = json_options.schema.schema()
534
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
535
+ if actions:
536
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
537
+ scrape_params.update(kwargs)
538
+
539
+ # Make request
197
540
  response = requests.post(
198
- f'{self.api_url}{endpoint}',
541
+ f'{self.api_url}/v1/scrape',
199
542
  headers=headers,
200
543
  json=scrape_params,
201
- timeout=(scrape_params["timeout"] + 5000 if "timeout" in scrape_params else None),
544
+ timeout=(timeout + 5000 if timeout else None)
202
545
  )
546
+
203
547
  if response.status_code == 200:
204
548
  try:
205
- response = response.json()
206
- except:
207
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
208
- if response['success'] and 'data' in response:
209
- return response['data']
210
- elif "error" in response:
211
- raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
212
- else:
213
- raise Exception(f'Failed to scrape URL. Error: {response}')
549
+ response_json = response.json()
550
+ if response_json.get('success') and 'data' in response_json:
551
+ return ScrapeResponse(**response_json['data'])
552
+ elif "error" in response_json:
553
+ raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
554
+ else:
555
+ raise Exception(f'Failed to scrape URL. Error: {response_json}')
556
+ except ValueError:
557
+ raise Exception('Failed to parse Firecrawl response as JSON.')
214
558
  else:
215
559
  self._handle_error(response, 'scrape URL')
216
560
 
217
- def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> Dict[str, Any]:
561
+ def search(
562
+ self,
563
+ query: str,
564
+ *,
565
+ limit: Optional[int] = None,
566
+ tbs: Optional[str] = None,
567
+ filter: Optional[str] = None,
568
+ lang: Optional[str] = None,
569
+ country: Optional[str] = None,
570
+ location: Optional[str] = None,
571
+ timeout: Optional[int] = None,
572
+ scrape_options: Optional[CommonOptions] = None,
573
+ params: Optional[Union[Dict[str, Any], SearchParams]] = None,
574
+ **kwargs) -> SearchResponse:
218
575
  """
219
- Search for content using the Firecrawl API.
576
+ Search for content using Firecrawl.
220
577
 
221
578
  Args:
222
- query (str): The search query string.
223
- params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters.
579
+ query (str): Search query string
580
+ limit (Optional[int]): Max results (default: 5)
581
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
582
+ filter (Optional[str]): Custom result filter
583
+ lang (Optional[str]): Language code (default: "en")
584
+ country (Optional[str]): Country code (default: "us")
585
+ location (Optional[str]): Geo-targeting
586
+ timeout (Optional[int]): Request timeout in milliseconds
587
+ scrape_options (Optional[CommonOptions]): Result scraping configuration
588
+ params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
589
+ **kwargs: Additional keyword arguments for future compatibility
224
590
 
225
591
  Returns:
226
- Dict[str, Any]: The search response containing success status and search results.
592
+ SearchResponse: Response containing:
593
+ * success (bool): Whether request succeeded
594
+ * data (List[FirecrawlDocument]): Search results
595
+ * warning (Optional[str]): Warning message if any
596
+ * error (Optional[str]): Error message if any
597
+
598
+ Raises:
599
+ Exception: If search fails or response cannot be parsed
227
600
  """
228
- if params is None:
229
- params = {}
601
+ # Build search parameters
602
+ search_params = {}
603
+ if params:
604
+ if isinstance(params, dict):
605
+ search_params.update(params)
606
+ else:
607
+ search_params.update(params.dict(exclude_none=True))
608
+
609
+ # Add individual parameters
610
+ if limit is not None:
611
+ search_params['limit'] = limit
612
+ if tbs is not None:
613
+ search_params['tbs'] = tbs
614
+ if filter is not None:
615
+ search_params['filter'] = filter
616
+ if lang is not None:
617
+ search_params['lang'] = lang
618
+ if country is not None:
619
+ search_params['country'] = country
620
+ if location is not None:
621
+ search_params['location'] = location
622
+ if timeout is not None:
623
+ search_params['timeout'] = timeout
624
+ if scrape_options is not None:
625
+ search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
626
+
627
+ # Add any additional kwargs
628
+ search_params.update(kwargs)
230
629
 
231
- if isinstance(params, dict):
232
- search_params = SearchParams(query=query, **params)
233
- else:
234
- search_params = params
235
- search_params.query = query
630
+ # Create final params object
631
+ final_params = SearchParams(query=query, **search_params)
632
+ params_dict = final_params.dict(exclude_none=True)
633
+ params_dict['origin'] = f"python-sdk@{version}"
236
634
 
635
+ # Make request
237
636
  response = requests.post(
238
637
  f"{self.api_url}/v1/search",
239
638
  headers={"Authorization": f"Bearer {self.api_key}"},
240
- json=search_params.dict(exclude_none=True)
639
+ json=params_dict
241
640
  )
242
641
 
243
- if response.status_code != 200:
244
- raise Exception(f"Request failed with status code {response.status_code}")
245
-
246
- try:
247
- return response.json()
248
- except:
249
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
250
-
251
- def crawl_url(self, url: str,
252
- params: Optional[Dict[str, Any]] = None,
253
- poll_interval: Optional[int] = 2,
254
- idempotency_key: Optional[str] = None) -> Any:
642
+ if response.status_code == 200:
643
+ try:
644
+ response_json = response.json()
645
+ if response_json.get('success') and 'data' in response_json:
646
+ return SearchResponse(**response_json)
647
+ elif "error" in response_json:
648
+ raise Exception(f'Search failed. Error: {response_json["error"]}')
649
+ else:
650
+ raise Exception(f'Search failed. Error: {response_json}')
651
+ except ValueError:
652
+ raise Exception('Failed to parse Firecrawl response as JSON.')
653
+ else:
654
+ self._handle_error(response, 'search')
655
+
656
+ def crawl_url(
657
+ self,
658
+ url: str,
659
+ *,
660
+ include_paths: Optional[List[str]] = None,
661
+ exclude_paths: Optional[List[str]] = None,
662
+ max_depth: Optional[int] = None,
663
+ max_discovery_depth: Optional[int] = None,
664
+ limit: Optional[int] = None,
665
+ allow_backward_links: Optional[bool] = None,
666
+ allow_external_links: Optional[bool] = None,
667
+ ignore_sitemap: Optional[bool] = None,
668
+ scrape_options: Optional[CommonOptions] = None,
669
+ webhook: Optional[Union[str, WebhookConfig]] = None,
670
+ deduplicate_similar_urls: Optional[bool] = None,
671
+ ignore_query_parameters: Optional[bool] = None,
672
+ regex_on_full_url: Optional[bool] = None,
673
+ poll_interval: Optional[int] = 2,
674
+ idempotency_key: Optional[str] = None,
675
+ **kwargs
676
+ ) -> CrawlStatusResponse:
255
677
  """
256
- Initiate a crawl job for the specified URL using the Firecrawl API.
678
+ Crawl a website starting from a URL.
257
679
 
258
680
  Args:
259
- url (str): The URL to crawl.
260
- params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
261
- poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds.
262
- idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
681
+ url (str): Target URL to start crawling from
682
+ include_paths (Optional[List[str]]): Patterns of URLs to include
683
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
684
+ max_depth (Optional[int]): Maximum crawl depth
685
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
686
+ limit (Optional[int]): Maximum pages to crawl
687
+ allow_backward_links (Optional[bool]): Follow parent directory links
688
+ allow_external_links (Optional[bool]): Follow external domain links
689
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
690
+ scrape_options (Optional[CommonOptions]): Page scraping configuration
691
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
692
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
693
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
694
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
695
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
696
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
697
+ **kwargs: Additional parameters to pass to the API
263
698
 
264
699
  Returns:
265
- Dict[str, Any]: A dictionary containing the crawl results. The structure includes:
266
- - 'success' (bool): Indicates if the crawl was successful.
267
- - 'status' (str): The final status of the crawl job (e.g., 'completed').
268
- - 'completed' (int): Number of scraped pages that completed.
269
- - 'total' (int): Total number of scraped pages.
270
- - 'creditsUsed' (int): Estimated number of API credits used for this crawl.
271
- - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the crawl data expires.
272
- - 'data' (List[Dict]): List of all the scraped pages.
700
+ CrawlStatusResponse with:
701
+ * Crawling status and progress
702
+ * Crawled page contents
703
+ * Success/error information
273
704
 
274
705
  Raises:
275
- Exception: If the crawl job initiation or monitoring fails.
706
+ Exception: If crawl fails
276
707
  """
277
- endpoint = f'/v1/crawl'
708
+ crawl_params = {}
709
+
710
+ # Add individual parameters
711
+ if include_paths is not None:
712
+ crawl_params['includePaths'] = include_paths
713
+ if exclude_paths is not None:
714
+ crawl_params['excludePaths'] = exclude_paths
715
+ if max_depth is not None:
716
+ crawl_params['maxDepth'] = max_depth
717
+ if max_discovery_depth is not None:
718
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
719
+ if limit is not None:
720
+ crawl_params['limit'] = limit
721
+ if allow_backward_links is not None:
722
+ crawl_params['allowBackwardLinks'] = allow_backward_links
723
+ if allow_external_links is not None:
724
+ crawl_params['allowExternalLinks'] = allow_external_links
725
+ if ignore_sitemap is not None:
726
+ crawl_params['ignoreSitemap'] = ignore_sitemap
727
+ if scrape_options is not None:
728
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
729
+ if webhook is not None:
730
+ crawl_params['webhook'] = webhook
731
+ if deduplicate_similar_urls is not None:
732
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
733
+ if ignore_query_parameters is not None:
734
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
735
+ if regex_on_full_url is not None:
736
+ crawl_params['regexOnFullURL'] = regex_on_full_url
737
+
738
+ # Add any additional kwargs
739
+ crawl_params.update(kwargs)
740
+
741
+ # Create final params object
742
+ final_params = CrawlParams(**crawl_params)
743
+ params_dict = final_params.dict(exclude_none=True)
744
+ params_dict['url'] = url
745
+ params_dict['origin'] = f"python-sdk@{version}"
746
+
747
+ # Make request
278
748
  headers = self._prepare_headers(idempotency_key)
279
- json_data = {'url': url}
280
- if params:
281
- json_data.update(params)
282
- response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
749
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
750
+
283
751
  if response.status_code == 200:
284
752
  try:
285
753
  id = response.json().get('id')
286
754
  except:
287
755
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
288
756
  return self._monitor_job_status(id, headers, poll_interval)
289
-
290
757
  else:
291
758
  self._handle_error(response, 'start crawl job')
292
759
 
293
-
294
- def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
760
+ def async_crawl_url(
761
+ self,
762
+ url: str,
763
+ *,
764
+ include_paths: Optional[List[str]] = None,
765
+ exclude_paths: Optional[List[str]] = None,
766
+ max_depth: Optional[int] = None,
767
+ max_discovery_depth: Optional[int] = None,
768
+ limit: Optional[int] = None,
769
+ allow_backward_links: Optional[bool] = None,
770
+ allow_external_links: Optional[bool] = None,
771
+ ignore_sitemap: Optional[bool] = None,
772
+ scrape_options: Optional[CommonOptions] = None,
773
+ webhook: Optional[Union[str, WebhookConfig]] = None,
774
+ deduplicate_similar_urls: Optional[bool] = None,
775
+ ignore_query_parameters: Optional[bool] = None,
776
+ regex_on_full_url: Optional[bool] = None,
777
+ idempotency_key: Optional[str] = None,
778
+ **kwargs
779
+ ) -> CrawlResponse:
295
780
  """
296
- Initiate a crawl job asynchronously.
781
+ Start an asynchronous crawl job.
297
782
 
298
783
  Args:
299
- url (str): The URL to crawl.
300
- params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
301
- idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
784
+ url (str): Target URL to start crawling from
785
+ include_paths (Optional[List[str]]): Patterns of URLs to include
786
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
787
+ max_depth (Optional[int]): Maximum crawl depth
788
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
789
+ limit (Optional[int]): Maximum pages to crawl
790
+ allow_backward_links (Optional[bool]): Follow parent directory links
791
+ allow_external_links (Optional[bool]): Follow external domain links
792
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
793
+ scrape_options (Optional[CommonOptions]): Page scraping configuration
794
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
795
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
796
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
797
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
798
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
799
+ **kwargs: Additional parameters to pass to the API
302
800
 
303
801
  Returns:
304
- Dict[str, Any]: A dictionary containing the crawl initiation response. The structure includes:
305
- - 'success' (bool): Indicates if the crawl initiation was successful.
306
- - 'id' (str): The unique identifier for the crawl job.
307
- - 'url' (str): The URL to check the status of the crawl job.
802
+ CrawlResponse with:
803
+ * success - Whether crawl started successfully
804
+ * id - Unique identifier for the crawl job
805
+ * url - Status check URL for the crawl
806
+ * error - Error message if start failed
807
+
808
+ Raises:
809
+ Exception: If crawl initiation fails
308
810
  """
309
- endpoint = f'/v1/crawl'
811
+ crawl_params = {}
812
+
813
+ # Add individual parameters
814
+ if include_paths is not None:
815
+ crawl_params['includePaths'] = include_paths
816
+ if exclude_paths is not None:
817
+ crawl_params['excludePaths'] = exclude_paths
818
+ if max_depth is not None:
819
+ crawl_params['maxDepth'] = max_depth
820
+ if max_discovery_depth is not None:
821
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
822
+ if limit is not None:
823
+ crawl_params['limit'] = limit
824
+ if allow_backward_links is not None:
825
+ crawl_params['allowBackwardLinks'] = allow_backward_links
826
+ if allow_external_links is not None:
827
+ crawl_params['allowExternalLinks'] = allow_external_links
828
+ if ignore_sitemap is not None:
829
+ crawl_params['ignoreSitemap'] = ignore_sitemap
830
+ if scrape_options is not None:
831
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
832
+ if webhook is not None:
833
+ crawl_params['webhook'] = webhook
834
+ if deduplicate_similar_urls is not None:
835
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
836
+ if ignore_query_parameters is not None:
837
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
838
+ if regex_on_full_url is not None:
839
+ crawl_params['regexOnFullURL'] = regex_on_full_url
840
+
841
+ # Add any additional kwargs
842
+ crawl_params.update(kwargs)
843
+
844
+ # Create final params object
845
+ final_params = CrawlParams(**crawl_params)
846
+ params_dict = final_params.dict(exclude_none=True)
847
+ params_dict['url'] = url
848
+ params_dict['origin'] = f"python-sdk@{version}"
849
+
850
+ # Make request
310
851
  headers = self._prepare_headers(idempotency_key)
311
- json_data = {'url': url}
312
- if params:
313
- json_data.update(params)
314
- response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
852
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
853
+
315
854
  if response.status_code == 200:
316
855
  try:
317
- return response.json()
856
+ return CrawlResponse(**response.json())
318
857
  except:
319
858
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
320
859
  else:
321
860
  self._handle_error(response, 'start crawl job')
322
861
 
323
- def check_crawl_status(self, id: str) -> Any:
862
+ def check_crawl_status(self, id: str) -> CrawlStatusResponse:
324
863
  """
325
- Check the status of a crawl job using the Firecrawl API.
864
+ Check the status and results of a crawl job.
326
865
 
327
866
  Args:
328
- id (str): The ID of the crawl job.
867
+ id: Unique identifier for the crawl job
329
868
 
330
869
  Returns:
331
- Any: The status of the crawl job.
870
+ CrawlStatusResponse containing:
871
+
872
+ Status Information:
873
+ * status - Current state (scraping/completed/failed/cancelled)
874
+ * completed - Number of pages crawled
875
+ * total - Total pages to crawl
876
+ * creditsUsed - API credits consumed
877
+ * expiresAt - Data expiration timestamp
878
+
879
+ Results:
880
+ * data - List of crawled documents
881
+ * next - URL for next page of results (if paginated)
882
+ * success - Whether status check succeeded
883
+ * error - Error message if failed
332
884
 
333
885
  Raises:
334
- Exception: If the status check request fails.
886
+ Exception: If status check fails
335
887
  """
336
888
  endpoint = f'/v1/crawl/{id}'
337
889
 
@@ -383,28 +935,37 @@ class FirecrawlApp:
383
935
  if 'next' in status_data:
384
936
  response['next'] = status_data['next']
385
937
 
386
- return {
387
- 'success': False if 'error' in status_data else True,
938
+ return CrawlStatusResponse(
939
+ success=False if 'error' in status_data else True,
388
940
  **response
389
- }
941
+ )
390
942
  else:
391
943
  self._handle_error(response, 'check crawl status')
392
944
 
393
- def check_crawl_errors(self, id: str) -> Dict[str, Any]:
945
+ def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
394
946
  """
395
947
  Returns information about crawl errors.
396
948
 
397
949
  Args:
398
- id (str): The ID of the crawl job.
950
+ id (str): The ID of the crawl job
399
951
 
400
952
  Returns:
401
- Dict[str, Any]: Information about crawl errors.
953
+ CrawlErrorsResponse containing:
954
+ * errors (List[Dict[str, str]]): List of errors with fields:
955
+ - id (str): Error ID
956
+ - timestamp (str): When the error occurred
957
+ - url (str): URL that caused the error
958
+ - error (str): Error message
959
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
960
+
961
+ Raises:
962
+ Exception: If error check fails
402
963
  """
403
964
  headers = self._prepare_headers()
404
965
  response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
405
966
  if response.status_code == 200:
406
967
  try:
407
- return response.json()
968
+ return CrawlErrorsResponse(**response.json())
408
969
  except:
409
970
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
410
971
  else:
@@ -412,13 +973,18 @@ class FirecrawlApp:
412
973
 
413
974
  def cancel_crawl(self, id: str) -> Dict[str, Any]:
414
975
  """
415
- Cancel an asynchronous crawl job using the Firecrawl API.
976
+ Cancel an asynchronous crawl job.
416
977
 
417
978
  Args:
418
- id (str): The ID of the crawl job to cancel.
979
+ id (str): The ID of the crawl job to cancel
419
980
 
420
981
  Returns:
421
- Dict[str, Any]: The response from the cancel crawl request.
982
+ Dict[str, Any] containing:
983
+ * success (bool): Whether cancellation was successful
984
+ * error (str, optional): Error message if cancellation failed
985
+
986
+ Raises:
987
+ Exception: If cancellation fails
422
988
  """
423
989
  headers = self._prepare_headers()
424
990
  response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
@@ -430,154 +996,524 @@ class FirecrawlApp:
430
996
  else:
431
997
  self._handle_error(response, "cancel crawl job")
432
998
 
433
- def crawl_url_and_watch(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
999
+ def crawl_url_and_watch(
1000
+ self,
1001
+ url: str,
1002
+ *,
1003
+ include_paths: Optional[List[str]] = None,
1004
+ exclude_paths: Optional[List[str]] = None,
1005
+ max_depth: Optional[int] = None,
1006
+ max_discovery_depth: Optional[int] = None,
1007
+ limit: Optional[int] = None,
1008
+ allow_backward_links: Optional[bool] = None,
1009
+ allow_external_links: Optional[bool] = None,
1010
+ ignore_sitemap: Optional[bool] = None,
1011
+ scrape_options: Optional[CommonOptions] = None,
1012
+ webhook: Optional[Union[str, WebhookConfig]] = None,
1013
+ deduplicate_similar_urls: Optional[bool] = None,
1014
+ ignore_query_parameters: Optional[bool] = None,
1015
+ regex_on_full_url: Optional[bool] = None,
1016
+ idempotency_key: Optional[str] = None,
1017
+ **kwargs
1018
+ ) -> 'CrawlWatcher':
434
1019
  """
435
1020
  Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
436
1021
 
437
1022
  Args:
438
- url (str): The URL to crawl.
439
- params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
440
- idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
1023
+ url (str): Target URL to start crawling from
1024
+ include_paths (Optional[List[str]]): Patterns of URLs to include
1025
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
1026
+ max_depth (Optional[int]): Maximum crawl depth
1027
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
1028
+ limit (Optional[int]): Maximum pages to crawl
1029
+ allow_backward_links (Optional[bool]): Follow parent directory links
1030
+ allow_external_links (Optional[bool]): Follow external domain links
1031
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1032
+ scrape_options (Optional[CommonOptions]): Page scraping configuration
1033
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
1034
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
1035
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
1036
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
1037
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1038
+ **kwargs: Additional parameters to pass to the API
441
1039
 
442
1040
  Returns:
443
- CrawlWatcher: An instance of CrawlWatcher to monitor the crawl job.
1041
+ CrawlWatcher: An instance to monitor the crawl job via WebSocket
1042
+
1043
+ Raises:
1044
+ Exception: If crawl job fails to start
444
1045
  """
445
- crawl_response = self.async_crawl_url(url, params, idempotency_key)
446
- if crawl_response['success'] and 'id' in crawl_response:
447
- return CrawlWatcher(crawl_response['id'], self)
1046
+ crawl_response = self.async_crawl_url(
1047
+ url,
1048
+ include_paths=include_paths,
1049
+ exclude_paths=exclude_paths,
1050
+ max_depth=max_depth,
1051
+ max_discovery_depth=max_discovery_depth,
1052
+ limit=limit,
1053
+ allow_backward_links=allow_backward_links,
1054
+ allow_external_links=allow_external_links,
1055
+ ignore_sitemap=ignore_sitemap,
1056
+ scrape_options=scrape_options,
1057
+ webhook=webhook,
1058
+ deduplicate_similar_urls=deduplicate_similar_urls,
1059
+ ignore_query_parameters=ignore_query_parameters,
1060
+ regex_on_full_url=regex_on_full_url,
1061
+ idempotency_key=idempotency_key,
1062
+ **kwargs
1063
+ )
1064
+ if crawl_response.success and crawl_response.id:
1065
+ return CrawlWatcher(crawl_response.id, self)
448
1066
  else:
449
1067
  raise Exception("Crawl job failed to start")
450
1068
 
451
- def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
1069
+ def map_url(
1070
+ self,
1071
+ url: str,
1072
+ *,
1073
+ search: Optional[str] = None,
1074
+ ignore_sitemap: Optional[bool] = None,
1075
+ include_subdomains: Optional[bool] = None,
1076
+ sitemap_only: Optional[bool] = None,
1077
+ limit: Optional[int] = None,
1078
+ timeout: Optional[int] = None,
1079
+ params: Optional[MapParams] = None) -> MapResponse:
452
1080
  """
453
- Perform a map search using the Firecrawl API.
1081
+ Map and discover links from a URL.
454
1082
 
455
1083
  Args:
456
- url (str): The URL to perform the map search on.
457
- params (Optional[Dict[str, Any]]): Additional parameters for the map search.
1084
+ url (str): Target URL to map
1085
+ search (Optional[str]): Filter pattern for URLs
1086
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1087
+ include_subdomains (Optional[bool]): Include subdomain links
1088
+ sitemap_only (Optional[bool]): Only use sitemap.xml
1089
+ limit (Optional[int]): Maximum URLs to return
1090
+ timeout (Optional[int]): Request timeout in milliseconds
1091
+ params (Optional[MapParams]): Additional mapping parameters
458
1092
 
459
1093
  Returns:
460
- List[str]: A list of URLs discovered during the map search.
461
- """
462
- endpoint = f'/v1/map'
463
- headers = self._prepare_headers()
1094
+ MapResponse: Response containing:
1095
+ * success (bool): Whether request succeeded
1096
+ * links (List[str]): Discovered URLs
1097
+ * error (Optional[str]): Error message if any
464
1098
 
465
- # Prepare the base scrape parameters with the URL
466
- json_data = {'url': url}
1099
+ Raises:
1100
+ Exception: If mapping fails or response cannot be parsed
1101
+ """
1102
+ # Build map parameters
1103
+ map_params = {}
467
1104
  if params:
468
- json_data.update(params)
469
-
470
- # Make the POST request with the prepared headers and JSON data
1105
+ map_params.update(params.dict(exclude_none=True))
1106
+
1107
+ # Add individual parameters
1108
+ if search is not None:
1109
+ map_params['search'] = search
1110
+ if ignore_sitemap is not None:
1111
+ map_params['ignoreSitemap'] = ignore_sitemap
1112
+ if include_subdomains is not None:
1113
+ map_params['includeSubdomains'] = include_subdomains
1114
+ if sitemap_only is not None:
1115
+ map_params['sitemapOnly'] = sitemap_only
1116
+ if limit is not None:
1117
+ map_params['limit'] = limit
1118
+ if timeout is not None:
1119
+ map_params['timeout'] = timeout
1120
+
1121
+ # Create final params object
1122
+ final_params = MapParams(**map_params)
1123
+ params_dict = final_params.dict(exclude_none=True)
1124
+ params_dict['url'] = url
1125
+ params_dict['origin'] = f"python-sdk@{version}"
1126
+
1127
+ # Make request
471
1128
  response = requests.post(
472
- f'{self.api_url}{endpoint}',
473
- headers=headers,
474
- json=json_data,
1129
+ f"{self.api_url}/v1/map",
1130
+ headers={"Authorization": f"Bearer {self.api_key}"},
1131
+ json=params_dict
475
1132
  )
1133
+
476
1134
  if response.status_code == 200:
477
1135
  try:
478
- response = response.json()
479
- except:
480
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
481
- if response['success'] and 'links' in response:
482
- return response
483
- elif 'error' in response:
484
- raise Exception(f'Failed to map URL. Error: {response["error"]}')
485
- else:
486
- raise Exception(f'Failed to map URL. Error: {response}')
1136
+ response_json = response.json()
1137
+ if response_json.get('success') and 'links' in response_json:
1138
+ return MapResponse(**response_json)
1139
+ elif "error" in response_json:
1140
+ raise Exception(f'Map failed. Error: {response_json["error"]}')
1141
+ else:
1142
+ raise Exception(f'Map failed. Error: {response_json}')
1143
+ except ValueError:
1144
+ raise Exception('Failed to parse Firecrawl response as JSON.')
487
1145
  else:
488
1146
  self._handle_error(response, 'map')
489
1147
 
490
- def batch_scrape_urls(self, urls: List[str],
491
- params: Optional[Dict[str, Any]] = None,
492
- poll_interval: Optional[int] = 2,
493
- idempotency_key: Optional[str] = None) -> Any:
1148
+ def batch_scrape_urls(
1149
+ self,
1150
+ urls: List[str],
1151
+ *,
1152
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1153
+ headers: Optional[Dict[str, str]] = None,
1154
+ include_tags: Optional[List[str]] = None,
1155
+ exclude_tags: Optional[List[str]] = None,
1156
+ only_main_content: Optional[bool] = None,
1157
+ wait_for: Optional[int] = None,
1158
+ timeout: Optional[int] = None,
1159
+ location: Optional[LocationConfig] = None,
1160
+ mobile: Optional[bool] = None,
1161
+ skip_tls_verification: Optional[bool] = None,
1162
+ remove_base64_images: Optional[bool] = None,
1163
+ block_ads: Optional[bool] = None,
1164
+ proxy: Optional[Literal["basic", "stealth"]] = None,
1165
+ extract: Optional[ExtractConfig] = None,
1166
+ json_options: Optional[ExtractConfig] = None,
1167
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1168
+ agent: Optional[AgentOptions] = None,
1169
+ poll_interval: Optional[int] = 2,
1170
+ idempotency_key: Optional[str] = None,
1171
+ **kwargs
1172
+ ) -> BatchScrapeStatusResponse:
494
1173
  """
495
- Initiate a batch scrape job for the specified URLs using the Firecrawl API.
1174
+ Batch scrape multiple URLs and monitor until completion.
496
1175
 
497
1176
  Args:
498
- urls (List[str]): The URLs to scrape.
499
- params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
500
- poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds.
501
- idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
1177
+ urls (List[str]): URLs to scrape
1178
+ formats (Optional[List[Literal]]): Content formats to retrieve
1179
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1180
+ include_tags (Optional[List[str]]): HTML tags to include
1181
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1182
+ only_main_content (Optional[bool]): Extract main content only
1183
+ wait_for (Optional[int]): Wait time in milliseconds
1184
+ timeout (Optional[int]): Request timeout in milliseconds
1185
+ location (Optional[LocationConfig]): Location configuration
1186
+ mobile (Optional[bool]): Use mobile user agent
1187
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1188
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1189
+ block_ads (Optional[bool]): Block advertisements
1190
+ proxy (Optional[Literal]): Proxy type to use
1191
+ extract (Optional[ExtractConfig]): Content extraction config
1192
+ json_options (Optional[ExtractConfig]): JSON extraction config
1193
+ actions (Optional[List[Union]]): Actions to perform
1194
+ agent (Optional[AgentOptions]): Agent configuration
1195
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
1196
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1197
+ **kwargs: Additional parameters to pass to the API
502
1198
 
503
1199
  Returns:
504
- Dict[str, Any]: A dictionary containing the scrape results. The structure includes:
505
- - 'success' (bool): Indicates if the batch scrape was successful.
506
- - 'status' (str): The final status of the batch scrape job (e.g., 'completed').
507
- - 'completed' (int): Number of scraped pages that completed.
508
- - 'total' (int): Total number of scraped pages.
509
- - 'creditsUsed' (int): Estimated number of API credits used for this batch scrape.
510
- - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the batch scrape data expires.
511
- - 'data' (List[Dict]): List of all the scraped pages.
1200
+ BatchScrapeStatusResponse with:
1201
+ * Scraping status and progress
1202
+ * Scraped content for each URL
1203
+ * Success/error information
512
1204
 
513
1205
  Raises:
514
- Exception: If the batch scrape job initiation or monitoring fails.
1206
+ Exception: If batch scrape fails
515
1207
  """
516
- endpoint = f'/v1/batch/scrape'
1208
+ scrape_params = {}
1209
+
1210
+ # Add individual parameters
1211
+ if formats is not None:
1212
+ scrape_params['formats'] = formats
1213
+ if headers is not None:
1214
+ scrape_params['headers'] = headers
1215
+ if include_tags is not None:
1216
+ scrape_params['includeTags'] = include_tags
1217
+ if exclude_tags is not None:
1218
+ scrape_params['excludeTags'] = exclude_tags
1219
+ if only_main_content is not None:
1220
+ scrape_params['onlyMainContent'] = only_main_content
1221
+ if wait_for is not None:
1222
+ scrape_params['waitFor'] = wait_for
1223
+ if timeout is not None:
1224
+ scrape_params['timeout'] = timeout
1225
+ if location is not None:
1226
+ scrape_params['location'] = location.dict(exclude_none=True)
1227
+ if mobile is not None:
1228
+ scrape_params['mobile'] = mobile
1229
+ if skip_tls_verification is not None:
1230
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1231
+ if remove_base64_images is not None:
1232
+ scrape_params['removeBase64Images'] = remove_base64_images
1233
+ if block_ads is not None:
1234
+ scrape_params['blockAds'] = block_ads
1235
+ if proxy is not None:
1236
+ scrape_params['proxy'] = proxy
1237
+ if extract is not None:
1238
+ if hasattr(extract.schema, 'schema'):
1239
+ extract.schema = extract.schema.schema()
1240
+ scrape_params['extract'] = extract.dict(exclude_none=True)
1241
+ if json_options is not None:
1242
+ if hasattr(json_options.schema, 'schema'):
1243
+ json_options.schema = json_options.schema.schema()
1244
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
1245
+ if actions is not None:
1246
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1247
+ if agent is not None:
1248
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1249
+
1250
+ # Add any additional kwargs
1251
+ scrape_params.update(kwargs)
1252
+
1253
+ # Create final params object
1254
+ final_params = ScrapeParams(**scrape_params)
1255
+ params_dict = final_params.dict(exclude_none=True)
1256
+ params_dict['urls'] = urls
1257
+ params_dict['origin'] = f"python-sdk@{version}"
1258
+
1259
+ # Make request
517
1260
  headers = self._prepare_headers(idempotency_key)
518
- json_data = {'urls': urls}
519
- if params:
520
- json_data.update(params)
521
- response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
1261
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1262
+
522
1263
  if response.status_code == 200:
523
1264
  try:
524
1265
  id = response.json().get('id')
525
1266
  except:
526
1267
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
527
1268
  return self._monitor_job_status(id, headers, poll_interval)
528
-
529
1269
  else:
530
1270
  self._handle_error(response, 'start batch scrape job')
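A rough usage sketch of the new keyword-only batch_scrape_urls signature (assuming the usual FirecrawlApp(api_key=...) constructor; the URLs and field accesses are illustrative and follow the docstring above):

    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-YOUR-KEY")
    # Starts the batch job and polls /v1/batch/scrape until it finishes.
    status = app.batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown", "html"],
        only_main_content=True,
        poll_interval=2,
    )
    print(status.status, f"{status.completed}/{status.total} pages scraped")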
531
1271
 
532
-
533
- def async_batch_scrape_urls(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
1272
+ def async_batch_scrape_urls(
1273
+ self,
1274
+ urls: List[str],
1275
+ *,
1276
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1277
+ headers: Optional[Dict[str, str]] = None,
1278
+ include_tags: Optional[List[str]] = None,
1279
+ exclude_tags: Optional[List[str]] = None,
1280
+ only_main_content: Optional[bool] = None,
1281
+ wait_for: Optional[int] = None,
1282
+ timeout: Optional[int] = None,
1283
+ location: Optional[LocationConfig] = None,
1284
+ mobile: Optional[bool] = None,
1285
+ skip_tls_verification: Optional[bool] = None,
1286
+ remove_base64_images: Optional[bool] = None,
1287
+ block_ads: Optional[bool] = None,
1288
+ proxy: Optional[Literal["basic", "stealth"]] = None,
1289
+ extract: Optional[ExtractConfig] = None,
1290
+ json_options: Optional[ExtractConfig] = None,
1291
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1292
+ agent: Optional[AgentOptions] = None,
1293
+ idempotency_key: Optional[str] = None,
1294
+ **kwargs
1295
+ ) -> BatchScrapeResponse:
534
1296
  """
535
- Initiate a crawl job asynchronously.
1297
+ Initiate a batch scrape job asynchronously.
536
1298
 
537
1299
  Args:
538
- urls (List[str]): The URLs to scrape.
539
- params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
540
- idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
1300
+ urls (List[str]): URLs to scrape
1301
+ formats (Optional[List[Literal]]): Content formats to retrieve
1302
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1303
+ include_tags (Optional[List[str]]): HTML tags to include
1304
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1305
+ only_main_content (Optional[bool]): Extract main content only
1306
+ wait_for (Optional[int]): Wait time in milliseconds
1307
+ timeout (Optional[int]): Request timeout in milliseconds
1308
+ location (Optional[LocationConfig]): Location configuration
1309
+ mobile (Optional[bool]): Use mobile user agent
1310
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1311
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1312
+ block_ads (Optional[bool]): Block advertisements
1313
+ proxy (Optional[Literal]): Proxy type to use
1314
+ extract (Optional[ExtractConfig]): Content extraction config
1315
+ json_options (Optional[ExtractConfig]): JSON extraction config
1316
+ actions (Optional[List[Union]]): Actions to perform
1317
+ agent (Optional[AgentOptions]): Agent configuration
1318
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1319
+ **kwargs: Additional parameters to pass to the API
541
1320
 
542
1321
  Returns:
543
- Dict[str, Any]: A dictionary containing the batch scrape initiation response. The structure includes:
544
- - 'success' (bool): Indicates if the batch scrape initiation was successful.
545
- - 'id' (str): The unique identifier for the batch scrape job.
546
- - 'url' (str): The URL to check the status of the batch scrape job.
1322
+ BatchScrapeResponse with:
1323
+ * success - Whether job started successfully
1324
+ * id - Unique identifier for the job
1325
+ * url - Status check URL
1326
+ * error - Error message if start failed
1327
+
1328
+ Raises:
1329
+ Exception: If job initiation fails
547
1330
  """
548
- endpoint = f'/v1/batch/scrape'
1331
+ scrape_params = {}
1332
+
1333
+ # Add individual parameters
1334
+ if formats is not None:
1335
+ scrape_params['formats'] = formats
1336
+ if headers is not None:
1337
+ scrape_params['headers'] = headers
1338
+ if include_tags is not None:
1339
+ scrape_params['includeTags'] = include_tags
1340
+ if exclude_tags is not None:
1341
+ scrape_params['excludeTags'] = exclude_tags
1342
+ if only_main_content is not None:
1343
+ scrape_params['onlyMainContent'] = only_main_content
1344
+ if wait_for is not None:
1345
+ scrape_params['waitFor'] = wait_for
1346
+ if timeout is not None:
1347
+ scrape_params['timeout'] = timeout
1348
+ if location is not None:
1349
+ scrape_params['location'] = location.dict(exclude_none=True)
1350
+ if mobile is not None:
1351
+ scrape_params['mobile'] = mobile
1352
+ if skip_tls_verification is not None:
1353
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1354
+ if remove_base64_images is not None:
1355
+ scrape_params['removeBase64Images'] = remove_base64_images
1356
+ if block_ads is not None:
1357
+ scrape_params['blockAds'] = block_ads
1358
+ if proxy is not None:
1359
+ scrape_params['proxy'] = proxy
1360
+ if extract is not None:
1361
+ if hasattr(extract.schema, 'schema'):
1362
+ extract.schema = extract.schema.schema()
1363
+ scrape_params['extract'] = extract.dict(exclude_none=True)
1364
+ if json_options is not None:
1365
+ if hasattr(json_options.schema, 'schema'):
1366
+ json_options.schema = json_options.schema.schema()
1367
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
1368
+ if actions is not None:
1369
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1370
+ if agent is not None:
1371
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1372
+
1373
+ # Add any additional kwargs
1374
+ scrape_params.update(kwargs)
1375
+
1376
+ # Create final params object
1377
+ final_params = ScrapeParams(**scrape_params)
1378
+ params_dict = final_params.dict(exclude_none=True)
1379
+ params_dict['urls'] = urls
1380
+ params_dict['origin'] = f"python-sdk@{version}"
1381
+
1382
+ # Make request
549
1383
  headers = self._prepare_headers(idempotency_key)
550
- json_data = {'urls': urls}
551
- if params:
552
- json_data.update(params)
553
- response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
1384
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1385
+
554
1386
  if response.status_code == 200:
555
1387
  try:
556
- return response.json()
1388
+ return BatchScrapeResponse(**response.json())
557
1389
  except:
558
1390
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
559
1391
  else:
560
1392
  self._handle_error(response, 'start batch scrape job')
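A minimal sketch of the non-blocking variant, which returns immediately with a job handle (field names follow the docstring above; the status check reuses check_batch_scrape_status, defined further down):

    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-YOUR-KEY")
    job = app.async_batch_scrape_urls(
        ["https://example.com/a", "https://example.com/b"],
        formats=["markdown"],
    )
    if job.success and job.id:
        # Poll later, from this process or another one that knows the job id.
        later = app.check_batch_scrape_status(job.id)
        print(later.status, later.completed, later.total)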
561
1393
 
562
- def batch_scrape_urls_and_watch(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
1394
+ def batch_scrape_urls_and_watch(
1395
+ self,
1396
+ urls: List[str],
1397
+ *,
1398
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1399
+ headers: Optional[Dict[str, str]] = None,
1400
+ include_tags: Optional[List[str]] = None,
1401
+ exclude_tags: Optional[List[str]] = None,
1402
+ only_main_content: Optional[bool] = None,
1403
+ wait_for: Optional[int] = None,
1404
+ timeout: Optional[int] = None,
1405
+ location: Optional[LocationConfig] = None,
1406
+ mobile: Optional[bool] = None,
1407
+ skip_tls_verification: Optional[bool] = None,
1408
+ remove_base64_images: Optional[bool] = None,
1409
+ block_ads: Optional[bool] = None,
1410
+ proxy: Optional[Literal["basic", "stealth"]] = None,
1411
+ extract: Optional[ExtractConfig] = None,
1412
+ json_options: Optional[ExtractConfig] = None,
1413
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1414
+ agent: Optional[AgentOptions] = None,
1415
+ idempotency_key: Optional[str] = None,
1416
+ **kwargs
1417
+ ) -> 'CrawlWatcher':
563
1418
  """
564
1419
  Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
565
1420
 
566
1421
  Args:
567
- urls (List[str]): The URLs to scrape.
568
- params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
569
- idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
1422
+ urls (List[str]): URLs to scrape
1423
+ formats (Optional[List[Literal]]): Content formats to retrieve
1424
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1425
+ include_tags (Optional[List[str]]): HTML tags to include
1426
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1427
+ only_main_content (Optional[bool]): Extract main content only
1428
+ wait_for (Optional[int]): Wait time in milliseconds
1429
+ timeout (Optional[int]): Request timeout in milliseconds
1430
+ location (Optional[LocationConfig]): Location configuration
1431
+ mobile (Optional[bool]): Use mobile user agent
1432
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1433
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1434
+ block_ads (Optional[bool]): Block advertisements
1435
+ proxy (Optional[Literal]): Proxy type to use
1436
+ extract (Optional[ExtractConfig]): Content extraction config
1437
+ json_options (Optional[ExtractConfig]): JSON extraction config
1438
+ actions (Optional[List[Union]]): Actions to perform
1439
+ agent (Optional[AgentOptions]): Agent configuration
1440
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1441
+ **kwargs: Additional parameters to pass to the API
570
1442
 
571
1443
  Returns:
572
- CrawlWatcher: An instance of CrawlWatcher to monitor the batch scrape job.
1444
+ CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
1445
+
1446
+ Raises:
1447
+ Exception: If batch scrape job fails to start
573
1448
  """
574
- crawl_response = self.async_batch_scrape_urls(urls, params, idempotency_key)
575
- if crawl_response['success'] and 'id' in crawl_response:
576
- return CrawlWatcher(crawl_response['id'], self)
1449
+ scrape_params = {}
1450
+
1451
+ # Add individual parameters
1452
+ if formats is not None:
1453
+ scrape_params['formats'] = formats
1454
+ if headers is not None:
1455
+ scrape_params['headers'] = headers
1456
+ if include_tags is not None:
1457
+ scrape_params['includeTags'] = include_tags
1458
+ if exclude_tags is not None:
1459
+ scrape_params['excludeTags'] = exclude_tags
1460
+ if only_main_content is not None:
1461
+ scrape_params['onlyMainContent'] = only_main_content
1462
+ if wait_for is not None:
1463
+ scrape_params['waitFor'] = wait_for
1464
+ if timeout is not None:
1465
+ scrape_params['timeout'] = timeout
1466
+ if location is not None:
1467
+ scrape_params['location'] = location.dict(exclude_none=True)
1468
+ if mobile is not None:
1469
+ scrape_params['mobile'] = mobile
1470
+ if skip_tls_verification is not None:
1471
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1472
+ if remove_base64_images is not None:
1473
+ scrape_params['removeBase64Images'] = remove_base64_images
1474
+ if block_ads is not None:
1475
+ scrape_params['blockAds'] = block_ads
1476
+ if proxy is not None:
1477
+ scrape_params['proxy'] = proxy
1478
+ if extract is not None:
1479
+ if hasattr(extract.schema, 'schema'):
1480
+ extract.schema = extract.schema.schema()
1481
+ scrape_params['extract'] = extract.dict(exclude_none=True)
1482
+ if json_options is not None:
1483
+ if hasattr(json_options.schema, 'schema'):
1484
+ json_options.schema = json_options.schema.schema()
1485
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
1486
+ if actions is not None:
1487
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1488
+ if agent is not None:
1489
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1490
+
1491
+ # Add any additional kwargs
1492
+ scrape_params.update(kwargs)
1493
+
1494
+ # Create final params object
1495
+ final_params = ScrapeParams(**scrape_params)
1496
+ params_dict = final_params.dict(exclude_none=True)
1497
+ params_dict['urls'] = urls
1498
+ params_dict['origin'] = f"python-sdk@{version}"
1499
+
1500
+ # Make request
1501
+ headers = self._prepare_headers(idempotency_key)
1502
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1503
+
1504
+ if response.status_code == 200:
1505
+ try:
1506
+ crawl_response = BatchScrapeResponse(**response.json())
1507
+ if crawl_response.success and crawl_response.id:
1508
+ return CrawlWatcher(crawl_response.id, self)
1509
+ else:
1510
+ raise Exception("Batch scrape job failed to start")
1511
+ except:
1512
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
577
1513
  else:
578
- raise Exception("Batch scrape job failed to start")
1514
+ self._handle_error(response, 'start batch scrape job')
579
1515
 
580
- def check_batch_scrape_status(self, id: str) -> Any:
1516
+ def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
581
1517
  """
582
1518
  Check the status of a batch scrape job using the Firecrawl API.
583
1519
 
@@ -585,7 +1521,7 @@ class FirecrawlApp:
585
1521
  id (str): The ID of the batch scrape job.
586
1522
 
587
1523
  Returns:
588
- Any: The status of the batch scrape job.
1524
+ BatchScrapeStatusResponse: The status of the batch scrape job.
589
1525
 
590
1526
  Raises:
591
1527
  Exception: If the status check request fails.
@@ -625,29 +1561,21 @@ class FirecrawlApp:
625
1561
  break
626
1562
  status_data['data'] = data
627
1563
 
628
- response = {
1564
+ return BatchScrapeStatusResponse(**{
1565
+ 'success': False if 'error' in status_data else True,
629
1566
  'status': status_data.get('status'),
630
1567
  'total': status_data.get('total'),
631
1568
  'completed': status_data.get('completed'),
632
1569
  'creditsUsed': status_data.get('creditsUsed'),
633
1570
  'expiresAt': status_data.get('expiresAt'),
634
- 'data': status_data.get('data')
635
- }
636
-
637
- if 'error' in status_data:
638
- response['error'] = status_data['error']
639
-
640
- if 'next' in status_data:
641
- response['next'] = status_data['next']
642
-
643
- return {
644
- 'success': False if 'error' in status_data else True,
645
- **response
646
- }
1571
+ 'data': status_data.get('data'),
1572
+ 'next': status_data.get('next'),
1573
+ 'error': status_data.get('error')
1574
+ })
647
1575
  else:
648
1576
  self._handle_error(response, 'check batch scrape status')
649
1577
 
650
- def check_batch_scrape_errors(self, id: str) -> Dict[str, Any]:
1578
+ def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
651
1579
  """
652
1580
  Returns information about batch scrape errors.
653
1581
 
@@ -655,38 +1583,68 @@ class FirecrawlApp:
655
1583
  id (str): The ID of the crawl job.
656
1584
 
657
1585
  Returns:
658
- Dict[str, Any]: Information about crawl errors.
1586
+ CrawlErrorsResponse: A response containing:
1587
+ * errors (List[Dict[str, str]]): List of errors with fields:
1588
+ * id (str): Error ID
1589
+ * timestamp (str): When the error occurred
1590
+ * url (str): URL that caused the error
1591
+ * error (str): Error message
1592
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1593
+
1594
+ Raises:
1595
+ Exception: If the error check request fails
659
1596
  """
660
1597
  headers = self._prepare_headers()
661
1598
  response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
662
1599
  if response.status_code == 200:
663
1600
  try:
664
- return response.json()
1601
+ return CrawlErrorsResponse(**response.json())
665
1602
  except:
666
1603
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
667
1604
  else:
668
1605
  self._handle_error(response, "check batch scrape errors")
669
1606
 
670
- def extract(self, urls: Optional[List[str]] = None, params: Optional[ExtractParams] = None) -> Any:
1607
+ def extract(
1608
+ self,
1609
+ urls: Optional[List[str]] = None,
1610
+ *,
1611
+ prompt: Optional[str] = None,
1612
+ schema: Optional[Any] = None,
1613
+ system_prompt: Optional[str] = None,
1614
+ allow_external_links: Optional[bool] = False,
1615
+ enable_web_search: Optional[bool] = False,
1616
+ show_sources: Optional[bool] = False,
1617
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
671
1618
  """
672
- Extracts information from a URL using the Firecrawl API.
1619
+ Extract structured information from URLs.
673
1620
 
674
1621
  Args:
675
- urls (Optional[List[str]]): The URLs to extract information from.
676
- params (Optional[ExtractParams]): Additional parameters for the extract request.
1622
+ urls (Optional[List[str]]): URLs to extract from
1623
+ prompt (Optional[str]): Custom extraction prompt
1624
+ schema (Optional[Any]): JSON schema/Pydantic model
1625
+ system_prompt (Optional[str]): System context
1626
+ allow_external_links (Optional[bool]): Follow external links
1627
+ enable_web_search (Optional[bool]): Enable web search
1628
+ show_sources (Optional[bool]): Include source URLs
1629
+ agent (Optional[Dict[str, Any]]): Agent configuration
677
1630
 
678
1631
  Returns:
679
- Union[ExtractResponse, ErrorResponse]: The response from the extract operation.
1632
+ ExtractResponse[Any] with:
1633
+ * success (bool): Whether request succeeded
1634
+ * data (Optional[Any]): Extracted data matching schema
1635
+ * error (Optional[str]): Error message if any
1636
+
1637
+ Raises:
1638
+ ValueError: If prompt/schema missing or extraction fails
680
1639
  """
681
1640
  headers = self._prepare_headers()
682
1641
 
683
- if not params or (not params.get('prompt') and not params.get('schema')):
1642
+ if not prompt and not schema:
684
1643
  raise ValueError("Either prompt or schema is required")
685
1644
 
686
- if not urls and not params.get('prompt'):
1645
+ if not urls and not prompt:
687
1646
  raise ValueError("Either urls or prompt is required")
688
1647
 
689
- schema = params.get('schema')
690
1648
  if schema:
691
1649
  if hasattr(schema, 'model_json_schema'):
692
1650
  # Convert Pydantic model to JSON schema
@@ -694,26 +1652,22 @@ class FirecrawlApp:
694
1652
  # Otherwise assume it's already a JSON schema dict
695
1653
 
696
1654
  request_data = {
697
- 'urls': urls,
698
- 'allowExternalLinks': params.get('allow_external_links', params.get('allowExternalLinks', False)),
699
- 'enableWebSearch': params.get('enable_web_search', params.get('enableWebSearch', False)),
700
- 'showSources': params.get('show_sources', params.get('showSources', False)),
1655
+ 'urls': urls or [],
1656
+ 'allowExternalLinks': allow_external_links,
1657
+ 'enableWebSearch': enable_web_search,
1658
+ 'showSources': show_sources,
701
1659
  'schema': schema,
702
- 'origin': 'api-sdk'
1660
+ 'origin': f'python-sdk@{get_version()}'
703
1661
  }
704
1662
 
705
- if not request_data['urls']:
706
- request_data['urls'] = []
707
1663
  # Only add prompt and systemPrompt if they exist
708
- if params.get('prompt'):
709
- request_data['prompt'] = params['prompt']
710
- if params.get('system_prompt'):
711
- request_data['systemPrompt'] = params['system_prompt']
712
- elif params.get('systemPrompt'): # Check legacy field name
713
- request_data['systemPrompt'] = params['systemPrompt']
1664
+ if prompt:
1665
+ request_data['prompt'] = prompt
1666
+ if system_prompt:
1667
+ request_data['systemPrompt'] = system_prompt
714
1668
 
715
- if params.get('agent'):
716
- request_data['agent'] = params['agent']
1669
+ if agent:
1670
+ request_data['agent'] = agent
717
1671
 
718
1672
  try:
719
1673
  # Send the initial extract request
@@ -744,10 +1698,7 @@ class FirecrawlApp:
744
1698
  except:
745
1699
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
746
1700
  if status_data['status'] == 'completed':
747
- if status_data['success']:
748
- return status_data
749
- else:
750
- raise Exception(f'Failed to extract. Error: {status_data["error"]}')
1701
+ return ExtractResponse(**status_data)
751
1702
  elif status_data['status'] in ['failed', 'cancelled']:
752
1703
  raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
753
1704
  else:
@@ -761,9 +1712,9 @@ class FirecrawlApp:
761
1712
  except Exception as e:
762
1713
  raise ValueError(str(e), 500)
763
1714
 
764
- return {'success': False, 'error': "Internal server error."}
1715
+ return ExtractResponse(success=False, error="Internal server error.")
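A sketch of the flattened extract() signature with a Pydantic schema (the model below is purely illustrative; ExtractResponse fields follow the docstring above):

    from pydantic import BaseModel
    from firecrawl import FirecrawlApp

    class ArticleInfo(BaseModel):
        title: str
        author: str

    app = FirecrawlApp(api_key="fc-YOUR-KEY")
    result = app.extract(
        ["https://example.com/blog/some-post"],
        prompt="Extract the article title and author",
        schema=ArticleInfo,
    )
    if result.success:
        print(result.data)
    else:
        print("extract failed:", result.error)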
765
1716
 
766
- def get_extract_status(self, job_id: str) -> Dict[str, Any]:
1717
+ def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
767
1718
  """
768
1719
  Retrieve the status of an extract job.
769
1720
 
@@ -771,7 +1722,7 @@ class FirecrawlApp:
771
1722
  job_id (str): The ID of the extract job.
772
1723
 
773
1724
  Returns:
774
- Dict[str, Any]: The status of the extract job.
1725
+ ExtractResponse[Any]: The status of the extract job.
775
1726
 
776
1727
  Raises:
777
1728
  ValueError: If there is an error retrieving the status.
@@ -781,7 +1732,7 @@ class FirecrawlApp:
781
1732
  response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
782
1733
  if response.status_code == 200:
783
1734
  try:
784
- return response.json()
1735
+ return ExtractResponse(**response.json())
785
1736
  except:
786
1737
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
787
1738
  else:
@@ -789,43 +1740,71 @@ class FirecrawlApp:
789
1740
  except Exception as e:
790
1741
  raise ValueError(str(e), 500)
791
1742
 
792
- def async_extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
1743
+ def async_extract(
1744
+ self,
1745
+ urls: List[str],
1746
+ *,
1747
+ prompt: Optional[str] = None,
1748
+ schema: Optional[Any] = None,
1749
+ system_prompt: Optional[str] = None,
1750
+ allow_external_links: Optional[bool] = False,
1751
+ enable_web_search: Optional[bool] = False,
1752
+ show_sources: Optional[bool] = False,
1753
+ agent: Optional[Dict[str, Any]] = None,
1754
+ idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
793
1755
  """
794
1756
  Initiate an asynchronous extract job.
795
1757
 
796
1758
  Args:
797
- urls (List[str]): The URLs to extract data from.
798
- params (Optional[Dict[str, Any]]): Additional parameters for the extract request.
799
- idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
1759
+ urls (List[str]): URLs to extract information from
1760
+ prompt (Optional[str]): Custom extraction prompt
1761
+ schema (Optional[Any]): JSON schema/Pydantic model
1762
+ system_prompt (Optional[str]): System context
1763
+ allow_external_links (Optional[bool]): Follow external links
1764
+ enable_web_search (Optional[bool]): Enable web search
1765
+ show_sources (Optional[bool]): Include source URLs
1766
+ agent (Optional[Dict[str, Any]]): Agent configuration
1767
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
800
1768
 
801
1769
  Returns:
802
- Dict[str, Any]: The response from the extract operation.
1770
+ ExtractResponse[Any] with:
1771
+ * success (bool): Whether request succeeded
1772
+ * data (Optional[Any]): Extracted data matching schema
1773
+ * error (Optional[str]): Error message if any
803
1774
 
804
1775
  Raises:
805
- ValueError: If there is an error initiating the extract job.
1776
+ ValueError: If job initiation fails
806
1777
  """
807
1778
  headers = self._prepare_headers(idempotency_key)
808
1779
 
809
- schema = params.get('schema') if params else None
1780
+ schema = schema
810
1781
  if schema:
811
1782
  if hasattr(schema, 'model_json_schema'):
812
1783
  # Convert Pydantic model to JSON schema
813
1784
  schema = schema.model_json_schema()
814
1785
  # Otherwise assume it's already a JSON schema dict
815
1786
 
816
- jsonData = {'urls': urls, **(params or {})}
817
1787
  request_data = {
818
- **jsonData,
819
- 'allowExternalLinks': params.get('allow_external_links', False) if params else False,
1788
+ 'urls': urls,
1789
+ 'allowExternalLinks': allow_external_links,
1790
+ 'enableWebSearch': enable_web_search,
1791
+ 'showSources': show_sources,
820
1792
  'schema': schema,
821
- 'origin': 'api-sdk'
1793
+ 'origin': f'python-sdk@{version}'
822
1794
  }
823
1795
 
1796
+ if prompt:
1797
+ request_data['prompt'] = prompt
1798
+ if system_prompt:
1799
+ request_data['systemPrompt'] = system_prompt
1800
+ if agent:
1801
+ request_data['agent'] = agent
1802
+
824
1803
  try:
825
1804
  response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
826
1805
  if response.status_code == 200:
827
1806
  try:
828
- return response.json()
1807
+ return ExtractResponse(**response.json())
829
1808
  except:
830
1809
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
831
1810
  else:
@@ -833,34 +1812,44 @@ class FirecrawlApp:
833
1812
  except Exception as e:
834
1813
  raise ValueError(str(e), 500)
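A hedged sketch of the fire-and-poll extract flow; it assumes the /v1/extract response exposes a job id on the returned object (the typed response above only documents success/data/error, so the attribute access is guarded):

    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-YOUR-KEY")
    job = app.async_extract(
        ["https://example.com/pricing"],
        prompt="List the plan names and monthly prices",
    )
    job_id = getattr(job, "id", None)  # assumed field; not part of the documented response shape
    if job_id:
        print(app.get_extract_status(job_id))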
835
1814
 
836
- def generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> Dict[str, Any]:
1815
+ def generate_llms_text(
1816
+ self,
1817
+ url: str,
1818
+ *,
1819
+ max_urls: Optional[int] = None,
1820
+ show_full_text: Optional[bool] = None,
1821
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
837
1822
  """
838
1823
  Generate LLMs.txt for a given URL and poll until completion.
839
1824
 
840
1825
  Args:
841
- url (str): The URL to generate LLMs.txt from.
842
- params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Parameters for the LLMs.txt generation.
1826
+ url (str): Target URL to generate LLMs.txt from
1827
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
1828
+ show_full_text (Optional[bool]): Include full text in output (default: False)
1829
+ experimental_stream (Optional[bool]): Enable experimental streaming
843
1830
 
844
1831
  Returns:
845
- Dict[str, Any]: A dictionary containing the generation results. The structure includes:
846
- - 'success' (bool): Indicates if the generation was successful.
847
- - 'status' (str): The final status of the generation job.
848
- - 'data' (Dict): The generated LLMs.txt data.
849
- - 'error' (Optional[str]): Error message if the generation failed.
850
- - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the data expires.
1832
+ GenerateLLMsTextStatusResponse with:
1833
+ * Generated LLMs.txt content
1834
+ * Full version if requested
1835
+ * Generation status
1836
+ * Success/error information
851
1837
 
852
1838
  Raises:
853
- Exception: If the generation job fails or an error occurs during status checks.
1839
+ Exception: If generation fails
854
1840
  """
855
- if params is None:
856
- params = {}
857
-
858
- if isinstance(params, dict):
859
- generation_params = GenerateLLMsTextParams(**params)
860
- else:
861
- generation_params = params
1841
+ params = GenerateLLMsTextParams(
1842
+ maxUrls=max_urls,
1843
+ showFullText=show_full_text,
1844
+ __experimental_stream=experimental_stream
1845
+ )
862
1846
 
863
- response = self.async_generate_llms_text(url, generation_params)
1847
+ response = self.async_generate_llms_text(
1848
+ url,
1849
+ max_urls=max_urls,
1850
+ show_full_text=show_full_text,
1851
+ experimental_stream=experimental_stream
1852
+ )
864
1853
  if not response.get('success') or 'id' not in response:
865
1854
  return response
866
1855
 
@@ -879,32 +1868,40 @@ class FirecrawlApp:
879
1868
 
880
1869
  return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'}
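A minimal sketch of the blocking LLMs.txt generator (the URL is a placeholder; the call polls internally until the job completes or fails):

    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-YOUR-KEY")
    result = app.generate_llms_text(
        "https://example.com",
        max_urls=5,
        show_full_text=True,
    )
    print(result)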
881
1870
 
882
- def async_generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> Dict[str, Any]:
1871
+ def async_generate_llms_text(
1872
+ self,
1873
+ url: str,
1874
+ *,
1875
+ max_urls: Optional[int] = None,
1876
+ show_full_text: Optional[bool] = None,
1877
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
883
1878
  """
884
1879
  Initiate an asynchronous LLMs.txt generation operation.
885
1880
 
886
1881
  Args:
887
- url (str): The URL to generate LLMs.txt from.
888
- params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Parameters for the LLMs.txt generation.
1882
+ url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
1883
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
1884
+ show_full_text (Optional[bool]): Include full text in output (default: False)
1885
+ experimental_stream (Optional[bool]): Enable experimental streaming
889
1886
 
890
1887
  Returns:
891
- Dict[str, Any]: A dictionary containing the generation initiation response. The structure includes:
892
- - 'success' (bool): Indicates if the generation initiation was successful.
893
- - 'id' (str): The unique identifier for the generation job.
1888
+ GenerateLLMsTextResponse: A response containing:
1889
+ * success (bool): Whether the generation initiation was successful
1890
+ * id (str): The unique identifier for the generation job
1891
+ * error (str, optional): Error message if initiation failed
894
1892
 
895
1893
  Raises:
896
1894
  Exception: If the generation job initiation fails.
897
1895
  """
898
- if params is None:
899
- params = {}
900
-
901
- if isinstance(params, dict):
902
- generation_params = GenerateLLMsTextParams(**params)
903
- else:
904
- generation_params = params
1896
+ params = GenerateLLMsTextParams(
1897
+ maxUrls=max_urls,
1898
+ showFullText=show_full_text,
1899
+ __experimental_stream=experimental_stream
1900
+ )
905
1901
 
906
1902
  headers = self._prepare_headers()
907
- json_data = {'url': url, **generation_params.dict(exclude_none=True)}
1903
+ json_data = {'url': url, **params.dict(exclude_none=True)}
1904
+ json_data['origin'] = f"python-sdk@{version}"
908
1905
 
909
1906
  try:
910
1907
  response = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
@@ -920,15 +1917,22 @@ class FirecrawlApp:
920
1917
 
921
1918
  return {'success': False, 'error': 'Internal server error'}
922
1919
 
923
- def check_generate_llms_text_status(self, id: str) -> Dict[str, Any]:
1920
+ def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
924
1921
  """
925
1922
  Check the status of a LLMs.txt generation operation.
926
1923
 
927
1924
  Args:
928
- id (str): The ID of the LLMs.txt generation operation.
1925
+ id (str): The unique identifier of the LLMs.txt generation job to check status for.
929
1926
 
930
1927
  Returns:
931
- Dict[str, Any]: The current status and results of the generation operation.
1928
+ GenerateLLMsTextStatusResponse: A response containing:
1929
+ * success (bool): Whether the generation was successful
1930
+ * status (str): Status of generation ("processing", "completed", "failed")
1931
+ * data (Dict[str, str], optional): Generated text with fields:
1932
+ * llmstxt (str): Generated LLMs.txt content
1933
+ * llmsfulltxt (str, optional): Full version if requested
1934
+ * error (str, optional): Error message if generation failed
1935
+ * expiresAt (str): When the generated data expires
932
1936
 
933
1937
  Raises:
934
1938
  Exception: If the status check fails.
@@ -950,7 +1954,9 @@ class FirecrawlApp:
950
1954
 
951
1955
  return {'success': False, 'error': 'Internal server error'}
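A hedged sketch of the non-blocking LLMs.txt flow; some code paths above return plain dicts rather than the typed response, so the job id lookup is defensive:

    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-YOUR-KEY")
    started = app.async_generate_llms_text("https://example.com", max_urls=5)
    job_id = started.id if hasattr(started, "id") else started.get("id")
    if job_id:
        print(app.check_generate_llms_text_status(job_id))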
952
1956
 
953
- def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
1957
+ def _prepare_headers(
1958
+ self,
1959
+ idempotency_key: Optional[str] = None) -> Dict[str, str]:
954
1960
  """
955
1961
  Prepare the headers for API requests.
956
1962
 
@@ -972,11 +1978,13 @@ class FirecrawlApp:
972
1978
  'Authorization': f'Bearer {self.api_key}',
973
1979
  }
974
1980
 
975
- def _post_request(self, url: str,
976
- data: Dict[str, Any],
977
- headers: Dict[str, str],
978
- retries: int = 3,
979
- backoff_factor: float = 0.5) -> requests.Response:
1981
+ def _post_request(
1982
+ self,
1983
+ url: str,
1984
+ data: Dict[str, Any],
1985
+ headers: Dict[str, str],
1986
+ retries: int = 3,
1987
+ backoff_factor: float = 0.5) -> requests.Response:
980
1988
  """
981
1989
  Make a POST request with retries.
982
1990
 
@@ -1001,10 +2009,12 @@ class FirecrawlApp:
1001
2009
  return response
1002
2010
  return response
1003
2011
 
1004
- def _get_request(self, url: str,
1005
- headers: Dict[str, str],
1006
- retries: int = 3,
1007
- backoff_factor: float = 0.5) -> requests.Response:
2012
+ def _get_request(
2013
+ self,
2014
+ url: str,
2015
+ headers: Dict[str, str],
2016
+ retries: int = 3,
2017
+ backoff_factor: float = 0.5) -> requests.Response:
1008
2018
  """
1009
2019
  Make a GET request with retries.
1010
2020
 
@@ -1028,10 +2038,12 @@ class FirecrawlApp:
1028
2038
  return response
1029
2039
  return response
1030
2040
 
1031
- def _delete_request(self, url: str,
1032
- headers: Dict[str, str],
1033
- retries: int = 3,
1034
- backoff_factor: float = 0.5) -> requests.Response:
2041
+ def _delete_request(
2042
+ self,
2043
+ url: str,
2044
+ headers: Dict[str, str],
2045
+ retries: int = 3,
2046
+ backoff_factor: float = 0.5) -> requests.Response:
1035
2047
  """
1036
2048
  Make a DELETE request with retries.
1037
2049
 
@@ -1055,16 +2067,21 @@ class FirecrawlApp:
1055
2067
  return response
1056
2068
  return response
1057
2069
 
1058
- def _monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int) -> Any:
2070
+ def _monitor_job_status(
2071
+ self,
2072
+ id: str,
2073
+ headers: Dict[str, str],
2074
+ poll_interval: int) -> CrawlStatusResponse:
1059
2075
  """
1060
2076
  Monitor the status of a crawl job until completion.
1061
2077
 
1062
2078
  Args:
1063
2079
  id (str): The ID of the crawl job.
1064
2080
  headers (Dict[str, str]): The headers to include in the status check requests.
1065
- poll_interval (int): Secounds between status checks.
2081
+ poll_interval (int): Seconds between status checks.
2082
+
1066
2083
  Returns:
1067
- Any: The crawl results if the job is completed successfully.
2084
+ CrawlStatusResponse: The crawl results if the job is completed successfully.
1068
2085
 
1069
2086
  Raises:
1070
2087
  Exception: If the job fails or an error occurs during status checks.
@@ -1091,7 +2108,7 @@ class FirecrawlApp:
1091
2108
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
1092
2109
  data.extend(status_data.get('data', []))
1093
2110
  status_data['data'] = data
1094
- return status_data
2111
+ return CrawlStatusResponse(**status_data)
1095
2112
  else:
1096
2113
  raise Exception('Crawl job completed but no data was returned')
1097
2114
  elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
@@ -1102,7 +2119,10 @@ class FirecrawlApp:
1102
2119
  else:
1103
2120
  self._handle_error(status_response, 'check crawl status')
1104
2121
 
1105
- def _handle_error(self, response: requests.Response, action: str) -> None:
2122
+ def _handle_error(
2123
+ self,
2124
+ response: requests.Response,
2125
+ action: str) -> None:
1106
2126
  """
1107
2127
  Handle errors from API responses.
1108
2128
 
@@ -1119,49 +2139,100 @@ class FirecrawlApp:
1119
2139
  except:
1120
2140
  raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
1121
2141
 
1122
-
1123
- if response.status_code == 402:
1124
- message = f"Payment Required: Failed to {action}. {error_message} - {error_details}"
1125
- elif response.status_code == 403:
1126
- message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
1127
- elif response.status_code == 408:
1128
- message = f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
1129
- elif response.status_code == 409:
1130
- message = f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
1131
- elif response.status_code == 500:
1132
- message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
1133
- else:
1134
- message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message} - {error_details}"
2142
+ message = self._get_error_message(response.status_code, action, error_message, error_details)
1135
2143
 
1136
2144
  # Raise an HTTPError with the custom message and attach the response
1137
2145
  raise requests.exceptions.HTTPError(message, response=response)
1138
2146
 
1139
- def deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None,
1140
- on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
1141
- on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> Dict[str, Any]:
2147
+ def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2148
+ """
2149
+ Generate a standardized error message based on HTTP status code.
2150
+
2151
+ Args:
2152
+ status_code (int): The HTTP status code from the response
2153
+ action (str): Description of the action that was being performed
2154
+ error_message (str): The error message from the API response
2155
+ error_details (str): Additional error details from the API response
2156
+
2157
+ Returns:
2158
+ str: A formatted error message
2159
+ """
2160
+ if status_code == 402:
2161
+ return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
2162
+ elif status_code == 403:
2163
+ message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
2164
+ elif status_code == 408:
2165
+ return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
2166
+ elif status_code == 409:
2167
+ return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
2168
+ elif status_code == 500:
2169
+ return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
2170
+ else:
2171
+ return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
2172
+
2173
+ def deep_research(
2174
+ self,
2175
+ query: str,
2176
+ *,
2177
+ max_depth: Optional[int] = None,
2178
+ time_limit: Optional[int] = None,
2179
+ max_urls: Optional[int] = None,
2180
+ analysis_prompt: Optional[str] = None,
2181
+ system_prompt: Optional[str] = None,
2182
+ __experimental_stream_steps: Optional[bool] = None,
2183
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
2184
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
1142
2185
  """
1143
2186
  Initiates a deep research operation on a given query and polls until completion.
1144
2187
 
1145
2188
  Args:
1146
- query (str): The query to research.
1147
- params (Optional[Union[Dict[str, Any], DeepResearchParams]]): Parameters for the deep research operation.
1148
- on_activity (Optional[Callable[[Dict[str, Any]], None]]): Optional callback to receive activity updates in real-time.
2189
+ query (str): Research query or topic to investigate
2190
+ max_depth (Optional[int]): Maximum depth of research exploration
2191
+ time_limit (Optional[int]): Time limit in seconds for research
2192
+ max_urls (Optional[int]): Maximum number of URLs to process
2193
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2194
+ system_prompt (Optional[str]): Custom system prompt
2195
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2196
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
2197
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
1149
2198
 
1150
2199
  Returns:
1151
- Dict[str, Any]: The final research results.
2200
+ DeepResearchStatusResponse containing:
2201
+ * success (bool): Whether research completed successfully
2202
+ * status (str): Current state (processing/completed/failed)
2203
+ * error (Optional[str]): Error message if failed
2204
+ * id (str): Unique identifier for the research job
2205
+ * data (Any): Research findings and analysis
2206
+ * sources (List[Dict]): List of discovered sources
2207
+ * activities (List[Dict]): Research progress log
2208
+ * summaries (List[str]): Generated research summaries
1152
2209
 
1153
2210
  Raises:
1154
- Exception: If the research operation fails.
2211
+ Exception: If research fails
1155
2212
  """
1156
- if params is None:
1157
- params = {}
1158
-
1159
- if isinstance(params, dict):
1160
- research_params = DeepResearchParams(**params)
1161
- else:
1162
- research_params = params
1163
-
1164
- response = self.async_deep_research(query, research_params)
2213
+ research_params = {}
2214
+ if max_depth is not None:
2215
+ research_params['maxDepth'] = max_depth
2216
+ if time_limit is not None:
2217
+ research_params['timeLimit'] = time_limit
2218
+ if max_urls is not None:
2219
+ research_params['maxUrls'] = max_urls
2220
+ if analysis_prompt is not None:
2221
+ research_params['analysisPrompt'] = analysis_prompt
2222
+ if system_prompt is not None:
2223
+ research_params['systemPrompt'] = system_prompt
2224
+ if __experimental_stream_steps is not None:
2225
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2226
+ research_params = DeepResearchParams(**research_params)
2227
+
2228
+ response = self.async_deep_research(
2229
+ query,
2230
+ max_depth=max_depth,
2231
+ time_limit=time_limit,
2232
+ max_urls=max_urls,
2233
+ analysis_prompt=analysis_prompt,
2234
+ system_prompt=system_prompt
2235
+ )
1165
2236
  if not response.get('success') or 'id' not in response:
1166
2237
  return response
1167
2238
 
@@ -1194,31 +2265,57 @@ class FirecrawlApp:
1194
2265
  time.sleep(2) # Polling interval
1195
2266
 
1196
2267
  return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
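A sketch of the callback-driven research flow (activity keys follow the docstring above; the query and limits are illustrative):

    from firecrawl import FirecrawlApp

    def log_activity(activity):
        # Expected keys per the docstring: type, status, message, timestamp, depth.
        print(f"[depth {activity.get('depth')}] {activity.get('message')}")

    app = FirecrawlApp(api_key="fc-YOUR-KEY")
    research = app.deep_research(
        "State of open-source web crawling frameworks",
        max_depth=2,
        time_limit=120,
        on_activity=log_activity,
    )
    print(research)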
1197
- def async_deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> Dict[str, Any]:
2268
+
2269
+ def async_deep_research(
2270
+ self,
2271
+ query: str,
2272
+ *,
2273
+ max_depth: Optional[int] = None,
2274
+ time_limit: Optional[int] = None,
2275
+ max_urls: Optional[int] = None,
2276
+ analysis_prompt: Optional[str] = None,
2277
+ system_prompt: Optional[str] = None,
2278
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
1198
2279
  """
1199
2280
  Initiates an asynchronous deep research operation.
1200
2281
 
1201
2282
  Args:
1202
- query (str): The query to research.
1203
- params (Optional[Union[Dict[str, Any], DeepResearchParams]]): Parameters for the deep research operation.
2283
+ query (str): Research query or topic to investigate
2284
+ max_depth (Optional[int]): Maximum depth of research exploration
2285
+ time_limit (Optional[int]): Time limit in seconds for research
2286
+ max_urls (Optional[int]): Maximum number of URLs to process
2287
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2288
+ system_prompt (Optional[str]): Custom system prompt
2289
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
1204
2290
 
1205
2291
  Returns:
1206
- Dict[str, Any]: The response from the deep research initiation.
2292
+ Dict[str, Any]: A response containing:
2293
+ * success (bool): Whether the research initiation was successful
2294
+ * id (str): The unique identifier for the research job
2295
+ * error (str, optional): Error message if initiation failed
1207
2296
 
1208
2297
  Raises:
1209
2298
  Exception: If the research initiation fails.
1210
2299
  """
1211
- if params is None:
1212
- params = {}
1213
-
1214
- if isinstance(params, dict):
1215
- research_params = DeepResearchParams(**params)
1216
- else:
1217
- research_params = params
2300
+ research_params = {}
2301
+ if max_depth is not None:
2302
+ research_params['maxDepth'] = max_depth
2303
+ if time_limit is not None:
2304
+ research_params['timeLimit'] = time_limit
2305
+ if max_urls is not None:
2306
+ research_params['maxUrls'] = max_urls
2307
+ if analysis_prompt is not None:
2308
+ research_params['analysisPrompt'] = analysis_prompt
2309
+ if system_prompt is not None:
2310
+ research_params['systemPrompt'] = system_prompt
2311
+ if __experimental_stream_steps is not None:
2312
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2313
+ research_params = DeepResearchParams(**research_params)
1218
2314
 
1219
2315
  headers = self._prepare_headers()
1220
2316
 
1221
2317
  json_data = {'query': query, **research_params.dict(exclude_none=True)}
2318
+ json_data['origin'] = f"python-sdk@{version}"
1222
2319
 
1223
2320
  # Handle json options schema if present
1224
2321
  if 'jsonOptions' in json_data:
@@ -1240,7 +2337,7 @@ class FirecrawlApp:
1240
2337
 
1241
2338
  return {'success': False, 'error': 'Internal server error'}
1242
2339
 
1243
- def check_deep_research_status(self, id: str) -> Dict[str, Any]:
2340
+ def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
1244
2341
  """
1245
2342
  Check the status of a deep research operation.
1246
2343
 
@@ -1248,7 +2345,19 @@ class FirecrawlApp:
1248
2345
  id (str): The ID of the deep research operation.
1249
2346
 
1250
2347
  Returns:
1251
- Dict[str, Any]: The current status and results of the research operation.
2348
+ DeepResearchStatusResponse containing:
2349
+
2350
+ Status:
2351
+ * success - Whether research completed successfully
2352
+ * status - Current state (processing/completed/failed)
2353
+ * error - Error message if failed
2354
+
2355
+ Results:
2356
+ * id - Unique identifier for the research job
2357
+ * data - Research findings and analysis
2358
+ * sources - List of discovered sources
2359
+ * activities - Research progress log
2360
+ * summaries - Generated research summaries
1252
2361
 
1253
2362
  Raises:
1254
2363
  Exception: If the status check fails.
@@ -1271,6 +2380,17 @@ class FirecrawlApp:
1271
2380
  return {'success': False, 'error': 'Internal server error'}
1272
2381
 
1273
2382
  class CrawlWatcher:
2383
+ """
2384
+ A class to watch and handle crawl job events via WebSocket connection.
2385
+
2386
+ Attributes:
2387
+ id (str): The ID of the crawl job to watch
2388
+ app (FirecrawlApp): The FirecrawlApp instance
2389
+ data (List[Dict[str, Any]]): List of crawled documents/data
2390
+ status (str): Current status of the crawl job
2391
+ ws_url (str): WebSocket URL for the crawl job
2392
+ event_handlers (dict): Dictionary of event type to list of handler functions
2393
+ """
1274
2394
  def __init__(self, id: str, app: FirecrawlApp):
1275
2395
  self.id = id
1276
2396
  self.app = app
@@ -1283,25 +2403,57 @@ class CrawlWatcher:
1283
2403
  'document': []
1284
2404
  }
1285
2405
 
1286
- async def connect(self):
1287
- async with websockets.connect(self.ws_url, extra_headers={"Authorization": f"Bearer {self.app.api_key}"}) as websocket:
2406
+ async def connect(self) -> None:
2407
+ """
2408
+ Establishes WebSocket connection and starts listening for messages.
2409
+ """
2410
+ async with websockets.connect(
2411
+ self.ws_url,
2412
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
2413
+ ) as websocket:
1288
2414
  await self._listen(websocket)
1289
2415
 
1290
- async def _listen(self, websocket):
2416
+ async def _listen(self, websocket) -> None:
2417
+ """
2418
+ Listens for incoming WebSocket messages and handles them.
2419
+
2420
+ Args:
2421
+ websocket: The WebSocket connection object
2422
+ """
1291
2423
  async for message in websocket:
1292
2424
  msg = json.loads(message)
1293
2425
  await self._handle_message(msg)
1294
2426
 
1295
- def add_event_listener(self, event_type: str, handler):
2427
+ def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
2428
+ """
2429
+ Adds an event handler function for a specific event type.
2430
+
2431
+ Args:
2432
+ event_type (str): Type of event to listen for ('done', 'error', or 'document')
2433
+ handler (Callable): Function to handle the event
2434
+ """
1296
2435
  if event_type in self.event_handlers:
1297
2436
  self.event_handlers[event_type].append(handler)
1298
2437
 
1299
- def dispatch_event(self, event_type: str, detail: Dict[str, Any]):
2438
+ def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
2439
+ """
2440
+ Dispatches an event to all registered handlers for that event type.
2441
+
2442
+ Args:
2443
+ event_type (str): Type of event to dispatch
2444
+ detail (Dict[str, Any]): Event details/data to pass to handlers
2445
+ """
1300
2446
  if event_type in self.event_handlers:
1301
2447
  for handler in self.event_handlers[event_type]:
1302
2448
  handler(detail)
1303
2449
 
1304
- async def _handle_message(self, msg: Dict[str, Any]):
2450
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
2451
+ """
2452
+ Handles incoming WebSocket messages based on their type.
2453
+
2454
+ Args:
2455
+ msg (Dict[str, Any]): The message to handle
2456
+ """
1305
2457
  if msg['type'] == 'done':
1306
2458
  self.status = 'completed'
1307
2459
  self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
@@ -1316,3 +2468,1773 @@ class CrawlWatcher:
1316
2468
  elif msg['type'] == 'document':
1317
2469
  self.data.append(msg['data'])
1318
2470
  self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
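Tying this together with the watch-style batch method above: batch_scrape_urls_and_watch returns a CrawlWatcher whose connect() coroutine is driven by an event loop. The event payload keys ('data', 'id', 'status') match _handle_message above; everything else in the sketch is illustrative:

    import asyncio
    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-YOUR-KEY")
    watcher = app.batch_scrape_urls_and_watch(
        ["https://example.com"], formats=["markdown"]
    )
    watcher.add_event_listener("document", lambda evt: print("document received for job", evt["id"]))
    watcher.add_event_listener("done", lambda evt: print("finished with", len(evt["data"]), "documents"))
    asyncio.run(watcher.connect())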
2471
+
2472
+ class AsyncFirecrawlApp(FirecrawlApp):
2473
+ """
2474
+ Asynchronous version of FirecrawlApp that implements async methods using aiohttp.
2475
+ Provides non-blocking alternatives to all FirecrawlApp operations.
2476
+ """
2477
+
2478
+ async def _async_request(
2479
+ self,
2480
+ method: str,
2481
+ url: str,
2482
+ headers: Dict[str, str],
2483
+ data: Optional[Dict[str, Any]] = None,
2484
+ retries: int = 3,
2485
+ backoff_factor: float = 0.5) -> Dict[str, Any]:
2486
+ """
2487
+ Generic async request method with exponential backoff retry logic.
2488
+
2489
+ Args:
2490
+ method (str): The HTTP method to use (e.g., "GET" or "POST").
2491
+ url (str): The URL to send the request to.
2492
+ headers (Dict[str, str]): Headers to include in the request.
2493
+ data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
2494
+ retries (int): Maximum number of retry attempts (default: 3).
2495
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2496
+ Delay will be backoff_factor * (2 ** retry_count).
2497
+
2498
+ Returns:
2499
+ Dict[str, Any]: The parsed JSON response from the server.
2500
+
2501
+ Raises:
2502
+ aiohttp.ClientError: If the request fails after all retries.
2503
+ Exception: If max retries are exceeded or other errors occur.
2504
+ """
2505
+ async with aiohttp.ClientSession() as session:
2506
+ for attempt in range(retries):
2507
+ try:
2508
+ async with session.request(
2509
+ method=method, url=url, headers=headers, json=data
2510
+ ) as response:
2511
+ if response.status == 502:
2512
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2513
+ continue
2514
+ if response.status >= 300:
2515
+ await self._handle_error(response, f"make {method} request")
2516
+ return await response.json()
2517
+ except aiohttp.ClientError as e:
2518
+ if attempt == retries - 1:
2519
+ raise e
2520
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2521
+ raise Exception("Max retries exceeded")
2522
+
2523
+ async def _async_post_request(
2524
+ self, url: str, data: Dict[str, Any], headers: Dict[str, str],
2525
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2526
+ """
2527
+ Make an async POST request with exponential backoff retry logic.
2528
+
2529
+ Args:
2530
+ url (str): The URL to send the POST request to.
2531
+ data (Dict[str, Any]): The JSON data to include in the request body.
2532
+ headers (Dict[str, str]): Headers to include in the request.
2533
+ retries (int): Maximum number of retry attempts (default: 3).
2534
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2535
+ Delay will be backoff_factor * (2 ** retry_count).
2536
+
2537
+ Returns:
2538
+ Dict[str, Any]: The parsed JSON response from the server.
2539
+
2540
+ Raises:
2541
+ aiohttp.ClientError: If the request fails after all retries.
2542
+ Exception: If max retries are exceeded or other errors occur.
2543
+ """
2544
+ return await self._async_request("POST", url, headers, data, retries, backoff_factor)
2545
+
2546
+ async def _async_get_request(
2547
+ self, url: str, headers: Dict[str, str],
2548
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2549
+ """
2550
+ Make an async GET request with exponential backoff retry logic.
2551
+
2552
+ Args:
2553
+ url (str): The URL to send the GET request to.
2554
+ headers (Dict[str, str]): Headers to include in the request.
2555
+ retries (int): Maximum number of retry attempts (default: 3).
2556
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2557
+ Delay will be backoff_factor * (2 ** retry_count).
2558
+
2559
+ Returns:
2560
+ Dict[str, Any]: The parsed JSON response from the server.
2561
+
2562
+ Raises:
2563
+ aiohttp.ClientError: If the request fails after all retries.
2564
+ Exception: If max retries are exceeded or other errors occur.
2565
+ """
2566
+ return await self._async_request("GET", url, headers, None, retries, backoff_factor)
2567
+
2568
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
2569
+ """
2570
+ Handle errors from async API responses with detailed error messages.
2571
+
2572
+ Args:
2573
+ response (aiohttp.ClientResponse): The response object from the failed request
2574
+ action (str): Description of the action that was being attempted
2575
+
2576
+ Raises:
2577
+ aiohttp.ClientError: With a detailed error message based on the response status:
2578
+ - 402: Payment Required
2579
+ - 408: Request Timeout
2580
+ - 409: Conflict
2581
+ - 500: Internal Server Error
2582
+ - Other: Unexpected error with status code
2583
+ """
2584
+ try:
2585
+ error_data = await response.json()
2586
+ error_message = error_data.get('error', 'No error message provided.')
2587
+ error_details = error_data.get('details', 'No additional error details provided.')
2588
+ except:
2589
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
2590
+
2591
+ message = await self._get_async_error_message(response.status, action, error_message, error_details)
2592
+
2593
+ raise aiohttp.ClientError(message)
2594
+
2595
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2596
+ """
2597
+ Generate a standardized error message based on HTTP status code for async operations.
2598
+
2599
+ Args:
2600
+ status_code (int): The HTTP status code from the response
2601
+ action (str): Description of the action that was being performed
2602
+ error_message (str): The error message from the API response
2603
+ error_details (str): Additional error details from the API response
2604
+
2605
+ Returns:
2606
+ str: A formatted error message
2607
+ """
2608
+ return self._get_error_message(status_code, action, error_message, error_details)
2609
+
2610
+ async def crawl_url_and_watch(
2611
+ self,
2612
+ url: str,
2613
+ params: Optional[CrawlParams] = None,
2614
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2615
+ """
2616
+ Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket.
2617
+
2618
+ Args:
2619
+ url (str): Target URL to start crawling from
2620
+ params (Optional[CrawlParams]): See CrawlParams model for configuration:
2621
+ URL Discovery:
2622
+ * includePaths - Patterns of URLs to include
2623
+ * excludePaths - Patterns of URLs to exclude
2624
+ * maxDepth - Maximum crawl depth
2625
+ * maxDiscoveryDepth - Maximum depth for finding new URLs
2626
+ * limit - Maximum pages to crawl
2627
+
2628
+ Link Following:
2629
+ * allowBackwardLinks - Follow parent directory links
2630
+ * allowExternalLinks - Follow external domain links
2631
+ * ignoreSitemap - Skip sitemap.xml processing
2632
+
2633
+ Advanced:
2634
+ * scrapeOptions - Page scraping configuration
2635
+ * webhook - Notification webhook settings
2636
+ * deduplicateSimilarURLs - Remove similar URLs
2637
+ * ignoreQueryParameters - Ignore URL parameters
2638
+ * regexOnFullURL - Apply regex to full URLs
2639
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2640
+
2641
+ Returns:
2642
+ AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket
2643
+
2644
+ Raises:
2645
+ Exception: If crawl job fails to start
2646
+ """
2647
+ # async_crawl_url takes keyword-only options, so unpack the optional params model
+ crawl_response = await self.async_crawl_url(url, idempotency_key=idempotency_key, **(params.dict(exclude_none=True) if params else {}))
+ if crawl_response.success and crawl_response.id:
+ return AsyncCrawlWatcher(crawl_response.id, self)
+ else:
+ raise Exception("Crawl job failed to start")
2652
+
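+ # Example usage (illustrative sketch; assumes a configured AsyncFirecrawlApp named `app`
+ # and that the watcher exposes the event-listener API of CrawlWatcher):
+ #   watcher = await app.crawl_url_and_watch("https://example.com", CrawlParams(limit=5))
+ #   watcher.add_event_listener("done", lambda detail: print("crawl finished:", detail["id"]))
+ #   await watcher.connect()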
2653
+ async def batch_scrape_urls_and_watch(
2654
+ self,
2655
+ urls: List[str],
2656
+ params: Optional[ScrapeParams] = None,
2657
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2658
+ """
2659
+ Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress.
2660
+
2661
+ Args:
2662
+ urls (List[str]): List of URLs to scrape
2663
+ params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
2664
+
2665
+ Content Options:
2666
+ * formats - Content formats to retrieve
2667
+ * includeTags - HTML tags to include
2668
+ * excludeTags - HTML tags to exclude
2669
+ * onlyMainContent - Extract main content only
2670
+
2671
+ Request Options:
2672
+ * headers - Custom HTTP headers
2673
+ * timeout - Request timeout (ms)
2674
+ * mobile - Use mobile user agent
2675
+ * proxy - Proxy type
2676
+
2677
+ Extraction Options:
2678
+ * extract - Content extraction config
2679
+ * jsonOptions - JSON extraction config
2680
+ * actions - Actions to perform
2681
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2682
+
2683
+ Returns:
2684
+ AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket
2685
+
2686
+ Raises:
2687
+ Exception: If batch scrape job fails to start
2688
+ """
2689
+ # async_batch_scrape_urls takes keyword-only options, so unpack the optional params model
+ batch_response = await self.async_batch_scrape_urls(urls, idempotency_key=idempotency_key, **(params.dict(exclude_none=True) if params else {}))
+ if batch_response.success and batch_response.id:
+ return AsyncCrawlWatcher(batch_response.id, self)
+ else:
+ raise Exception("Batch scrape job failed to start")
2694
+
2695
+ async def scrape_url(
2696
+ self,
2697
+ url: str,
2698
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
2699
+ include_tags: Optional[List[str]] = None,
2700
+ exclude_tags: Optional[List[str]] = None,
2701
+ only_main_content: Optional[bool] = None,
2702
+ wait_for: Optional[int] = None,
2703
+ timeout: Optional[int] = None,
2704
+ location: Optional[LocationConfig] = None,
2705
+ mobile: Optional[bool] = None,
2706
+ skip_tls_verification: Optional[bool] = None,
2707
+ remove_base64_images: Optional[bool] = None,
2708
+ block_ads: Optional[bool] = None,
2709
+ proxy: Optional[Literal["basic", "stealth"]] = None,
2710
+ extract: Optional[ExtractConfig] = None,
2711
+ json_options: Optional[ExtractConfig] = None,
2712
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None) -> ScrapeResponse[Any]:
2713
+ """
2714
+ Scrape and extract content from a URL asynchronously.
2715
+
2716
+ Args:
2717
+ url (str): Target URL to scrape
2718
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
2719
+ include_tags (Optional[List[str]]): HTML tags to include
2720
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
2721
+ only_main_content (Optional[bool]): Extract main content only
2722
+ wait_for (Optional[int]): Time in milliseconds to wait before scraping
2723
+ timeout (Optional[int]): Request timeout (ms)
2724
+ location (Optional[LocationConfig]): Location configuration
2725
+ mobile (Optional[bool]): Use mobile user agent
2726
+ skip_tls_verification (Optional[bool]): Skip TLS verification
2727
+ remove_base64_images (Optional[bool]): Remove base64 images
2728
+ block_ads (Optional[bool]): Block ads
2729
+ proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
2730
+ extract (Optional[ExtractConfig]): Content extraction settings
2731
+ json_options (Optional[ExtractConfig]): JSON extraction settings
2732
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
2733
+
2734
+ Returns:
2735
+ ScrapeResponse with:
2736
+ * Requested content formats
2737
+ * Page metadata
2738
+ * Extraction results
2739
+ * Success/error status
2740
+
2741
+ Raises:
2742
+ Exception: If scraping fails
2743
+ """
2744
+ headers = self._prepare_headers()
2745
+
2746
+ # Build scrape parameters
2747
+ scrape_params = {
2748
+ 'url': url,
2749
+ 'origin': f"python-sdk@{version}"
2750
+ }
2751
+
2752
+ # Add optional parameters if provided and not None
2753
+ if formats:
2754
+ scrape_params['formats'] = formats
2755
+ if include_tags:
2756
+ scrape_params['includeTags'] = include_tags
2757
+ if exclude_tags:
2758
+ scrape_params['excludeTags'] = exclude_tags
2759
+ if only_main_content is not None:
2760
+ scrape_params['onlyMainContent'] = only_main_content
2761
+ if wait_for:
2762
+ scrape_params['waitFor'] = wait_for
2763
+ if timeout:
2764
+ scrape_params['timeout'] = timeout
2765
+ if location:
2766
+ scrape_params['location'] = location.dict(exclude_none=True)
2767
+ if mobile is not None:
2768
+ scrape_params['mobile'] = mobile
2769
+ if skip_tls_verification is not None:
2770
+ scrape_params['skipTlsVerification'] = skip_tls_verification
2771
+ if remove_base64_images is not None:
2772
+ scrape_params['removeBase64Images'] = remove_base64_images
2773
+ if block_ads is not None:
2774
+ scrape_params['blockAds'] = block_ads
2775
+ if proxy:
2776
+ scrape_params['proxy'] = proxy
2777
+ if extract:
2778
+ extract_dict = extract.dict(exclude_none=True)
2779
+ if 'schema' in extract_dict and hasattr(extract.schema, 'schema'):
2780
+ extract_dict['schema'] = extract.schema.schema() # Ensure pydantic model schema is converted
2781
+ scrape_params['extract'] = extract_dict
2782
+ if json_options:
2783
+ json_options_dict = json_options.dict(exclude_none=True)
2784
+ if 'schema' in json_options_dict and hasattr(json_options.schema, 'schema'):
2785
+ json_options_dict['schema'] = json_options.schema.schema() # Ensure pydantic model schema is converted
2786
+ scrape_params['jsonOptions'] = json_options_dict
2787
+ if actions:
2788
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
2789
+
2790
+ # Make async request
2791
+ endpoint = '/v1/scrape'
2792
+ response = await self._async_post_request(
2793
+ f'{self.api_url}{endpoint}',
2794
+ scrape_params,
2795
+ headers
2796
+ )
2797
+
2798
+ if response.get('success') and 'data' in response:
2799
+ return ScrapeResponse(**response['data'])
2800
+ elif "error" in response:
2801
+ raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
2802
+ else:
2803
+ # Use the response content directly if possible, otherwise a generic message
2804
+ error_content = response.get('error', str(response))
2805
+ raise Exception(f'Failed to scrape URL. Error: {error_content}')
2806
+
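+ # Example usage (illustrative sketch; assumes `app = AsyncFirecrawlApp(api_key="fc-...")`
+ # inside an async context):
+ #   doc = await app.scrape_url("https://example.com", formats=["markdown", "links"])
+ #   print(doc.metadata)
+ #   print(doc.markdown[:200] if doc.markdown else "no markdown returned")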
2807
+ async def batch_scrape_urls(
2808
+ self,
2809
+ urls: List[str],
2810
+ *,
2811
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
2812
+ headers: Optional[Dict[str, str]] = None,
2813
+ include_tags: Optional[List[str]] = None,
2814
+ exclude_tags: Optional[List[str]] = None,
2815
+ only_main_content: Optional[bool] = None,
2816
+ wait_for: Optional[int] = None,
2817
+ timeout: Optional[int] = None,
2818
+ location: Optional[LocationConfig] = None,
2819
+ mobile: Optional[bool] = None,
2820
+ skip_tls_verification: Optional[bool] = None,
2821
+ remove_base64_images: Optional[bool] = None,
2822
+ block_ads: Optional[bool] = None,
2823
+ proxy: Optional[Literal["basic", "stealth"]] = None,
2824
+ extract: Optional[ExtractConfig] = None,
2825
+ json_options: Optional[ExtractConfig] = None,
2826
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2827
+ agent: Optional[AgentOptions] = None,
2828
+ poll_interval: Optional[int] = 2,
2829
+ idempotency_key: Optional[str] = None,
2830
+ **kwargs
2831
+ ) -> BatchScrapeStatusResponse:
2832
+ """
2833
+ Asynchronously scrape multiple URLs and monitor until completion.
2834
+
2835
+ Args:
2836
+ urls (List[str]): URLs to scrape
2837
+ formats (Optional[List[Literal]]): Content formats to retrieve
2838
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
2839
+ include_tags (Optional[List[str]]): HTML tags to include
2840
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
2841
+ only_main_content (Optional[bool]): Extract main content only
2842
+ wait_for (Optional[int]): Wait time in milliseconds
2843
+ timeout (Optional[int]): Request timeout in milliseconds
2844
+ location (Optional[LocationConfig]): Location configuration
2845
+ mobile (Optional[bool]): Use mobile user agent
2846
+ skip_tls_verification (Optional[bool]): Skip TLS verification
2847
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
2848
+ block_ads (Optional[bool]): Block advertisements
2849
+ proxy (Optional[Literal]): Proxy type to use
2850
+ extract (Optional[ExtractConfig]): Content extraction config
2851
+ json_options (Optional[ExtractConfig]): JSON extraction config
2852
+ actions (Optional[List[Union]]): Actions to perform
2853
+ agent (Optional[AgentOptions]): Agent configuration
2854
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
2855
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2856
+ **kwargs: Additional parameters to pass to the API
2857
+
2858
+ Returns:
2859
+ BatchScrapeStatusResponse with:
2860
+ * Scraping status and progress
2861
+ * Scraped content for each URL
2862
+ * Success/error information
2863
+
2864
+ Raises:
2865
+ Exception: If batch scrape fails
2866
+ """
2867
+ scrape_params = {}
2868
+
2869
+ # Add individual parameters
2870
+ if formats is not None:
2871
+ scrape_params['formats'] = formats
2872
+ if headers is not None:
2873
+ scrape_params['headers'] = headers
2874
+ if include_tags is not None:
2875
+ scrape_params['includeTags'] = include_tags
2876
+ if exclude_tags is not None:
2877
+ scrape_params['excludeTags'] = exclude_tags
2878
+ if only_main_content is not None:
2879
+ scrape_params['onlyMainContent'] = only_main_content
2880
+ if wait_for is not None:
2881
+ scrape_params['waitFor'] = wait_for
2882
+ if timeout is not None:
2883
+ scrape_params['timeout'] = timeout
2884
+ if location is not None:
2885
+ scrape_params['location'] = location.dict(exclude_none=True)
2886
+ if mobile is not None:
2887
+ scrape_params['mobile'] = mobile
2888
+ if skip_tls_verification is not None:
2889
+ scrape_params['skipTlsVerification'] = skip_tls_verification
2890
+ if remove_base64_images is not None:
2891
+ scrape_params['removeBase64Images'] = remove_base64_images
2892
+ if block_ads is not None:
2893
+ scrape_params['blockAds'] = block_ads
2894
+ if proxy is not None:
2895
+ scrape_params['proxy'] = proxy
2896
+ if extract is not None:
2897
+ if hasattr(extract.schema, 'schema'):
2898
+ extract.schema = extract.schema.schema()
2899
+ scrape_params['extract'] = extract.dict(exclude_none=True)
2900
+ if json_options is not None:
2901
+ if hasattr(json_options.schema, 'schema'):
2902
+ json_options.schema = json_options.schema.schema()
2903
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
2904
+ if actions is not None:
2905
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
2906
+ if agent is not None:
2907
+ scrape_params['agent'] = agent.dict(exclude_none=True)
2908
+
2909
+ # Add any additional kwargs
2910
+ scrape_params.update(kwargs)
2911
+
2912
+ # Create final params object
2913
+ final_params = ScrapeParams(**scrape_params)
2914
+ params_dict = final_params.dict(exclude_none=True)
2915
+ params_dict['urls'] = urls
2916
+ params_dict['origin'] = f"python-sdk@{version}"
2917
+
2918
+ # Make request
2919
+ headers = self._prepare_headers(idempotency_key)
2920
+ response = await self._async_post_request(
2921
+ f'{self.api_url}/v1/batch/scrape',
2922
+ params_dict,
2923
+ headers
2924
+ )
2925
+
2926
+ # _async_post_request returns the parsed JSON body, so inspect the dict directly
+ if response.get('success') and 'id' in response:
+ return await self._async_monitor_job_status(response['id'], headers, poll_interval)
+ else:
+ raise Exception(f'Failed to start batch scrape job. Error: {response.get("error", response)}')
2934
+
2935
+
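+ # Example usage (illustrative sketch; assumes a configured `app`):
+ #   status = await app.batch_scrape_urls(
+ #       ["https://example.com", "https://example.org"], formats=["markdown"])
+ #   print(status.get("completed"), "of", status.get("total"), "pages scraped")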
2936
+ async def async_batch_scrape_urls(
2937
+ self,
2938
+ urls: List[str],
2939
+ *,
2940
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
2941
+ headers: Optional[Dict[str, str]] = None,
2942
+ include_tags: Optional[List[str]] = None,
2943
+ exclude_tags: Optional[List[str]] = None,
2944
+ only_main_content: Optional[bool] = None,
2945
+ wait_for: Optional[int] = None,
2946
+ timeout: Optional[int] = None,
2947
+ location: Optional[LocationConfig] = None,
2948
+ mobile: Optional[bool] = None,
2949
+ skip_tls_verification: Optional[bool] = None,
2950
+ remove_base64_images: Optional[bool] = None,
2951
+ block_ads: Optional[bool] = None,
2952
+ proxy: Optional[Literal["basic", "stealth"]] = None,
2953
+ extract: Optional[ExtractConfig] = None,
2954
+ json_options: Optional[ExtractConfig] = None,
2955
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2956
+ agent: Optional[AgentOptions] = None,
2957
+ idempotency_key: Optional[str] = None,
2958
+ **kwargs
2959
+ ) -> BatchScrapeResponse:
2960
+ """
2961
+ Initiate a batch scrape job asynchronously.
2962
+
2963
+ Args:
2964
+ urls (List[str]): URLs to scrape
2965
+ formats (Optional[List[Literal]]): Content formats to retrieve
2966
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
2967
+ include_tags (Optional[List[str]]): HTML tags to include
2968
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
2969
+ only_main_content (Optional[bool]): Extract main content only
2970
+ wait_for (Optional[int]): Wait time in milliseconds
2971
+ timeout (Optional[int]): Request timeout in milliseconds
2972
+ location (Optional[LocationConfig]): Location configuration
2973
+ mobile (Optional[bool]): Use mobile user agent
2974
+ skip_tls_verification (Optional[bool]): Skip TLS verification
2975
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
2976
+ block_ads (Optional[bool]): Block advertisements
2977
+ proxy (Optional[Literal]): Proxy type to use
2978
+ extract (Optional[ExtractConfig]): Content extraction config
2979
+ json_options (Optional[ExtractConfig]): JSON extraction config
2980
+ actions (Optional[List[Union]]): Actions to perform
2981
+ agent (Optional[AgentOptions]): Agent configuration
2982
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2983
+ **kwargs: Additional parameters to pass to the API
2984
+
2985
+ Returns:
2986
+ BatchScrapeResponse with:
2987
+ * success - Whether job started successfully
2988
+ * id - Unique identifier for the job
2989
+ * url - Status check URL
2990
+ * error - Error message if start failed
2991
+
2992
+ Raises:
2993
+ Exception: If job initiation fails
2994
+ """
2995
+ scrape_params = {}
2996
+
2997
+ # Add individual parameters
2998
+ if formats is not None:
2999
+ scrape_params['formats'] = formats
3000
+ if headers is not None:
3001
+ scrape_params['headers'] = headers
3002
+ if include_tags is not None:
3003
+ scrape_params['includeTags'] = include_tags
3004
+ if exclude_tags is not None:
3005
+ scrape_params['excludeTags'] = exclude_tags
3006
+ if only_main_content is not None:
3007
+ scrape_params['onlyMainContent'] = only_main_content
3008
+ if wait_for is not None:
3009
+ scrape_params['waitFor'] = wait_for
3010
+ if timeout is not None:
3011
+ scrape_params['timeout'] = timeout
3012
+ if location is not None:
3013
+ scrape_params['location'] = location.dict(exclude_none=True)
3014
+ if mobile is not None:
3015
+ scrape_params['mobile'] = mobile
3016
+ if skip_tls_verification is not None:
3017
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3018
+ if remove_base64_images is not None:
3019
+ scrape_params['removeBase64Images'] = remove_base64_images
3020
+ if block_ads is not None:
3021
+ scrape_params['blockAds'] = block_ads
3022
+ if proxy is not None:
3023
+ scrape_params['proxy'] = proxy
3024
+ if extract is not None:
3025
+ if hasattr(extract.schema, 'schema'):
3026
+ extract.schema = extract.schema.schema()
3027
+ scrape_params['extract'] = extract.dict(exclude_none=True)
3028
+ if json_options is not None:
3029
+ if hasattr(json_options.schema, 'schema'):
3030
+ json_options.schema = json_options.schema.schema()
3031
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
3032
+ if actions is not None:
3033
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3034
+ if agent is not None:
3035
+ scrape_params['agent'] = agent.dict(exclude_none=True)
3036
+
3037
+ # Add any additional kwargs
3038
+ scrape_params.update(kwargs)
3039
+
3040
+ # Create final params object
3041
+ final_params = ScrapeParams(**scrape_params)
3042
+ params_dict = final_params.dict(exclude_none=True)
3043
+ params_dict['urls'] = urls
3044
+ params_dict['origin'] = f"python-sdk@{version}"
3045
+
3046
+ # Make request
3047
+ headers = self._prepare_headers(idempotency_key)
3048
+ response = await self._async_post_request(
3049
+ f'{self.api_url}/v1/batch/scrape',
3050
+ params_dict,
3051
+ headers
3052
+ )
3053
+
3054
+ # _async_post_request returns the parsed JSON body, so build the model from the dict
+ if response.get('success'):
+ return BatchScrapeResponse(**response)
+ else:
+ raise Exception(f'Failed to start batch scrape job. Error: {response.get("error", response)}')
3061
+
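+ # Example fire-and-check usage (illustrative sketch; assumes a configured `app`):
+ #   job = await app.async_batch_scrape_urls(["https://example.com"], formats=["markdown"])
+ #   status = await app.check_batch_scrape_status(job.id)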
3062
+ async def crawl_url(
3063
+ self,
3064
+ url: str,
3065
+ *,
3066
+ include_paths: Optional[List[str]] = None,
3067
+ exclude_paths: Optional[List[str]] = None,
3068
+ max_depth: Optional[int] = None,
3069
+ max_discovery_depth: Optional[int] = None,
3070
+ limit: Optional[int] = None,
3071
+ allow_backward_links: Optional[bool] = None,
3072
+ allow_external_links: Optional[bool] = None,
3073
+ ignore_sitemap: Optional[bool] = None,
3074
+ scrape_options: Optional[CommonOptions] = None,
3075
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3076
+ deduplicate_similar_urls: Optional[bool] = None,
3077
+ ignore_query_parameters: Optional[bool] = None,
3078
+ regex_on_full_url: Optional[bool] = None,
3079
+ poll_interval: Optional[int] = 2,
3080
+ idempotency_key: Optional[str] = None,
3081
+ **kwargs
3082
+ ) -> CrawlStatusResponse:
3083
+ """
3084
+ Crawl a website starting from a URL.
3085
+
3086
+ Args:
3087
+ url (str): Target URL to start crawling from
3088
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3089
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3090
+ max_depth (Optional[int]): Maximum crawl depth
3091
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3092
+ limit (Optional[int]): Maximum pages to crawl
3093
+ allow_backward_links (Optional[bool]): Follow parent directory links
3094
+ allow_external_links (Optional[bool]): Follow external domain links
3095
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3096
+ scrape_options (Optional[CommonOptions]): Page scraping configuration
3097
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3098
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3099
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3100
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3101
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
3102
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3103
+ **kwargs: Additional parameters to pass to the API
3104
+
3105
+ Returns:
3106
+ CrawlStatusResponse with:
3107
+ * Crawling status and progress
3108
+ * Crawled page contents
3109
+ * Success/error information
3110
+
3111
+ Raises:
3112
+ Exception: If crawl fails
3113
+ """
3114
+ crawl_params = {}
3115
+
3116
+ # Add individual parameters
3117
+ if include_paths is not None:
3118
+ crawl_params['includePaths'] = include_paths
3119
+ if exclude_paths is not None:
3120
+ crawl_params['excludePaths'] = exclude_paths
3121
+ if max_depth is not None:
3122
+ crawl_params['maxDepth'] = max_depth
3123
+ if max_discovery_depth is not None:
3124
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3125
+ if limit is not None:
3126
+ crawl_params['limit'] = limit
3127
+ if allow_backward_links is not None:
3128
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3129
+ if allow_external_links is not None:
3130
+ crawl_params['allowExternalLinks'] = allow_external_links
3131
+ if ignore_sitemap is not None:
3132
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3133
+ if scrape_options is not None:
3134
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3135
+ if webhook is not None:
3136
+ crawl_params['webhook'] = webhook
3137
+ if deduplicate_similar_urls is not None:
3138
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3139
+ if ignore_query_parameters is not None:
3140
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3141
+ if regex_on_full_url is not None:
3142
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3143
+
3144
+ # Add any additional kwargs
3145
+ crawl_params.update(kwargs)
3146
+
3147
+ # Create final params object
3148
+ final_params = CrawlParams(**crawl_params)
3149
+ params_dict = final_params.dict(exclude_none=True)
3150
+ params_dict['url'] = url
3151
+ params_dict['origin'] = f"python-sdk@{version}"
3152
+
3153
+ # Make request
3154
+ headers = self._prepare_headers(idempotency_key)
3155
+ response = await self._async_post_request(
3156
+ f'{self.api_url}/v1/crawl', params_dict, headers)
3157
+
3158
+ # _async_post_request returns the parsed JSON body, so inspect the dict directly
+ if response.get('success') and 'id' in response:
+ return await self._async_monitor_job_status(response['id'], headers, poll_interval)
+ else:
+ raise Exception(f'Failed to start crawl job. Error: {response.get("error", response)}')
3166
+
3167
+
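+ # Example usage (illustrative sketch; assumes a configured `app`):
+ #   result = await app.crawl_url("https://example.com", limit=10, max_depth=2)
+ #   for page in result.get("data", []):
+ #       print(page.get("metadata", {}).get("sourceURL"))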
3168
+ async def async_crawl_url(
3169
+ self,
3170
+ url: str,
3171
+ *,
3172
+ include_paths: Optional[List[str]] = None,
3173
+ exclude_paths: Optional[List[str]] = None,
3174
+ max_depth: Optional[int] = None,
3175
+ max_discovery_depth: Optional[int] = None,
3176
+ limit: Optional[int] = None,
3177
+ allow_backward_links: Optional[bool] = None,
3178
+ allow_external_links: Optional[bool] = None,
3179
+ ignore_sitemap: Optional[bool] = None,
3180
+ scrape_options: Optional[CommonOptions] = None,
3181
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3182
+ deduplicate_similar_urls: Optional[bool] = None,
3183
+ ignore_query_parameters: Optional[bool] = None,
3184
+ regex_on_full_url: Optional[bool] = None,
3185
+ idempotency_key: Optional[str] = None,
3186
+ **kwargs
3187
+ ) -> CrawlResponse:
3188
+ """
3189
+ Start an asynchronous crawl job.
3190
+
3191
+ Args:
3192
+ url (str): Target URL to start crawling from
3193
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3194
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3195
+ max_depth (Optional[int]): Maximum crawl depth
3196
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3197
+ limit (Optional[int]): Maximum pages to crawl
3198
+ allow_backward_links (Optional[bool]): Follow parent directory links
3199
+ allow_external_links (Optional[bool]): Follow external domain links
3200
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3201
+ scrape_options (Optional[CommonOptions]): Page scraping configuration
3202
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3203
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3204
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3205
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3206
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3207
+ **kwargs: Additional parameters to pass to the API
3208
+
3209
+ Returns:
3210
+ CrawlResponse with:
3211
+ * success - Whether crawl started successfully
3212
+ * id - Unique identifier for the crawl job
3213
+ * url - Status check URL for the crawl
3214
+ * error - Error message if start failed
3215
+
3216
+ Raises:
3217
+ Exception: If crawl initiation fails
3218
+ """
3219
+ crawl_params = {}
3220
+
3221
+ # Add individual parameters
3222
+ if include_paths is not None:
3223
+ crawl_params['includePaths'] = include_paths
3224
+ if exclude_paths is not None:
3225
+ crawl_params['excludePaths'] = exclude_paths
3226
+ if max_depth is not None:
3227
+ crawl_params['maxDepth'] = max_depth
3228
+ if max_discovery_depth is not None:
3229
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3230
+ if limit is not None:
3231
+ crawl_params['limit'] = limit
3232
+ if allow_backward_links is not None:
3233
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3234
+ if allow_external_links is not None:
3235
+ crawl_params['allowExternalLinks'] = allow_external_links
3236
+ if ignore_sitemap is not None:
3237
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3238
+ if scrape_options is not None:
3239
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3240
+ if webhook is not None:
3241
+ crawl_params['webhook'] = webhook
3242
+ if deduplicate_similar_urls is not None:
3243
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3244
+ if ignore_query_parameters is not None:
3245
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3246
+ if regex_on_full_url is not None:
3247
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3248
+
3249
+ # Add any additional kwargs
3250
+ crawl_params.update(kwargs)
3251
+
3252
+ # Create final params object
3253
+ final_params = CrawlParams(**crawl_params)
3254
+ params_dict = final_params.dict(exclude_none=True)
3255
+ params_dict['url'] = url
3256
+ params_dict['origin'] = f"python-sdk@{version}"
3257
+
3258
+ # Make request
3259
+ headers = self._prepare_headers(idempotency_key)
3260
+ response = await self._async_post_request(
3261
+ f'{self.api_url}/v1/crawl',
3262
+ params_dict,
3263
+ headers
3264
+ )
3265
+
3266
+ # _async_post_request returns the parsed JSON body, so build the model from the dict
+ if response.get('success'):
+ return CrawlResponse(**response)
+ else:
+ raise Exception(f'Failed to start crawl job. Error: {response.get("error", response)}')
3273
+
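+ # Example usage (illustrative sketch; assumes a configured `app`):
+ #   job = await app.async_crawl_url("https://example.com", limit=5)
+ #   print(job.id, job.url)  # poll later with check_crawl_status(job.id)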
3274
+ async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
3275
+ """
3276
+ Check the status and results of an asynchronous crawl job.
3277
+
3278
+ Args:
3279
+ id (str): Unique identifier for the crawl job
3280
+
3281
+ Returns:
3282
+ CrawlStatusResponse containing:
3283
+ Status Information:
3284
+ * status - Current state (scraping/completed/failed/cancelled)
3285
+ * completed - Number of pages crawled
3286
+ * total - Total pages to crawl
3287
+ * creditsUsed - API credits consumed
3288
+ * expiresAt - Data expiration timestamp
3289
+
3290
+ Results:
3291
+ * data - List of crawled documents
3292
+ * next - URL for next page of results (if paginated)
3293
+ * success - Whether status check succeeded
3294
+ * error - Error message if failed
3295
+
3296
+ Raises:
3297
+ Exception: If status check fails
3298
+ """
3299
+ headers = self._prepare_headers()
3300
+ endpoint = f'/v1/crawl/{id}'
3301
+
3302
+ status_data = await self._async_get_request(
3303
+ f'{self.api_url}{endpoint}',
3304
+ headers
3305
+ )
3306
+
3307
+ if status_data['status'] == 'completed':
3308
+ if 'data' in status_data:
3309
+ data = status_data['data']
3310
+ while 'next' in status_data:
3311
+ if len(status_data['data']) == 0:
3312
+ break
3313
+ next_url = status_data.get('next')
3314
+ if not next_url:
3315
+ logger.warning("Expected 'next' URL is missing.")
3316
+ break
3317
+ next_data = await self._async_get_request(next_url, headers)
3318
+ data.extend(next_data.get('data', []))
3319
+ status_data = next_data
3320
+ status_data['data'] = data
3321
+
3322
+ response = {
3323
+ 'status': status_data.get('status'),
3324
+ 'total': status_data.get('total'),
3325
+ 'completed': status_data.get('completed'),
3326
+ 'creditsUsed': status_data.get('creditsUsed'),
3327
+ 'expiresAt': status_data.get('expiresAt'),
3328
+ 'data': status_data.get('data')
3329
+ }
3330
+
3331
+ if 'error' in status_data:
3332
+ response['error'] = status_data['error']
3333
+
3334
+ if 'next' in status_data:
3335
+ response['next'] = status_data['next']
3336
+
3337
+ return {
3338
+ 'success': False if 'error' in status_data else True,
3339
+ **response
3340
+ }
3341
+
3342
+ async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
3343
+ """
3344
+ Monitor the status of an asynchronous job until completion.
3345
+
3346
+ Args:
3347
+ id (str): The ID of the job to monitor
3348
+ headers (Dict[str, str]): Headers to include in status check requests
3349
+ poll_interval (int): Seconds between status checks (default: 2)
3350
+
3351
+ Returns:
3352
+ CrawlStatusResponse: The job results if completed successfully
3353
+
3354
+ Raises:
3355
+ Exception: If the job fails or an error occurs during status checks
3356
+ """
3357
+ while True:
3358
+ status_data = await self._async_get_request(
3359
+ f'{self.api_url}/v1/crawl/{id}',
3360
+ headers
3361
+ )
3362
+
3363
+ if status_data['status'] == 'completed':
3364
+ if 'data' in status_data:
3365
+ data = status_data['data']
3366
+ while 'next' in status_data:
3367
+ if len(status_data['data']) == 0:
3368
+ break
3369
+ next_url = status_data.get('next')
3370
+ if not next_url:
3371
+ logger.warning("Expected 'next' URL is missing.")
3372
+ break
3373
+ next_data = await self._async_get_request(next_url, headers)
3374
+ data.extend(next_data.get('data', []))
3375
+ status_data = next_data
3376
+ status_data['data'] = data
3377
+ return status_data
3378
+ else:
3379
+ raise Exception('Job completed but no data was returned')
3380
+ elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
3381
+ await asyncio.sleep(max(poll_interval, 2))
3382
+ else:
3383
+ raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
3384
+
3385
+ async def map_url(
3386
+ self,
3387
+ url: str,
3388
+ params: Optional[MapParams] = None) -> MapResponse:
3389
+ """
3390
+ Asynchronously map and discover links from a URL.
3391
+
3392
+ Args:
3393
+ url (str): Target URL to map
3394
+ params (Optional[MapParams]): See MapParams model:
3395
+ Discovery Options:
3396
+ * search - Filter pattern for URLs
3397
+ * ignoreSitemap - Skip sitemap.xml
3398
+ * includeSubdomains - Include subdomain links
3399
+ * sitemapOnly - Only use sitemap.xml
3400
+
3401
+ Limits:
3402
+ * limit - Max URLs to return
3403
+ * timeout - Request timeout (ms)
3404
+
3405
+ Returns:
3406
+ MapResponse with:
3407
+ * Discovered URLs
3408
+ * Success/error status
3409
+
3410
+ Raises:
3411
+ Exception: If mapping fails
3412
+ """
3413
+ headers = self._prepare_headers()
3414
+ json_data = {'url': url}
3415
+ if params:
3416
+ json_data.update(params)
3417
+ json_data['origin'] = f"python-sdk@{version}"
3418
+
3419
+ endpoint = '/v1/map'
3420
+ response = await self._async_post_request(
3421
+ f'{self.api_url}{endpoint}',
3422
+ json_data,
3423
+ headers
3424
+ )
3425
+
3426
+ if response.get('success') and 'links' in response:
3427
+ return response
3428
+ elif 'error' in response:
3429
+ raise Exception(f'Failed to map URL. Error: {response["error"]}')
3430
+ else:
3431
+ raise Exception(f'Failed to map URL. Error: {response}')
3432
+
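+ # Example usage (illustrative sketch; assumes a configured `app`):
+ #   mapped = await app.map_url("https://example.com", params={"limit": 50, "includeSubdomains": True})
+ #   for link in mapped.get("links", []):
+ #       print(link)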
3433
+ async def extract(
3434
+ self,
3435
+ urls: List[str],
3436
+ params: Optional[ExtractParams] = None) -> ExtractResponse[Any]:
3437
+ """
3438
+ Asynchronously extract structured information from URLs.
3439
+
3440
+ Args:
3441
+ urls (List[str]): URLs to extract from
3442
+ params (Optional[ExtractParams]): See ExtractParams model:
3443
+ Extraction Config:
3444
+ * prompt - Custom extraction prompt
3445
+ * schema - JSON schema/Pydantic model
3446
+ * systemPrompt - System context
3447
+
3448
+ Behavior Options:
3449
+ * allowExternalLinks - Follow external links
3450
+ * enableWebSearch - Enable web search
3451
+ * includeSubdomains - Include subdomains
3452
+ * showSources - Include source URLs
3453
+
3454
+ Scraping Options:
3455
+ * scrapeOptions - Page scraping config
3456
+
3457
+ Returns:
3458
+ ExtractResponse with:
3459
+ * Structured data matching schema
3460
+ * Source information if requested
3461
+ * Success/error status
3462
+
3463
+ Raises:
3464
+ ValueError: If prompt/schema missing or extraction fails
3465
+ """
3466
+ headers = self._prepare_headers()
3467
+
3468
+ if not params or (not params.get('prompt') and not params.get('schema')):
3469
+ raise ValueError("Either prompt or schema is required")
3470
+
3471
+ schema = params.get('schema')
3472
+ if schema:
3473
+ if hasattr(schema, 'model_json_schema'):
3474
+ schema = schema.model_json_schema()
3475
+
3476
+ request_data = {
3477
+ 'urls': urls,
3478
+ 'allowExternalLinks': params.get('allow_external_links', params.get('allowExternalLinks', False)),
3479
+ 'enableWebSearch': params.get('enable_web_search', params.get('enableWebSearch', False)),
3480
+ 'showSources': params.get('show_sources', params.get('showSources', False)),
3481
+ 'schema': schema,
3482
+ 'origin': f'python-sdk@{version}'
3483
+ }
3484
+
3485
+ if params.get('prompt'):
3486
+ request_data['prompt'] = params['prompt']
3487
+ if params.get('system_prompt'):
3488
+ request_data['systemPrompt'] = params['system_prompt']
3489
+ elif params.get('systemPrompt'):
3490
+ request_data['systemPrompt'] = params['systemPrompt']
3491
+
3492
+ response = await self._async_post_request(
3493
+ f'{self.api_url}/v1/extract',
3494
+ request_data,
3495
+ headers
3496
+ )
3497
+
3498
+ if response.get('success'):
3499
+ job_id = response.get('id')
3500
+ if not job_id:
3501
+ raise Exception('Job ID not returned from extract request.')
3502
+
3503
+ while True:
3504
+ status_data = await self._async_get_request(
3505
+ f'{self.api_url}/v1/extract/{job_id}',
3506
+ headers
3507
+ )
3508
+
3509
+ if status_data['status'] == 'completed':
3510
+ return status_data
3511
+ elif status_data['status'] in ['failed', 'cancelled']:
3512
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data.get("error")}')
3513
+
3514
+ await asyncio.sleep(2)
3515
+ else:
3516
+ raise Exception(f'Failed to extract. Error: {response.get("error")}')
3517
+
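+ # Example usage (illustrative sketch; assumes a configured `app`; the Pydantic model
+ # below is only an example schema):
+ #   class Article(pydantic.BaseModel):
+ #       title: str
+ #       author: Optional[str] = None
+ #   result = await app.extract(
+ #       ["https://example.com/post"],
+ #       params={"prompt": "Extract the article title and author", "schema": Article})
+ #   print(result.get("data"))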
3518
+ async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
3519
+ """
3520
+ Check the status of an asynchronous batch scrape job.
3521
+
3522
+ Args:
3523
+ id (str): The ID of the batch scrape job
3524
+
3525
+ Returns:
3526
+ BatchScrapeStatusResponse containing:
3527
+ Status Information:
3528
+ * status - Current state (scraping/completed/failed/cancelled)
3529
+ * completed - Number of URLs scraped
3530
+ * total - Total URLs to scrape
3531
+ * creditsUsed - API credits consumed
3532
+ * expiresAt - Data expiration timestamp
3533
+
3534
+ Results:
3535
+ * data - List of scraped documents
3536
+ * next - URL for next page of results (if paginated)
3537
+ * success - Whether status check succeeded
3538
+ * error - Error message if failed
3539
+
3540
+ Raises:
3541
+ Exception: If status check fails
3542
+ """
3543
+ headers = self._prepare_headers()
3544
+ endpoint = f'/v1/batch/scrape/{id}'
3545
+
3546
+ status_data = await self._async_get_request(
3547
+ f'{self.api_url}{endpoint}',
3548
+ headers
3549
+ )
3550
+
3551
+ if status_data['status'] == 'completed':
3552
+ if 'data' in status_data:
3553
+ data = status_data['data']
3554
+ while 'next' in status_data:
3555
+ if len(status_data['data']) == 0:
3556
+ break
3557
+ next_url = status_data.get('next')
3558
+ if not next_url:
3559
+ logger.warning("Expected 'next' URL is missing.")
3560
+ break
3561
+ next_data = await self._async_get_request(next_url, headers)
3562
+ data.extend(next_data.get('data', []))
3563
+ status_data = next_data
3564
+ status_data['data'] = data
3565
+
3566
+ response = {
3567
+ 'status': status_data.get('status'),
3568
+ 'total': status_data.get('total'),
3569
+ 'completed': status_data.get('completed'),
3570
+ 'creditsUsed': status_data.get('creditsUsed'),
3571
+ 'expiresAt': status_data.get('expiresAt'),
3572
+ 'data': status_data.get('data')
3573
+ }
3574
+
3575
+ if 'error' in status_data:
3576
+ response['error'] = status_data['error']
3577
+
3578
+ if 'next' in status_data:
3579
+ response['next'] = status_data['next']
3580
+
3581
+ return {
3582
+ 'success': False if 'error' in status_data else True,
3583
+ **response
3584
+ }
3585
+
3586
+ async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
3587
+ """
3588
+ Get information about errors from an asynchronous batch scrape job.
3589
+
3590
+ Args:
3591
+ id (str): The ID of the batch scrape job
3592
+
3593
+ Returns:
3594
+ CrawlErrorsResponse containing:
3595
+ errors (List[Dict[str, str]]): List of errors with fields:
3596
+ * id (str): Error ID
3597
+ * timestamp (str): When the error occurred
3598
+ * url (str): URL that caused the error
3599
+ * error (str): Error message
3600
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3601
+
3602
+ Raises:
3603
+ Exception: If error check fails
3604
+ """
3605
+ headers = self._prepare_headers()
3606
+ return await self._async_get_request(
3607
+ f'{self.api_url}/v1/batch/scrape/{id}/errors',
3608
+ headers
3609
+ )
3610
+
3611
+ async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
3612
+ """
3613
+ Get information about errors from an asynchronous crawl job.
3614
+
3615
+ Args:
3616
+ id (str): The ID of the crawl job
3617
+
3618
+ Returns:
3619
+ CrawlErrorsResponse containing:
3620
+ * errors (List[Dict[str, str]]): List of errors with fields:
3621
+ - id (str): Error ID
3622
+ - timestamp (str): When the error occurred
3623
+ - url (str): URL that caused the error
3624
+ - error (str): Error message
3625
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3626
+
3627
+ Raises:
3628
+ Exception: If error check fails
3629
+ """
3630
+ headers = self._prepare_headers()
3631
+ return await self._async_get_request(
3632
+ f'{self.api_url}/v1/crawl/{id}/errors',
3633
+ headers
3634
+ )
3635
+
3636
+ async def cancel_crawl(self, id: str) -> Dict[str, Any]:
3637
+ """
3638
+ Cancel an asynchronous crawl job.
3639
+
3640
+ Args:
3641
+ id (str): The ID of the crawl job to cancel
3642
+
3643
+ Returns:
3644
+ Dict[str, Any] containing:
3645
+ * success (bool): Whether cancellation was successful
3646
+ * error (str, optional): Error message if cancellation failed
3647
+
3648
+ Raises:
3649
+ Exception: If cancellation fails
3650
+ """
3651
+ headers = self._prepare_headers()
3652
+ async with aiohttp.ClientSession() as session:
3653
+ async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
3654
+ return await response.json()
3655
+
3656
+ async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
3657
+ """
3658
+ Check the status of an asynchronous extraction job.
3659
+
3660
+ Args:
3661
+ job_id (str): The ID of the extraction job
3662
+
3663
+ Returns:
3664
+ ExtractResponse[Any] with:
3665
+ * success (bool): Whether request succeeded
3666
+ * data (Optional[Any]): Extracted data matching schema
3667
+ * error (Optional[str]): Error message if any
3668
+ * warning (Optional[str]): Warning message if any
3669
+ * sources (Optional[List[str]]): Source URLs if requested
3670
+
3671
+ Raises:
3672
+ ValueError: If status check fails
3673
+ """
3674
+ headers = self._prepare_headers()
3675
+ try:
3676
+ return await self._async_get_request(
3677
+ f'{self.api_url}/v1/extract/{job_id}',
3678
+ headers
3679
+ )
3680
+ except Exception as e:
3681
+ raise ValueError(str(e))
3682
+
3683
+ async def async_extract(
3684
+ self,
3685
+ urls: Optional[List[str]] = None,
3686
+ *,
3687
+ prompt: Optional[str] = None,
3688
+ schema: Optional[Any] = None,
3689
+ system_prompt: Optional[str] = None,
3690
+ allow_external_links: Optional[bool] = False,
3691
+ enable_web_search: Optional[bool] = False,
3692
+ show_sources: Optional[bool] = False,
3693
+ agent: Optional[Dict[str, Any]] = None,
3694
+ idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
3695
+ """
3696
+ Initiate an asynchronous extraction job without waiting for completion.
3697
+
3698
+ Args:
3699
+ urls (Optional[List[str]]): URLs to extract from
3700
+ prompt (Optional[str]): Custom extraction prompt
3701
+ schema (Optional[Any]): JSON schema/Pydantic model
3702
+ system_prompt (Optional[str]): System context
3703
+ allow_external_links (Optional[bool]): Follow external links
3704
+ enable_web_search (Optional[bool]): Enable web search
3705
+ show_sources (Optional[bool]): Include source URLs
3706
+ agent (Optional[Dict[str, Any]]): Agent configuration
3707
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3708
+
3709
+ Returns:
3710
+ ExtractResponse[Any] with:
3711
+ * success (bool): Whether request succeeded
3712
+ * data (Optional[Any]): Extracted data matching schema
3713
+ * error (Optional[str]): Error message if any
3714
+
3715
+ Raises:
3716
+ ValueError: If job initiation fails
3717
+ """
3718
+ headers = self._prepare_headers(idempotency_key)
3719
+
3720
+ if not prompt and not schema:
3721
+ raise ValueError("Either prompt or schema is required")
3722
+
3723
+ if not urls and not prompt:
3724
+ raise ValueError("Either urls or prompt is required")
3725
+
3726
+ if schema:
3727
+ if hasattr(schema, 'model_json_schema'):
3728
+ schema = schema.model_json_schema()
3729
+
3730
+ request_data = {
3731
+ 'urls': urls or [],
3732
+ 'allowExternalLinks': allow_external_links,
3733
+ 'enableWebSearch': enable_web_search,
3734
+ 'showSources': show_sources,
3735
+ 'schema': schema,
3736
+ 'origin': f'python-sdk@{version}'
3737
+ }
3738
+
3739
+ if prompt:
3740
+ request_data['prompt'] = prompt
3741
+ if system_prompt:
3742
+ request_data['systemPrompt'] = system_prompt
3743
+ if agent:
3744
+ request_data['agent'] = agent
3745
+
3746
+ try:
3747
+ return await self._async_post_request(
3748
+ f'{self.api_url}/v1/extract',
3749
+ request_data,
3750
+ headers
3751
+ )
3752
+ except Exception as e:
3753
+ raise ValueError(str(e))
3754
+
3755
+ async def generate_llms_text(
3756
+ self,
3757
+ url: str,
3758
+ *,
3759
+ max_urls: Optional[int] = None,
3760
+ show_full_text: Optional[bool] = None,
3761
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
3762
+ """
3763
+ Generate LLMs.txt for a given URL and monitor until completion.
3764
+
3765
+ Args:
3766
+ url (str): Target URL to generate LLMs.txt from
3767
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
3768
+ show_full_text (Optional[bool]): Include full text in output (default: False)
3769
+ experimental_stream (Optional[bool]): Enable experimental streaming
3770
+
3771
+ Returns:
3772
+ GenerateLLMsTextStatusResponse containing:
3773
+ * success (bool): Whether generation completed successfully
3774
+ * status (str): Status of generation (processing/completed/failed)
3775
+ * data (Dict[str, str], optional): Generated text with fields:
3776
+ - llmstxt (str): Generated LLMs.txt content
3777
+ - llmsfulltxt (str, optional): Full version if requested
3778
+ * error (str, optional): Error message if generation failed
3779
+ * expiresAt (str): When the generated data expires
3780
+
3781
+ Raises:
3782
+ Exception: If generation fails
3783
+ """
3784
+ params = {}
3785
+ if max_urls is not None:
3786
+ params['maxUrls'] = max_urls
3787
+ if show_full_text is not None:
3788
+ params['showFullText'] = show_full_text
3789
+ if experimental_stream is not None:
3790
+ params['__experimental_stream'] = experimental_stream
3791
+
3792
+ response = await self.async_generate_llms_text(
3793
+ url,
3794
+ max_urls=max_urls,
3795
+ show_full_text=show_full_text,
3796
+ experimental_stream=experimental_stream
3797
+ )
3798
+ if not response.get('success') or 'id' not in response:
3799
+ return response
3800
+
3801
+ job_id = response['id']
3802
+ while True:
3803
+ status = await self.check_generate_llms_text_status(job_id)
3804
+
3805
+ if status['status'] == 'completed':
3806
+ return status
3807
+ elif status['status'] == 'failed':
3808
+ raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
3809
+ elif status['status'] != 'processing':
3810
+ break
3811
+
3812
+ await asyncio.sleep(2)
3813
+
3814
+ return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'}
3815
+
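+ # Example usage (illustrative sketch; assumes a configured `app`):
+ #   result = await app.generate_llms_text("https://example.com", max_urls=5)
+ #   if result.get("success"):
+ #       print(result["data"]["llmstxt"][:200])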
3816
+ async def async_generate_llms_text(
3817
+ self,
3818
+ url: str,
3819
+ *,
3820
+ max_urls: Optional[int] = None,
3821
+ show_full_text: Optional[bool] = None,
3822
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
3823
+ """
3824
+ Initiate an asynchronous LLMs.txt generation job without waiting for completion.
3825
+
3826
+ Args:
3827
+ url (str): Target URL to generate LLMs.txt from
3828
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
3829
+ show_full_text (Optional[bool]): Include full text in output (default: False)
3830
+ experimental_stream (Optional[bool]): Enable experimental streaming
3831
+
3832
+ Returns:
3833
+ GenerateLLMsTextResponse containing:
3834
+ * success (bool): Whether job started successfully
3835
+ * id (str): Unique identifier for the job
3836
+ * error (str, optional): Error message if start failed
3837
+
3838
+ Raises:
3839
+ ValueError: If job initiation fails
3840
+ """
3841
+ params = {}
3842
+ if max_urls is not None:
3843
+ params['maxUrls'] = max_urls
3844
+ if show_full_text is not None:
3845
+ params['showFullText'] = show_full_text
3846
+ if experimental_stream is not None:
3847
+ params['__experimental_stream'] = experimental_stream
3848
+
3849
+ headers = self._prepare_headers()
3850
+ json_data = {'url': url, **params}
3851
+ json_data['origin'] = f"python-sdk@{version}"
3852
+
3853
+ try:
3854
+ return await self._async_post_request(
3855
+ f'{self.api_url}/v1/llmstxt',
3856
+ json_data,
3857
+ headers
3858
+ )
3859
+ except Exception as e:
3860
+ raise ValueError(str(e))
3861
+
3862
+ async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
3863
+ """
3864
+ Check the status of an asynchronous LLMs.txt generation job.
3865
+
3866
+ Args:
3867
+ id (str): The ID of the generation job
3868
+
3869
+ Returns:
3870
+ GenerateLLMsTextStatusResponse containing:
3871
+ * success (bool): Whether generation completed successfully
3872
+ * status (str): Status of generation (processing/completed/failed)
3873
+ * data (Dict[str, str], optional): Generated text with fields:
3874
+ - llmstxt (str): Generated LLMs.txt content
3875
+ - llmsfulltxt (str, optional): Full version if requested
3876
+ * error (str, optional): Error message if generation failed
3877
+ * expiresAt (str): When the generated data expires
3878
+
3879
+ Raises:
3880
+ ValueError: If status check fails
3881
+ """
3882
+ headers = self._prepare_headers()
3883
+ try:
3884
+ return await self._async_get_request(
3885
+ f'{self.api_url}/v1/llmstxt/{id}',
3886
+ headers
3887
+ )
3888
+ except Exception as e:
3889
+ raise ValueError(str(e))
3890
+
3891
+ async def deep_research(
3892
+ self,
3893
+ query: str,
3894
+ *,
3895
+ max_depth: Optional[int] = None,
3896
+ time_limit: Optional[int] = None,
3897
+ max_urls: Optional[int] = None,
3898
+ analysis_prompt: Optional[str] = None,
3899
+ system_prompt: Optional[str] = None,
3900
+ __experimental_stream_steps: Optional[bool] = None,
3901
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
3902
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
3903
+ """
3904
+ Initiates a deep research operation on a given query and polls until completion.
3905
+
3906
+ Args:
3907
+ query (str): Research query or topic to investigate
3908
+ max_depth (Optional[int]): Maximum depth of research exploration
3909
+ time_limit (Optional[int]): Time limit in seconds for research
3910
+ max_urls (Optional[int]): Maximum number of URLs to process
3911
+ analysis_prompt (Optional[str]): Custom prompt for analysis
3912
+ system_prompt (Optional[str]): Custom system prompt
3913
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
3914
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
3915
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
3916
+
3917
+ Returns:
3918
+ DeepResearchStatusResponse containing:
3919
+ * success (bool): Whether research completed successfully
3920
+ * status (str): Current state (processing/completed/failed)
3921
+ * error (Optional[str]): Error message if failed
3922
+ * id (str): Unique identifier for the research job
3923
+ * data (Any): Research findings and analysis
3924
+ * sources (List[Dict]): List of discovered sources
3925
+ * activities (List[Dict]): Research progress log
3926
+ * summaries (List[str]): Generated research summaries
3927
+
3928
+ Raises:
3929
+ Exception: If research fails
3930
+ """
3931
+ research_params = {}
3932
+ if max_depth is not None:
3933
+ research_params['maxDepth'] = max_depth
3934
+ if time_limit is not None:
3935
+ research_params['timeLimit'] = time_limit
3936
+ if max_urls is not None:
3937
+ research_params['maxUrls'] = max_urls
3938
+ if analysis_prompt is not None:
3939
+ research_params['analysisPrompt'] = analysis_prompt
3940
+ if system_prompt is not None:
3941
+ research_params['systemPrompt'] = system_prompt
3942
+ if __experimental_stream_steps is not None:
3943
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
3944
+ research_params = DeepResearchParams(**research_params)
3945
+
3946
+ response = await self.async_deep_research(
3947
+ query,
3948
+ max_depth=max_depth,
3949
+ time_limit=time_limit,
3950
+ max_urls=max_urls,
3951
+ analysis_prompt=analysis_prompt,
3952
+ system_prompt=system_prompt
3953
+ )
3954
+ if not response.get('success') or 'id' not in response:
3955
+ return response
3956
+
3957
+ job_id = response['id']
3958
+ last_activity_count = 0
3959
+ last_source_count = 0
3960
+
3961
+ while True:
3962
+ status = await self.check_deep_research_status(job_id)
3963
+
3964
+ if on_activity and 'activities' in status:
3965
+ new_activities = status['activities'][last_activity_count:]
3966
+ for activity in new_activities:
3967
+ on_activity(activity)
3968
+ last_activity_count = len(status['activities'])
3969
+
3970
+ if on_source and 'sources' in status:
3971
+ new_sources = status['sources'][last_source_count:]
3972
+ for source in new_sources:
3973
+ on_source(source)
3974
+ last_source_count = len(status['sources'])
3975
+
3976
+ if status['status'] == 'completed':
3977
+ return status
3978
+ elif status['status'] == 'failed':
3979
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
3980
+ elif status['status'] != 'processing':
3981
+ break
3982
+
3983
+ await asyncio.sleep(2)
3984
+
3985
+ return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
3986
+
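+ # Example usage (illustrative sketch; assumes a configured `app`; callbacks receive
+ # plain dicts as described above):
+ #   def log_activity(activity): print(activity.get("message"))
+ #   report = await app.deep_research(
+ #       "state of open-source web crawlers", max_urls=10, on_activity=log_activity)
+ #   print(report.get("status"), len(report.get("sources", [])))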
3987
+ async def async_deep_research(
3988
+ self,
3989
+ query: str,
3990
+ *,
3991
+ max_depth: Optional[int] = None,
3992
+ time_limit: Optional[int] = None,
3993
+ max_urls: Optional[int] = None,
3994
+ analysis_prompt: Optional[str] = None,
3995
+ system_prompt: Optional[str] = None,
3996
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
3997
+ """
3998
+ Initiates an asynchronous deep research operation.
3999
+
4000
+ Args:
4001
+ query (str): Research query or topic to investigate
4002
+ max_depth (Optional[int]): Maximum depth of research exploration
4003
+ time_limit (Optional[int]): Time limit in seconds for research
4004
+ max_urls (Optional[int]): Maximum number of URLs to process
4005
+ analysis_prompt (Optional[str]): Custom prompt for analysis
4006
+ system_prompt (Optional[str]): Custom system prompt
4007
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4008
+
4009
+ Returns:
4010
+ Dict[str, Any]: A response containing:
4011
+ * success (bool): Whether the research initiation was successful
4012
+ * id (str): The unique identifier for the research job
4013
+ * error (str, optional): Error message if initiation failed
4014
+
4015
+ Raises:
4016
+ Exception: If the research initiation fails.
4017
+ """
4018
+ research_params = {}
4019
+ if max_depth is not None:
4020
+ research_params['maxDepth'] = max_depth
4021
+ if time_limit is not None:
4022
+ research_params['timeLimit'] = time_limit
4023
+ if max_urls is not None:
4024
+ research_params['maxUrls'] = max_urls
4025
+ if analysis_prompt is not None:
4026
+ research_params['analysisPrompt'] = analysis_prompt
4027
+ if system_prompt is not None:
4028
+ research_params['systemPrompt'] = system_prompt
4029
+ if __experimental_stream_steps is not None:
4030
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
4031
+ research_params = DeepResearchParams(**research_params)
4032
+
4033
+ headers = self._prepare_headers()
4034
+
4035
+ json_data = {'query': query, **research_params.dict(exclude_none=True)}
4036
+ json_data['origin'] = f"python-sdk@{version}"
4037
+
4038
+ try:
4039
+ return await self._async_post_request(
4040
+ f'{self.api_url}/v1/deep-research',
4041
+ json_data,
4042
+ headers
4043
+ )
4044
+ except Exception as e:
4045
+ raise ValueError(str(e))
4046
+
4047
+ async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
4048
+ """
4049
+ Check the status of a deep research operation.
4050
+
4051
+ Args:
4052
+ id (str): The ID of the deep research operation.
4053
+
4054
+ Returns:
4055
+ DeepResearchStatusResponse containing:
4056
+
4057
+ Status:
4058
+ * success - Whether research completed successfully
4059
+ * status - Current state (processing/completed/failed)
4060
+ * error - Error message if failed
4061
+
4062
+ Results:
4063
+ * id - Unique identifier for the research job
4064
+ * data - Research findings and analysis
4065
+ * sources - List of discovered sources
4066
+ * activities - Research progress log
4067
+ * summaries - Generated research summaries
4068
+
4069
+ Raises:
4070
+ Exception: If the status check fails.
4071
+ """
4072
+ headers = self._prepare_headers()
4073
+ try:
4074
+ return await self._async_get_request(
4075
+ f'{self.api_url}/v1/deep-research/{id}',
4076
+ headers
4077
+ )
4078
+ except Exception as e:
4079
+ raise ValueError(str(e))
4080
+
4081
+ async def search(
4082
+ self,
4083
+ query: str,
4084
+ *,
4085
+ limit: Optional[int] = None,
4086
+ tbs: Optional[str] = None,
4087
+ filter: Optional[str] = None,
4088
+ lang: Optional[str] = None,
4089
+ country: Optional[str] = None,
4090
+ location: Optional[str] = None,
4091
+ timeout: Optional[int] = None,
4092
+ scrape_options: Optional[CommonOptions] = None,
4093
+ params: Optional[Union[Dict[str, Any], SearchParams]] = None,
4094
+ **kwargs) -> SearchResponse:
4095
+ """
4096
+ Asynchronously search for content using Firecrawl.
4097
+
4098
+ Args:
4099
+ query (str): Search query string
4100
+ limit (Optional[int]): Max results (default: 5)
4101
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
4102
+ filter (Optional[str]): Custom result filter
4103
+ lang (Optional[str]): Language code (default: "en")
4104
+ country (Optional[str]): Country code (default: "us")
4105
+ location (Optional[str]): Geo-targeting
4106
+ timeout (Optional[int]): Request timeout in milliseconds
4107
+ scrape_options (Optional[CommonOptions]): Result scraping configuration
4108
+ params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
4109
+ **kwargs: Additional keyword arguments for future compatibility
4110
+
4111
+ Returns:
4112
+ SearchResponse: Response containing:
4113
+ * success (bool): Whether request succeeded
4114
+ * data (List[FirecrawlDocument]): Search results
4115
+ * warning (Optional[str]): Warning message if any
4116
+ * error (Optional[str]): Error message if any
4117
+
4118
+ Raises:
4119
+ Exception: If search fails or response cannot be parsed
4120
+ """
4121
+ # Build search parameters
4122
+ search_params = {}
4123
+ if params:
4124
+ if isinstance(params, dict):
4125
+ search_params.update(params)
4126
+ else:
4127
+ search_params.update(params.dict(exclude_none=True))
4128
+
4129
+ # Add individual parameters
4130
+ if limit is not None:
4131
+ search_params['limit'] = limit
4132
+ if tbs is not None:
4133
+ search_params['tbs'] = tbs
4134
+ if filter is not None:
4135
+ search_params['filter'] = filter
4136
+ if lang is not None:
4137
+ search_params['lang'] = lang
4138
+ if country is not None:
4139
+ search_params['country'] = country
4140
+ if location is not None:
4141
+ search_params['location'] = location
4142
+ if timeout is not None:
4143
+ search_params['timeout'] = timeout
4144
+ if scrape_options is not None:
4145
+ search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
4146
+
4147
+ # Add any additional kwargs
4148
+ search_params.update(kwargs)
4149
+
4150
+ # Create final params object
4151
+ final_params = SearchParams(query=query, **search_params)
4152
+ params_dict = final_params.dict(exclude_none=True)
4153
+ params_dict['origin'] = f"python-sdk@{version}"
4154
+
4155
+ return await self._async_post_request(
4156
+ f"{self.api_url}/v1/search",
4157
+ params_dict,
4158
+ {"Authorization": f"Bearer {self.api_key}"}
4159
+ )
4160
+
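A usage sketch for the async search above: keyword arguments are folded into SearchParams and posted to /v1/search. The query, filter values, and dict-style access to the response payload are illustrative assumptions, as is reading sourceURL from each document's metadata.

    import asyncio
    from firecrawl import AsyncFirecrawlApp

    async def run_search():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
        results = await app.search(
            "firecrawl python sdk",
            limit=5,
            lang="en",
            country="us",
            tbs="qdr:w",  # restrict results to the past week
        )
        for doc in results.get("data", []):
            # sourceURL is assumed to sit under each document's metadata.
            print(doc.get("metadata", {}).get("sourceURL"))
        return results

    asyncio.run(run_search())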
4161
+ class AsyncCrawlWatcher(CrawlWatcher):
4162
+ """
4163
+ Async version of CrawlWatcher that properly handles async operations.
4164
+ """
4165
+ def __init__(self, id: str, app: AsyncFirecrawlApp):
4166
+ super().__init__(id, app)
4167
+
4168
+ async def connect(self) -> None:
4169
+ """
4170
+ Establishes async WebSocket connection and starts listening for messages.
4171
+ """
4172
+ async with websockets.connect(
4173
+ self.ws_url,
4174
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
4175
+ ) as websocket:
4176
+ await self._listen(websocket)
4177
+
4178
+ async def _listen(self, websocket) -> None:
4179
+ """
4180
+ Listens for incoming WebSocket messages and handles them asynchronously.
4181
+
4182
+ Args:
4183
+ websocket: The WebSocket connection object
4184
+ """
4185
+ async for message in websocket:
4186
+ msg = json.loads(message)
4187
+ await self._handle_message(msg)
4188
+
4189
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
4190
+ """
4191
+ Handles incoming WebSocket messages based on their type asynchronously.
4192
+
4193
+ Args:
4194
+ msg (Dict[str, Any]): The message to handle
4195
+ """
4196
+ if msg['type'] == 'done':
4197
+ self.status = 'completed'
4198
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
4199
+ elif msg['type'] == 'error':
4200
+ self.status = 'failed'
4201
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
4202
+ elif msg['type'] == 'catchup':
4203
+ self.status = msg['data']['status']
4204
+ self.data.extend(msg['data'].get('data', []))
4205
+ for doc in self.data:
4206
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
4207
+ elif msg['type'] == 'document':
4208
+ self.data.append(msg['data'])
4209
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
4210
+
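To tie the watcher to a running crawl, a caller registers handlers for the events dispatched above and then awaits connect(). The add_event_listener registration belongs to the parent CrawlWatcher, which is outside this excerpt, so treat that call and the import path as assumptions.

    import asyncio
    from firecrawl import AsyncFirecrawlApp
    from firecrawl.firecrawl import AsyncCrawlWatcher  # import path may differ

    async def watch_crawl(crawl_id: str):
        app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
        watcher = AsyncCrawlWatcher(crawl_id, app)

        def on_document(event):
            print("document received for job", event["id"])

        def on_done(event):
            print("crawl finished with status:", event["status"])

        # add_event_listener is assumed to be provided by the parent CrawlWatcher.
        watcher.add_event_listener("document", on_document)
        watcher.add_event_listener("done", on_done)

        await watcher.connect()  # returns once the websocket stream closes

    # asyncio.run(watch_crawl("your-crawl-id"))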
4211
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
4212
+ """
4213
+ Handle errors from async API responses.
4214
+ """
4215
+ try:
4216
+ error_data = await response.json()
4217
+ error_message = error_data.get('error', 'No error message provided.')
4218
+ error_details = error_data.get('details', 'No additional error details provided.')
4219
+ except Exception:
4220
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
4221
+
4222
+ # Use the app's method to get the error message
4223
+ message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
4224
+
4225
+ raise aiohttp.ClientError(message)
4226
+
4227
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
4228
+ """
4229
+ Generate a standardized error message based on HTTP status code for async operations.
4230
+
4231
+ Args:
4232
+ status_code (int): The HTTP status code from the response
4233
+ action (str): Description of the action that was being performed
4234
+ error_message (str): The error message from the API response
4235
+ error_details (str): Additional error details from the API response
4236
+
4237
+ Returns:
4238
+ str: A formatted error message
4239
+ """
4240
+ return self._get_error_message(status_code, action, error_message, error_details)
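Since the shared error path re-raises API failures as aiohttp.ClientError carrying the formatted message built here, while the research helpers above wrap transport errors in ValueError, a caller can narrow its except clause to those two types. A small defensive sketch, with the function name chosen for illustration:

    import aiohttp

    async def safe_status_check(app, job_id: str):
        try:
            return await app.check_deep_research_status(job_id)
        except (aiohttp.ClientError, ValueError) as exc:
            # ClientError comes from the shared error handler above;
            # ValueError from the wrapping in check_deep_research_status.
            print(f"status check failed: {exc}")
            return None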