firecrawl 2.4.2__py3-none-any.whl → 2.5.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their public registries.

Potentially problematic release: this version of firecrawl might be problematic.

@@ -0,0 +1,4384 @@
1
+ """
2
+ FirecrawlApp Module
3
+
4
+ This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
5
+ It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
6
+ and check the status of these jobs. The module uses requests for HTTP communication
7
+ and handles retries for certain HTTP status codes.
8
+
9
+ Classes:
10
+ - FirecrawlApp: Main class for interacting with the Firecrawl API.
11
+ """
12
+ import logging
13
+ import os
14
+ import time
15
+ from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
16
+ import json
17
+ from datetime import datetime
18
+ import re
19
+ import warnings
20
+ import requests
21
+ import pydantic
22
+ import websockets
23
+ import aiohttp
24
+ import asyncio
25
+ from pydantic import Field
26
+
27
+ # Suppress Pydantic warnings about attribute shadowing
28
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
29
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
30
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
31
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
32
+
33
+
34
+ def get_version():
35
+ try:
36
+ from pathlib import Path
37
+ package_path = os.path.dirname(__file__)
38
+ version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
39
+ version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
40
+ if version_match:
41
+ return version_match.group(1).strip()
42
+ except Exception:
43
+ print("Failed to get version from __init__.py")
44
+ return None
45
+
46
+ version = get_version()
47
+
48
+ logger : logging.Logger = logging.getLogger("firecrawl")
49
+
50
+ T = TypeVar('T')
51
+
52
+ # class FirecrawlDocumentMetadata(pydantic.BaseModel):
53
+ # """Metadata for a Firecrawl document."""
54
+ # title: Optional[str] = None
55
+ # description: Optional[str] = None
56
+ # language: Optional[str] = None
57
+ # keywords: Optional[str] = None
58
+ # robots: Optional[str] = None
59
+ # ogTitle: Optional[str] = None
60
+ # ogDescription: Optional[str] = None
61
+ # ogUrl: Optional[str] = None
62
+ # ogImage: Optional[str] = None
63
+ # ogAudio: Optional[str] = None
64
+ # ogDeterminer: Optional[str] = None
65
+ # ogLocale: Optional[str] = None
66
+ # ogLocaleAlternate: Optional[List[str]] = None
67
+ # ogSiteName: Optional[str] = None
68
+ # ogVideo: Optional[str] = None
69
+ # dctermsCreated: Optional[str] = None
70
+ # dcDateCreated: Optional[str] = None
71
+ # dcDate: Optional[str] = None
72
+ # dctermsType: Optional[str] = None
73
+ # dcType: Optional[str] = None
74
+ # dctermsAudience: Optional[str] = None
75
+ # dctermsSubject: Optional[str] = None
76
+ # dcSubject: Optional[str] = None
77
+ # dcDescription: Optional[str] = None
78
+ # dctermsKeywords: Optional[str] = None
79
+ # modifiedTime: Optional[str] = None
80
+ # publishedTime: Optional[str] = None
81
+ # articleTag: Optional[str] = None
82
+ # articleSection: Optional[str] = None
83
+ # sourceURL: Optional[str] = None
84
+ # statusCode: Optional[int] = None
85
+ # error: Optional[str] = None
86
+
87
+ class AgentOptions(pydantic.BaseModel):
88
+ """Configuration for the agent."""
89
+ model: Literal["FIRE-1"] = "FIRE-1"
90
+ prompt: Optional[str] = None
91
+
92
+ class AgentOptionsExtract(pydantic.BaseModel):
93
+ """Configuration for the agent in extract operations."""
94
+ model: Literal["FIRE-1"] = "FIRE-1"
95
+
96
+ class ActionsResult(pydantic.BaseModel):
97
+ """Result of actions performed during scraping."""
98
+ screenshots: List[str]
99
+
100
+ class ChangeTrackingData(pydantic.BaseModel):
101
+ """
102
+ Data for the change tracking format.
103
+ """
104
+ previousScrapeAt: Optional[str] = None
105
+ changeStatus: str # "new" | "same" | "changed" | "removed"
106
+ visibility: str # "visible" | "hidden"
107
+ diff: Optional[Dict[str, Any]] = None
108
+ json: Optional[Any] = None
109
+
110
+ class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
111
+ """Document retrieved or processed by Firecrawl."""
112
+ url: Optional[str] = None
113
+ markdown: Optional[str] = None
114
+ html: Optional[str] = None
115
+ rawHtml: Optional[str] = None
116
+ links: Optional[List[str]] = None
117
+ extract: Optional[T] = None
118
+ json: Optional[T] = None
119
+ screenshot: Optional[str] = None
120
+ metadata: Optional[Any] = None
121
+ actions: Optional[ActionsResult] = None
122
+ title: Optional[str] = None # v1 search only
123
+ description: Optional[str] = None # v1 search only
124
+ changeTracking: Optional[ChangeTrackingData] = None
125
+
126
+ class LocationConfig(pydantic.BaseModel):
127
+ """Location configuration for scraping."""
128
+ country: Optional[str] = None
129
+ languages: Optional[List[str]] = None
130
+
131
+ class WebhookConfig(pydantic.BaseModel):
132
+ """Configuration for webhooks."""
133
+ url: str
134
+ headers: Optional[Dict[str, str]] = None
135
+ metadata: Optional[Dict[str, str]] = None
136
+ events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
137
+
138
+ class ChangeTrackingOptions(pydantic.BaseModel):
139
+ """Configuration for change tracking."""
140
+ modes: Optional[List[Literal["git-diff", "json"]]] = None
141
+ schema: Optional[Any] = None
142
+ prompt: Optional[str] = None
143
+
144
+ class ScrapeOptions(pydantic.BaseModel):
145
+ """Parameters for scraping operations."""
146
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
147
+ headers: Optional[Dict[str, str]] = None
148
+ includeTags: Optional[List[str]] = None
149
+ excludeTags: Optional[List[str]] = None
150
+ onlyMainContent: Optional[bool] = None
151
+ waitFor: Optional[int] = None
152
+ timeout: Optional[int] = None
153
+ location: Optional[LocationConfig] = None
154
+ mobile: Optional[bool] = None
155
+ skipTlsVerification: Optional[bool] = None
156
+ removeBase64Images: Optional[bool] = None
157
+ blockAds: Optional[bool] = None
158
+ proxy: Optional[Literal["basic", "stealth"]] = None
159
+ changeTrackingOptions: Optional[ChangeTrackingOptions] = None
160
+
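For reference only (not part of the package diff): ScrapeOptions keeps the API's camelCase field names, so a reusable options object for the crawl or search methods defined below might look like the following sketch, with all values illustrative.

    opts = ScrapeOptions(
        formats=["markdown", "links"],
        onlyMainContent=True,
        waitFor=1000,                                    # milliseconds
        location=LocationConfig(country="us", languages=["en"]),
    )
    # The SDK serializes this with opts.dict(exclude_none=True) before sending.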
161
+ class WaitAction(pydantic.BaseModel):
162
+ """Wait action to perform during scraping."""
163
+ type: Literal["wait"]
164
+ milliseconds: int
165
+ selector: Optional[str] = None
166
+
167
+ class ScreenshotAction(pydantic.BaseModel):
168
+ """Screenshot action to perform during scraping."""
169
+ type: Literal["screenshot"]
170
+ fullPage: Optional[bool] = None
171
+
172
+ class ClickAction(pydantic.BaseModel):
173
+ """Click action to perform during scraping."""
174
+ type: Literal["click"]
175
+ selector: str
176
+
177
+ class WriteAction(pydantic.BaseModel):
178
+ """Write action to perform during scraping."""
179
+ type: Literal["write"]
180
+ text: str
181
+
182
+ class PressAction(pydantic.BaseModel):
183
+ """Press action to perform during scraping."""
184
+ type: Literal["press"]
185
+ key: str
186
+
187
+ class ScrollAction(pydantic.BaseModel):
188
+ """Scroll action to perform during scraping."""
189
+ type: Literal["scroll"]
190
+ direction: Literal["up", "down"]
191
+ selector: Optional[str] = None
192
+
193
+ class ScrapeAction(pydantic.BaseModel):
194
+ """Scrape action to perform during scraping."""
195
+ type: Literal["scrape"]
196
+
197
+ class ExecuteJavascriptAction(pydantic.BaseModel):
198
+ """Execute javascript action to perform during scraping."""
199
+ type: Literal["executeJavascript"]
200
+ script: str
201
+
202
+
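Illustrative only (not part of the diff): each action model is discriminated by its required `type` literal, so a browser action sequence for scrape_url(..., actions=[...]) could be assembled like this; the selector is a placeholder.

    actions = [
        WaitAction(type="wait", milliseconds=2000),
        ClickAction(type="click", selector="#load-more"),    # placeholder selector
        ScrollAction(type="scroll", direction="down"),
        ScreenshotAction(type="screenshot", fullPage=True),
    ]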
203
+ class ExtractAgent(pydantic.BaseModel):
204
+ """Configuration for the agent in extract operations."""
205
+ model: Literal["FIRE-1"] = "FIRE-1"
206
+
207
+ class JsonConfig(pydantic.BaseModel):
208
+ """Configuration for extraction."""
209
+ prompt: Optional[str] = None
210
+ schema: Optional[Any] = None
211
+ systemPrompt: Optional[str] = None
212
+ agent: Optional[ExtractAgent] = None
213
+
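A hedged sketch (not part of the diff): `schema` is typed Optional[Any], and scrape_url() below calls .schema() on anything that exposes it, so either a JSON schema dict or a Pydantic model class can be passed. The Product model is hypothetical.

    import pydantic

    class Product(pydantic.BaseModel):
        name: str
        price: float

    json_config = JsonConfig(
        prompt="Extract the product name and price",
        schema=Product,   # converted via Product.schema() inside scrape_url()
    )
    # Later passed as json_options=json_config to scrape_url() with formats=["json"].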
214
+ class ScrapeParams(ScrapeOptions):
215
+ """Parameters for scraping operations."""
216
+ extract: Optional[JsonConfig] = None
217
+ jsonOptions: Optional[JsonConfig] = None
218
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
219
+ agent: Optional[AgentOptions] = None
220
+ webhook: Optional[WebhookConfig] = None
221
+
222
+ class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
223
+ """Response from scraping operations."""
224
+ success: bool = True
225
+ warning: Optional[str] = None
226
+ error: Optional[str] = None
227
+
228
+ class BatchScrapeResponse(pydantic.BaseModel):
229
+ """Response from batch scrape operations."""
230
+ id: Optional[str] = None
231
+ url: Optional[str] = None
232
+ success: bool = True
233
+ error: Optional[str] = None
234
+ invalidURLs: Optional[List[str]] = None
235
+
236
+ class BatchScrapeStatusResponse(pydantic.BaseModel):
237
+ """Response from batch scrape status checks."""
238
+ success: bool = True
239
+ status: Literal["scraping", "completed", "failed", "cancelled"]
240
+ completed: int
241
+ total: int
242
+ creditsUsed: int
243
+ expiresAt: datetime
244
+ next: Optional[str] = None
245
+ data: List[FirecrawlDocument]
246
+
247
+ class CrawlParams(pydantic.BaseModel):
248
+ """Parameters for crawling operations."""
249
+ includePaths: Optional[List[str]] = None
250
+ excludePaths: Optional[List[str]] = None
251
+ maxDepth: Optional[int] = None
252
+ maxDiscoveryDepth: Optional[int] = None
253
+ limit: Optional[int] = None
254
+ allowBackwardLinks: Optional[bool] = None
255
+ allowExternalLinks: Optional[bool] = None
256
+ ignoreSitemap: Optional[bool] = None
257
+ scrapeOptions: Optional[ScrapeOptions] = None
258
+ webhook: Optional[Union[str, WebhookConfig]] = None
259
+ deduplicateSimilarURLs: Optional[bool] = None
260
+ ignoreQueryParameters: Optional[bool] = None
261
+ regexOnFullURL: Optional[bool] = None
262
+
263
+ class CrawlResponse(pydantic.BaseModel):
264
+ """Response from crawling operations."""
265
+ id: Optional[str] = None
266
+ url: Optional[str] = None
267
+ success: bool = True
268
+ error: Optional[str] = None
269
+
270
+ class CrawlStatusResponse(pydantic.BaseModel):
271
+ """Response from crawl status checks."""
272
+ success: bool = True
273
+ status: Literal["scraping", "completed", "failed", "cancelled"]
274
+ completed: int
275
+ total: int
276
+ creditsUsed: int
277
+ expiresAt: datetime
278
+ next: Optional[str] = None
279
+ data: List[FirecrawlDocument]
280
+
281
+ class CrawlErrorsResponse(pydantic.BaseModel):
282
+ """Response from crawl/batch scrape error monitoring."""
283
+ errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str}
284
+ robotsBlocked: List[str]
285
+
286
+ class MapParams(pydantic.BaseModel):
287
+ """Parameters for mapping operations."""
288
+ search: Optional[str] = None
289
+ ignoreSitemap: Optional[bool] = None
290
+ includeSubdomains: Optional[bool] = None
291
+ sitemapOnly: Optional[bool] = None
292
+ limit: Optional[int] = None
293
+ timeout: Optional[int] = None
294
+
295
+ class MapResponse(pydantic.BaseModel):
296
+ """Response from mapping operations."""
297
+ success: bool = True
298
+ links: Optional[List[str]] = None
299
+ error: Optional[str] = None
300
+
301
+ class ExtractParams(pydantic.BaseModel):
302
+ """Parameters for extracting information from URLs."""
303
+ prompt: Optional[str] = None
304
+ schema: Optional[Any] = None
305
+ systemPrompt: Optional[str] = None
306
+ allowExternalLinks: Optional[bool] = None
307
+ enableWebSearch: Optional[bool] = None
308
+ includeSubdomains: Optional[bool] = None
309
+ origin: Optional[str] = None
310
+ showSources: Optional[bool] = None
311
+ scrapeOptions: Optional[ScrapeOptions] = None
312
+
313
+ class ExtractResponse(pydantic.BaseModel, Generic[T]):
314
+ """Response from extract operations."""
315
+ id: Optional[str] = None
316
+ status: Optional[Literal["processing", "completed", "failed"]] = None
317
+ expiresAt: Optional[datetime] = None
318
+ success: bool = True
319
+ data: Optional[T] = None
320
+ error: Optional[str] = None
321
+ warning: Optional[str] = None
322
+ sources: Optional[List[str]] = None
323
+
324
+ class SearchParams(pydantic.BaseModel):
325
+ query: str
326
+ limit: Optional[int] = 5
327
+ tbs: Optional[str] = None
328
+ filter: Optional[str] = None
329
+ lang: Optional[str] = "en"
330
+ country: Optional[str] = "us"
331
+ location: Optional[str] = None
332
+ origin: Optional[str] = "api"
333
+ timeout: Optional[int] = 60000
334
+ scrapeOptions: Optional[ScrapeOptions] = None
335
+
336
+ class SearchResponse(pydantic.BaseModel):
337
+ """Response from search operations."""
338
+ success: bool = True
339
+ data: List[FirecrawlDocument]
340
+ warning: Optional[str] = None
341
+ error: Optional[str] = None
342
+
343
+ class GenerateLLMsTextParams(pydantic.BaseModel):
344
+ """
345
+ Parameters for the LLMs.txt generation operation.
346
+ """
347
+ maxUrls: Optional[int] = 10
348
+ showFullText: Optional[bool] = False
349
+ __experimental_stream: Optional[bool] = None
350
+
351
+ class DeepResearchParams(pydantic.BaseModel):
352
+ """
353
+ Parameters for the deep research operation.
354
+ """
355
+ maxDepth: Optional[int] = 7
356
+ timeLimit: Optional[int] = 270
357
+ maxUrls: Optional[int] = 20
358
+ analysisPrompt: Optional[str] = None
359
+ systemPrompt: Optional[str] = None
360
+ __experimental_streamSteps: Optional[bool] = None
361
+
362
+ class DeepResearchResponse(pydantic.BaseModel):
363
+ """
364
+ Response from the deep research operation.
365
+ """
366
+ success: bool
367
+ id: str
368
+ error: Optional[str] = None
369
+
370
+ class DeepResearchStatusResponse(pydantic.BaseModel):
371
+ """
372
+ Status response from the deep research operation.
373
+ """
374
+ success: bool
375
+ data: Optional[Dict[str, Any]] = None
376
+ status: str
377
+ error: Optional[str] = None
378
+ expiresAt: str
379
+ currentDepth: int
380
+ maxDepth: int
381
+ activities: List[Dict[str, Any]]
382
+ sources: List[Dict[str, Any]]
383
+ summaries: List[str]
384
+
385
+ class GenerateLLMsTextResponse(pydantic.BaseModel):
386
+ """Response from LLMs.txt generation operations."""
387
+ success: bool = True
388
+ id: str
389
+ error: Optional[str] = None
390
+
391
+ class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
392
+ llmstxt: str
393
+ llmsfulltxt: Optional[str] = None
394
+
395
+ class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
396
+ """Status response from LLMs.txt generation operations."""
397
+ success: bool = True
398
+ data: Optional[GenerateLLMsTextStatusResponseData] = None
399
+ status: Literal["processing", "completed", "failed"]
400
+ error: Optional[str] = None
401
+ expiresAt: str
402
+
403
+ class SearchResponse(pydantic.BaseModel):
404
+ """
405
+ Response from the search operation.
406
+ """
407
+ success: bool
408
+ data: List[Dict[str, Any]]
409
+ warning: Optional[str] = None
410
+ error: Optional[str] = None
411
+
412
+ class ExtractParams(pydantic.BaseModel):
413
+ """
414
+ Parameters for the extract operation.
415
+ """
416
+ prompt: Optional[str] = None
417
+ schema: Optional[Any] = pydantic.Field(None, alias='schema')
418
+ system_prompt: Optional[str] = None
419
+ allow_external_links: Optional[bool] = False
420
+ enable_web_search: Optional[bool] = False
421
+ # Just for backwards compatibility
422
+ enableWebSearch: Optional[bool] = False
423
+ show_sources: Optional[bool] = False
424
+ agent: Optional[Dict[str, Any]] = None
425
+
426
+ class FirecrawlApp:
427
+ def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
428
+ """
429
+ Initialize the FirecrawlApp instance with API key, API URL.
430
+
431
+ Args:
432
+ api_key (Optional[str]): API key for authenticating with the Firecrawl API.
433
+ api_url (Optional[str]): Base URL for the Firecrawl API.
434
+ """
435
+ self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
436
+ self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
437
+
438
+ # Only require API key when using cloud service
439
+ if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
440
+ logger.warning("No API key provided for cloud service")
441
+ raise ValueError('No API key provided')
442
+
443
+ logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
444
+
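A minimal initialization sketch (illustrative, not part of the diff). It assumes FirecrawlApp is re-exported from the package's top-level __init__, as in earlier releases; the key and self-hosted URL are placeholders.

    from firecrawl import FirecrawlApp    # assumed top-level re-export

    app = FirecrawlApp(api_key="fc-YOUR-KEY")       # cloud service: key required
    # or rely on FIRECRAWL_API_KEY / FIRECRAWL_API_URL environment variables:
    # app = FirecrawlApp()
    # self-hosted instances skip the key check when api_url is not api.firecrawl.dev:
    # app = FirecrawlApp(api_url="http://localhost:3002")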
445
+ def scrape_url(
446
+ self,
447
+ url: str,
448
+ *,
449
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
450
+ include_tags: Optional[List[str]] = None,
451
+ exclude_tags: Optional[List[str]] = None,
452
+ only_main_content: Optional[bool] = None,
453
+ wait_for: Optional[int] = None,
454
+ timeout: Optional[int] = None,
455
+ location: Optional[LocationConfig] = None,
456
+ mobile: Optional[bool] = None,
457
+ skip_tls_verification: Optional[bool] = None,
458
+ remove_base64_images: Optional[bool] = None,
459
+ block_ads: Optional[bool] = None,
460
+ proxy: Optional[Literal["basic", "stealth"]] = None,
461
+ extract: Optional[JsonConfig] = None,
462
+ json_options: Optional[JsonConfig] = None,
463
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
464
+ change_tracking_options: Optional[ChangeTrackingOptions] = None,
465
+ **kwargs) -> ScrapeResponse[Any]:
466
+ """
467
+ Scrape and extract content from a URL.
468
+
469
+ Args:
470
+ url (str): Target URL to scrape
471
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
472
+ include_tags (Optional[List[str]]): HTML tags to include
473
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
474
+ only_main_content (Optional[bool]): Extract main content only
475
+ wait_for (Optional[int]): Wait for a specific element to appear
476
+ timeout (Optional[int]): Request timeout (ms)
477
+ location (Optional[LocationConfig]): Location configuration
478
+ mobile (Optional[bool]): Use mobile user agent
479
+ skip_tls_verification (Optional[bool]): Skip TLS verification
480
+ remove_base64_images (Optional[bool]): Remove base64 images
481
+ block_ads (Optional[bool]): Block ads
482
+ proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
483
+ extract (Optional[JsonConfig]): Content extraction settings
484
+ json_options (Optional[JsonConfig]): JSON extraction settings
485
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
486
+ change_tracking_options (Optional[ChangeTrackingOptions]): Change tracking settings
487
+
488
+
489
+ Returns:
490
+ ScrapeResponse with:
491
+ * Requested content formats
492
+ * Page metadata
493
+ * Extraction results
494
+ * Success/error status
495
+
496
+ Raises:
497
+ Exception: If scraping fails
498
+ """
499
+ headers = self._prepare_headers()
500
+
501
+ # Build scrape parameters
502
+ scrape_params = {
503
+ 'url': url,
504
+ 'origin': f"python-sdk@{version}"
505
+ }
506
+
507
+ # Add optional parameters if provided
508
+ if formats:
509
+ scrape_params['formats'] = formats
510
+ if include_tags:
511
+ scrape_params['includeTags'] = include_tags
512
+ if exclude_tags:
513
+ scrape_params['excludeTags'] = exclude_tags
514
+ if only_main_content is not None:
515
+ scrape_params['onlyMainContent'] = only_main_content
516
+ if wait_for:
517
+ scrape_params['waitFor'] = wait_for
518
+ if timeout:
519
+ scrape_params['timeout'] = timeout
520
+ if location:
521
+ scrape_params['location'] = location.dict(exclude_none=True)
522
+ if mobile is not None:
523
+ scrape_params['mobile'] = mobile
524
+ if skip_tls_verification is not None:
525
+ scrape_params['skipTlsVerification'] = skip_tls_verification
526
+ if remove_base64_images is not None:
527
+ scrape_params['removeBase64Images'] = remove_base64_images
528
+ if block_ads is not None:
529
+ scrape_params['blockAds'] = block_ads
530
+ if proxy:
531
+ scrape_params['proxy'] = proxy
532
+ if extract:
533
+ if hasattr(extract.schema, 'schema'):
534
+ extract.schema = extract.schema.schema()
535
+ scrape_params['extract'] = extract.dict(exclude_none=True)
536
+ if json_options:
537
+ if hasattr(json_options.schema, 'schema'):
538
+ json_options.schema = json_options.schema.schema()
539
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
540
+ if actions:
541
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
542
+ if change_tracking_options:
543
+ scrape_params['changeTrackingOptions'] = change_tracking_options.dict(exclude_none=True)
544
+
545
+ scrape_params.update(kwargs)
546
+
547
+ # Make request
548
+ response = requests.post(
549
+ f'{self.api_url}/v1/scrape',
550
+ headers=headers,
551
+ json=scrape_params,
552
+ timeout=(timeout + 5000 if timeout else None)
553
+ )
554
+
555
+ if response.status_code == 200:
556
+ try:
557
+ response_json = response.json()
558
+ if response_json.get('success') and 'data' in response_json:
559
+ return ScrapeResponse(**response_json['data'])
560
+ elif "error" in response_json:
561
+ raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
562
+ else:
563
+ raise Exception(f'Failed to scrape URL. Error: {response_json}')
564
+ except ValueError:
565
+ raise Exception('Failed to parse Firecrawl response as JSON.')
566
+ else:
567
+ self._handle_error(response, 'scrape URL')
568
+
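Usage sketch (not part of the diff), based on the keyword-only scrape_url signature above and reusing the `app` instance from the initialization sketch; the URL is illustrative.

    doc = app.scrape_url(
        "https://example.com",
        formats=["markdown", "links"],
        only_main_content=True,
        timeout=30000,                   # milliseconds
    )
    print(doc.markdown)                  # ScrapeResponse inherits FirecrawlDocument fields
    print(doc.metadata)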
569
+ def search(
570
+ self,
571
+ query: str,
572
+ *,
573
+ limit: Optional[int] = None,
574
+ tbs: Optional[str] = None,
575
+ filter: Optional[str] = None,
576
+ lang: Optional[str] = None,
577
+ country: Optional[str] = None,
578
+ location: Optional[str] = None,
579
+ timeout: Optional[int] = None,
580
+ scrape_options: Optional[ScrapeOptions] = None,
581
+ **kwargs) -> SearchResponse:
582
+ """
583
+ Search for content using Firecrawl.
584
+
585
+ Args:
586
+ query (str): Search query string
587
+ limit (Optional[int]): Max results (default: 5)
588
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
589
+ filter (Optional[str]): Custom result filter
590
+ lang (Optional[str]): Language code (default: "en")
591
+ country (Optional[str]): Country code (default: "us")
592
+ location (Optional[str]): Geo-targeting
593
+ timeout (Optional[int]): Request timeout in milliseconds
594
+ scrape_options (Optional[ScrapeOptions]): Result scraping configuration
595
+ **kwargs: Additional keyword arguments for future compatibility
596
+
597
+ Returns:
598
+ SearchResponse: Response containing:
599
+ * success (bool): Whether request succeeded
600
+ * data (List[FirecrawlDocument]): Search results
601
+ * warning (Optional[str]): Warning message if any
602
+ * error (Optional[str]): Error message if any
603
+
604
+ Raises:
605
+ Exception: If search fails or response cannot be parsed
606
+ """
607
+ # Validate any additional kwargs
608
+ self._validate_kwargs(kwargs, "search")
609
+
610
+ # Build search parameters
611
+ search_params = {}
612
+
613
+ # Add individual parameters
614
+ if limit is not None:
615
+ search_params['limit'] = limit
616
+ if tbs is not None:
617
+ search_params['tbs'] = tbs
618
+ if filter is not None:
619
+ search_params['filter'] = filter
620
+ if lang is not None:
621
+ search_params['lang'] = lang
622
+ if country is not None:
623
+ search_params['country'] = country
624
+ if location is not None:
625
+ search_params['location'] = location
626
+ if timeout is not None:
627
+ search_params['timeout'] = timeout
628
+ if scrape_options is not None:
629
+ search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
630
+
631
+ # Add any additional kwargs
632
+ search_params.update(kwargs)
633
+
634
+ # Create final params object
635
+ final_params = SearchParams(query=query, **search_params)
636
+ params_dict = final_params.dict(exclude_none=True)
637
+ params_dict['origin'] = f"python-sdk@{version}"
638
+
639
+ # Make request
640
+ response = requests.post(
641
+ f"{self.api_url}/v1/search",
642
+ headers={"Authorization": f"Bearer {self.api_key}"},
643
+ json=params_dict
644
+ )
645
+
646
+ if response.status_code == 200:
647
+ try:
648
+ response_json = response.json()
649
+ if response_json.get('success') and 'data' in response_json:
650
+ return SearchResponse(**response_json)
651
+ elif "error" in response_json:
652
+ raise Exception(f'Search failed. Error: {response_json["error"]}')
653
+ else:
654
+ raise Exception(f'Search failed. Error: {response_json}')
655
+ except ValueError:
656
+ raise Exception('Failed to parse Firecrawl response as JSON.')
657
+ else:
658
+ self._handle_error(response, 'search')
659
+
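Usage sketch (not part of the diff). Note that this module defines SearchResponse twice; the later definition, where `data` is a list of plain dicts, is the one in effect at import time.

    results = app.search("firecrawl python sdk", limit=5)    # illustrative query
    for item in results.data:
        print(item)    # plain dicts in this version, since the later SearchResponse wins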
660
+ def crawl_url(
661
+ self,
662
+ url: str,
663
+ *,
664
+ include_paths: Optional[List[str]] = None,
665
+ exclude_paths: Optional[List[str]] = None,
666
+ max_depth: Optional[int] = None,
667
+ max_discovery_depth: Optional[int] = None,
668
+ limit: Optional[int] = None,
669
+ allow_backward_links: Optional[bool] = None,
670
+ allow_external_links: Optional[bool] = None,
671
+ ignore_sitemap: Optional[bool] = None,
672
+ scrape_options: Optional[ScrapeOptions] = None,
673
+ webhook: Optional[Union[str, WebhookConfig]] = None,
674
+ deduplicate_similar_urls: Optional[bool] = None,
675
+ ignore_query_parameters: Optional[bool] = None,
676
+ regex_on_full_url: Optional[bool] = None,
677
+ poll_interval: Optional[int] = 2,
678
+ idempotency_key: Optional[str] = None,
679
+ **kwargs
680
+ ) -> CrawlStatusResponse:
681
+ """
682
+ Crawl a website starting from a URL.
683
+
684
+ Args:
685
+ url (str): Target URL to start crawling from
686
+ include_paths (Optional[List[str]]): Patterns of URLs to include
687
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
688
+ max_depth (Optional[int]): Maximum crawl depth
689
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
690
+ limit (Optional[int]): Maximum pages to crawl
691
+ allow_backward_links (Optional[bool]): Follow parent directory links
692
+ allow_external_links (Optional[bool]): Follow external domain links
693
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
694
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
695
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
696
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
697
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
698
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
699
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
700
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
701
+ **kwargs: Additional parameters to pass to the API
702
+
703
+ Returns:
704
+ CrawlStatusResponse with:
705
+ * Crawling status and progress
706
+ * Crawled page contents
707
+ * Success/error information
708
+
709
+ Raises:
710
+ Exception: If crawl fails
711
+ """
712
+ # Validate any additional kwargs
713
+ self._validate_kwargs(kwargs, "crawl_url")
714
+
715
+ crawl_params = {}
716
+
717
+ # Add individual parameters
718
+ if include_paths is not None:
719
+ crawl_params['includePaths'] = include_paths
720
+ if exclude_paths is not None:
721
+ crawl_params['excludePaths'] = exclude_paths
722
+ if max_depth is not None:
723
+ crawl_params['maxDepth'] = max_depth
724
+ if max_discovery_depth is not None:
725
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
726
+ if limit is not None:
727
+ crawl_params['limit'] = limit
728
+ if allow_backward_links is not None:
729
+ crawl_params['allowBackwardLinks'] = allow_backward_links
730
+ if allow_external_links is not None:
731
+ crawl_params['allowExternalLinks'] = allow_external_links
732
+ if ignore_sitemap is not None:
733
+ crawl_params['ignoreSitemap'] = ignore_sitemap
734
+ if scrape_options is not None:
735
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
736
+ if webhook is not None:
737
+ crawl_params['webhook'] = webhook
738
+ if deduplicate_similar_urls is not None:
739
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
740
+ if ignore_query_parameters is not None:
741
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
742
+ if regex_on_full_url is not None:
743
+ crawl_params['regexOnFullURL'] = regex_on_full_url
744
+
745
+ # Add any additional kwargs
746
+ crawl_params.update(kwargs)
747
+
748
+ # Create final params object
749
+ final_params = CrawlParams(**crawl_params)
750
+ params_dict = final_params.dict(exclude_none=True)
751
+ params_dict['url'] = url
752
+ params_dict['origin'] = f"python-sdk@{version}"
753
+
754
+ # Make request
755
+ headers = self._prepare_headers(idempotency_key)
756
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
757
+
758
+ if response.status_code == 200:
759
+ try:
760
+ id = response.json().get('id')
761
+ except:
762
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
763
+ return self._monitor_job_status(id, headers, poll_interval)
764
+ else:
765
+ self._handle_error(response, 'start crawl job')
766
+
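Blocking usage sketch (not part of the diff): crawl_url() submits the job and then polls until it finishes; the URL and limit are illustrative.

    status = app.crawl_url(
        "https://example.com",
        limit=10,
        scrape_options=ScrapeOptions(formats=["markdown"]),
        poll_interval=2,                 # seconds between status checks
    )
    if status.status == "completed":
        for page in status.data:
            print(page.metadata)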
767
+ def async_crawl_url(
768
+ self,
769
+ url: str,
770
+ *,
771
+ include_paths: Optional[List[str]] = None,
772
+ exclude_paths: Optional[List[str]] = None,
773
+ max_depth: Optional[int] = None,
774
+ max_discovery_depth: Optional[int] = None,
775
+ limit: Optional[int] = None,
776
+ allow_backward_links: Optional[bool] = None,
777
+ allow_external_links: Optional[bool] = None,
778
+ ignore_sitemap: Optional[bool] = None,
779
+ scrape_options: Optional[ScrapeOptions] = None,
780
+ webhook: Optional[Union[str, WebhookConfig]] = None,
781
+ deduplicate_similar_urls: Optional[bool] = None,
782
+ ignore_query_parameters: Optional[bool] = None,
783
+ regex_on_full_url: Optional[bool] = None,
784
+ idempotency_key: Optional[str] = None,
785
+ **kwargs
786
+ ) -> CrawlResponse:
787
+ """
788
+ Start an asynchronous crawl job.
789
+
790
+ Args:
791
+ url (str): Target URL to start crawling from
792
+ include_paths (Optional[List[str]]): Patterns of URLs to include
793
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
794
+ max_depth (Optional[int]): Maximum crawl depth
795
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
796
+ limit (Optional[int]): Maximum pages to crawl
797
+ allow_backward_links (Optional[bool]): Follow parent directory links
798
+ allow_external_links (Optional[bool]): Follow external domain links
799
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
800
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
801
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
802
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
803
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
804
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
805
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
806
+ **kwargs: Additional parameters to pass to the API
807
+
808
+ Returns:
809
+ CrawlResponse with:
810
+ * success - Whether crawl started successfully
811
+ * id - Unique identifier for the crawl job
812
+ * url - Status check URL for the crawl
813
+ * error - Error message if start failed
814
+
815
+ Raises:
816
+ Exception: If crawl initiation fails
817
+ """
818
+ # Validate any additional kwargs
819
+ self._validate_kwargs(kwargs, "async_crawl_url")
820
+
821
+ crawl_params = {}
822
+
823
+ # Add individual parameters
824
+ if include_paths is not None:
825
+ crawl_params['includePaths'] = include_paths
826
+ if exclude_paths is not None:
827
+ crawl_params['excludePaths'] = exclude_paths
828
+ if max_depth is not None:
829
+ crawl_params['maxDepth'] = max_depth
830
+ if max_discovery_depth is not None:
831
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
832
+ if limit is not None:
833
+ crawl_params['limit'] = limit
834
+ if allow_backward_links is not None:
835
+ crawl_params['allowBackwardLinks'] = allow_backward_links
836
+ if allow_external_links is not None:
837
+ crawl_params['allowExternalLinks'] = allow_external_links
838
+ if ignore_sitemap is not None:
839
+ crawl_params['ignoreSitemap'] = ignore_sitemap
840
+ if scrape_options is not None:
841
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
842
+ if webhook is not None:
843
+ crawl_params['webhook'] = webhook
844
+ if deduplicate_similar_urls is not None:
845
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
846
+ if ignore_query_parameters is not None:
847
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
848
+ if regex_on_full_url is not None:
849
+ crawl_params['regexOnFullURL'] = regex_on_full_url
850
+
851
+ # Add any additional kwargs
852
+ crawl_params.update(kwargs)
853
+
854
+ # Create final params object
855
+ final_params = CrawlParams(**crawl_params)
856
+ params_dict = final_params.dict(exclude_none=True)
857
+ params_dict['url'] = url
858
+ params_dict['origin'] = f"python-sdk@{version}"
859
+
860
+ # Make request
861
+ headers = self._prepare_headers(idempotency_key)
862
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
863
+
864
+ if response.status_code == 200:
865
+ try:
866
+ return CrawlResponse(**response.json())
867
+ except:
868
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
869
+ else:
870
+ self._handle_error(response, 'start crawl job')
871
+
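Non-blocking sketch (not part of the diff): async_crawl_url() only submits the job and returns a CrawlResponse whose id can be used for later status checks.

    job = app.async_crawl_url("https://example.com", limit=50)   # illustrative
    print(job.id, job.url)    # job id plus a status-check URL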
872
+ def check_crawl_status(self, id: str) -> CrawlStatusResponse:
873
+ """
874
+ Check the status and results of a crawl job.
875
+
876
+ Args:
877
+ id: Unique identifier for the crawl job
878
+
879
+ Returns:
880
+ CrawlStatusResponse containing:
881
+
882
+ Status Information:
883
+ * status - Current state (scraping/completed/failed/cancelled)
884
+ * completed - Number of pages crawled
885
+ * total - Total pages to crawl
886
+ * creditsUsed - API credits consumed
887
+ * expiresAt - Data expiration timestamp
888
+
889
+ Results:
890
+ * data - List of crawled documents
891
+ * next - URL for next page of results (if paginated)
892
+ * success - Whether status check succeeded
893
+ * error - Error message if failed
894
+
895
+ Raises:
896
+ Exception: If status check fails
897
+ """
898
+ endpoint = f'/v1/crawl/{id}'
899
+
900
+ headers = self._prepare_headers()
901
+ response = self._get_request(f'{self.api_url}{endpoint}', headers)
902
+ if response.status_code == 200:
903
+ try:
904
+ status_data = response.json()
905
+ except:
906
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
907
+ if status_data['status'] == 'completed':
908
+ if 'data' in status_data:
909
+ data = status_data['data']
910
+ while 'next' in status_data:
911
+ if len(status_data['data']) == 0:
912
+ break
913
+ next_url = status_data.get('next')
914
+ if not next_url:
915
+ logger.warning("Expected 'next' URL is missing.")
916
+ break
917
+ try:
918
+ status_response = self._get_request(next_url, headers)
919
+ if status_response.status_code != 200:
920
+ logger.error(f"Failed to fetch next page: {status_response.status_code}")
921
+ break
922
+ try:
923
+ next_data = status_response.json()
924
+ except:
925
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
926
+ data.extend(next_data.get('data', []))
927
+ status_data = next_data
928
+ except Exception as e:
929
+ logger.error(f"Error during pagination request: {e}")
930
+ break
931
+ status_data['data'] = data
932
+
933
+ response = {
934
+ 'status': status_data.get('status'),
935
+ 'total': status_data.get('total'),
936
+ 'completed': status_data.get('completed'),
937
+ 'creditsUsed': status_data.get('creditsUsed'),
938
+ 'expiresAt': status_data.get('expiresAt'),
939
+ 'data': status_data.get('data')
940
+ }
941
+
942
+ if 'error' in status_data:
943
+ response['error'] = status_data['error']
944
+
945
+ if 'next' in status_data:
946
+ response['next'] = status_data['next']
947
+
948
+ return CrawlStatusResponse(
949
+ success=False if 'error' in status_data else True,
950
+ **response
951
+ )
952
+ else:
953
+ self._handle_error(response, 'check crawl status')
954
+
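A simple polling loop (illustrative, not part of the diff) built on check_crawl_status(); the 5-second sleep is arbitrary.

    import time

    job = app.async_crawl_url("https://example.com", limit=50)
    while True:
        status = app.check_crawl_status(job.id)
        if status.status in ("completed", "failed", "cancelled"):
            break
        time.sleep(5)
    print(status.completed, "/", status.total, "pages,", status.creditsUsed, "credits")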
955
+ def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
956
+ """
957
+ Returns information about crawl errors.
958
+
959
+ Args:
960
+ id (str): The ID of the crawl job
961
+
962
+ Returns:
963
+ CrawlErrorsResponse containing:
964
+ * errors (List[Dict[str, str]]): List of errors with fields:
965
+ - id (str): Error ID
966
+ - timestamp (str): When the error occurred
967
+ - url (str): URL that caused the error
968
+ - error (str): Error message
969
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
970
+
971
+ Raises:
972
+ Exception: If error check fails
973
+ """
974
+ headers = self._prepare_headers()
975
+ response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
976
+ if response.status_code == 200:
977
+ try:
978
+ return CrawlErrorsResponse(**response.json())
979
+ except:
980
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
981
+ else:
982
+ self._handle_error(response, "check crawl errors")
983
+
984
+ def cancel_crawl(self, id: str) -> Dict[str, Any]:
985
+ """
986
+ Cancel an asynchronous crawl job.
987
+
988
+ Args:
989
+ id (str): The ID of the crawl job to cancel
990
+
991
+ Returns:
992
+ Dict[str, Any] containing:
993
+ * success (bool): Whether cancellation was successful
994
+ * error (str, optional): Error message if cancellation failed
995
+
996
+ Raises:
997
+ Exception: If cancellation fails
998
+ """
999
+ headers = self._prepare_headers()
1000
+ response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
1001
+ if response.status_code == 200:
1002
+ try:
1003
+ return response.json()
1004
+ except:
1005
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1006
+ else:
1007
+ self._handle_error(response, "cancel crawl job")
1008
+
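Illustrative error inspection and cancellation (not part of the diff), reusing the `job` handle from the sketches above; the cancel result shape follows the docstring.

    errors = app.check_crawl_errors(job.id)
    for err in errors.errors:
        print(err["url"], err["error"])
    print("robots-blocked:", errors.robotsBlocked)

    result = app.cancel_crawl(job.id)    # plain dict with a success flag, per the docstring
    print(result)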
1009
+ def crawl_url_and_watch(
1010
+ self,
1011
+ url: str,
1012
+ *,
1013
+ include_paths: Optional[List[str]] = None,
1014
+ exclude_paths: Optional[List[str]] = None,
1015
+ max_depth: Optional[int] = None,
1016
+ max_discovery_depth: Optional[int] = None,
1017
+ limit: Optional[int] = None,
1018
+ allow_backward_links: Optional[bool] = None,
1019
+ allow_external_links: Optional[bool] = None,
1020
+ ignore_sitemap: Optional[bool] = None,
1021
+ scrape_options: Optional[ScrapeOptions] = None,
1022
+ webhook: Optional[Union[str, WebhookConfig]] = None,
1023
+ deduplicate_similar_urls: Optional[bool] = None,
1024
+ ignore_query_parameters: Optional[bool] = None,
1025
+ regex_on_full_url: Optional[bool] = None,
1026
+ idempotency_key: Optional[str] = None,
1027
+ **kwargs
1028
+ ) -> 'CrawlWatcher':
1029
+ """
1030
+ Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
1031
+
1032
+ Args:
1033
+ url (str): Target URL to start crawling from
1034
+ include_paths (Optional[List[str]]): Patterns of URLs to include
1035
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
1036
+ max_depth (Optional[int]): Maximum crawl depth
1037
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
1038
+ limit (Optional[int]): Maximum pages to crawl
1039
+ allow_backward_links (Optional[bool]): Follow parent directory links
1040
+ allow_external_links (Optional[bool]): Follow external domain links
1041
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1042
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
1043
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
1044
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
1045
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
1046
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
1047
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1048
+ **kwargs: Additional parameters to pass to the API
1049
+
1050
+ Returns:
1051
+ CrawlWatcher: An instance to monitor the crawl job via WebSocket
1052
+
1053
+ Raises:
1054
+ Exception: If crawl job fails to start
1055
+ """
1056
+ crawl_response = self.async_crawl_url(
1057
+ url,
1058
+ include_paths=include_paths,
1059
+ exclude_paths=exclude_paths,
1060
+ max_depth=max_depth,
1061
+ max_discovery_depth=max_discovery_depth,
1062
+ limit=limit,
1063
+ allow_backward_links=allow_backward_links,
1064
+ allow_external_links=allow_external_links,
1065
+ ignore_sitemap=ignore_sitemap,
1066
+ scrape_options=scrape_options,
1067
+ webhook=webhook,
1068
+ deduplicate_similar_urls=deduplicate_similar_urls,
1069
+ ignore_query_parameters=ignore_query_parameters,
1070
+ regex_on_full_url=regex_on_full_url,
1071
+ idempotency_key=idempotency_key,
1072
+ **kwargs
1073
+ )
1074
+ if crawl_response.success and crawl_response.id:
1075
+ return CrawlWatcher(crawl_response.id, self)
1076
+ else:
1077
+ raise Exception("Crawl job failed to start")
1078
+
1079
+ def map_url(
1080
+ self,
1081
+ url: str,
1082
+ *,
1083
+ search: Optional[str] = None,
1084
+ ignore_sitemap: Optional[bool] = None,
1085
+ include_subdomains: Optional[bool] = None,
1086
+ sitemap_only: Optional[bool] = None,
1087
+ limit: Optional[int] = None,
1088
+ timeout: Optional[int] = None,
1089
+ **kwargs) -> MapResponse:
1090
+ """
1091
+ Map and discover links from a URL.
1092
+
1093
+ Args:
1094
+ url (str): Target URL to map
1095
+ search (Optional[str]): Filter pattern for URLs
1096
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1097
+ include_subdomains (Optional[bool]): Include subdomain links
1098
+ sitemap_only (Optional[bool]): Only use sitemap.xml
1099
+ limit (Optional[int]): Maximum URLs to return
1100
+ timeout (Optional[int]): Request timeout in milliseconds
1101
+ **kwargs: Additional parameters to pass to the API
1102
+
1103
+ Returns:
1104
+ MapResponse: Response containing:
1105
+ * success (bool): Whether request succeeded
1106
+ * links (List[str]): Discovered URLs
1107
+ * error (Optional[str]): Error message if any
1108
+
1109
+ Raises:
1110
+ Exception: If mapping fails or response cannot be parsed
1111
+ """
1112
+ # Validate any additional kwargs
1113
+ self._validate_kwargs(kwargs, "map_url")
1114
+
1115
+ # Build map parameters
1116
+ map_params = {}
1117
+
1118
+ # Add individual parameters
1119
+ if search is not None:
1120
+ map_params['search'] = search
1121
+ if ignore_sitemap is not None:
1122
+ map_params['ignoreSitemap'] = ignore_sitemap
1123
+ if include_subdomains is not None:
1124
+ map_params['includeSubdomains'] = include_subdomains
1125
+ if sitemap_only is not None:
1126
+ map_params['sitemapOnly'] = sitemap_only
1127
+ if limit is not None:
1128
+ map_params['limit'] = limit
1129
+ if timeout is not None:
1130
+ map_params['timeout'] = timeout
1131
+
1132
+ # Add any additional kwargs
1133
+ map_params.update(kwargs)
1134
+
1135
+ # Create final params object
1136
+ final_params = MapParams(**map_params)
1137
+ params_dict = final_params.dict(exclude_none=True)
1138
+ params_dict['url'] = url
1139
+ params_dict['origin'] = f"python-sdk@{version}"
1140
+
1141
+ # Make request
1142
+ response = requests.post(
1143
+ f"{self.api_url}/v1/map",
1144
+ headers={"Authorization": f"Bearer {self.api_key}"},
1145
+ json=params_dict
1146
+ )
1147
+
1148
+ if response.status_code == 200:
1149
+ try:
1150
+ response_json = response.json()
1151
+ if response_json.get('success') and 'links' in response_json:
1152
+ return MapResponse(**response_json)
1153
+ elif "error" in response_json:
1154
+ raise Exception(f'Map failed. Error: {response_json["error"]}')
1155
+ else:
1156
+ raise Exception(f'Map failed. Error: {response_json}')
1157
+ except ValueError:
1158
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1159
+ else:
1160
+ self._handle_error(response, 'map')
1161
+
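Usage sketch (not part of the diff); the URL, search filter, and limit are illustrative.

    mapped = app.map_url("https://example.com", search="docs", limit=100)
    for link in mapped.links or []:
        print(link)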
1162
+ def batch_scrape_urls(
1163
+ self,
1164
+ urls: List[str],
1165
+ *,
1166
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1167
+ headers: Optional[Dict[str, str]] = None,
1168
+ include_tags: Optional[List[str]] = None,
1169
+ exclude_tags: Optional[List[str]] = None,
1170
+ only_main_content: Optional[bool] = None,
1171
+ wait_for: Optional[int] = None,
1172
+ timeout: Optional[int] = None,
1173
+ location: Optional[LocationConfig] = None,
1174
+ mobile: Optional[bool] = None,
1175
+ skip_tls_verification: Optional[bool] = None,
1176
+ remove_base64_images: Optional[bool] = None,
1177
+ block_ads: Optional[bool] = None,
1178
+ proxy: Optional[Literal["basic", "stealth"]] = None,
1179
+ extract: Optional[JsonConfig] = None,
1180
+ json_options: Optional[JsonConfig] = None,
1181
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1182
+ agent: Optional[AgentOptions] = None,
1183
+ poll_interval: Optional[int] = 2,
1184
+ idempotency_key: Optional[str] = None,
1185
+ **kwargs
1186
+ ) -> BatchScrapeStatusResponse:
1187
+ """
1188
+ Batch scrape multiple URLs and monitor until completion.
1189
+
1190
+ Args:
1191
+ urls (List[str]): URLs to scrape
1192
+ formats (Optional[List[Literal]]): Content formats to retrieve
1193
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1194
+ include_tags (Optional[List[str]]): HTML tags to include
1195
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1196
+ only_main_content (Optional[bool]): Extract main content only
1197
+ wait_for (Optional[int]): Wait time in milliseconds
1198
+ timeout (Optional[int]): Request timeout in milliseconds
1199
+ location (Optional[LocationConfig]): Location configuration
1200
+ mobile (Optional[bool]): Use mobile user agent
1201
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1202
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1203
+ block_ads (Optional[bool]): Block advertisements
1204
+ proxy (Optional[Literal]): Proxy type to use
1205
+ extract (Optional[JsonConfig]): Content extraction config
1206
+ json_options (Optional[JsonConfig]): JSON extraction config
1207
+ actions (Optional[List[Union]]): Actions to perform
1208
+ agent (Optional[AgentOptions]): Agent configuration
1209
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
1210
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1211
+ **kwargs: Additional parameters to pass to the API
1212
+
1213
+ Returns:
1214
+ BatchScrapeStatusResponse with:
1215
+ * Scraping status and progress
1216
+ * Scraped content for each URL
1217
+ * Success/error information
1218
+
1219
+ Raises:
1220
+ Exception: If batch scrape fails
1221
+ """
1222
+ # Validate any additional kwargs
1223
+ self._validate_kwargs(kwargs, "batch_scrape_urls")
1224
+
1225
+ scrape_params = {}
1226
+
1227
+ # Add individual parameters
1228
+ if formats is not None:
1229
+ scrape_params['formats'] = formats
1230
+ if headers is not None:
1231
+ scrape_params['headers'] = headers
1232
+ if include_tags is not None:
1233
+ scrape_params['includeTags'] = include_tags
1234
+ if exclude_tags is not None:
1235
+ scrape_params['excludeTags'] = exclude_tags
1236
+ if only_main_content is not None:
1237
+ scrape_params['onlyMainContent'] = only_main_content
1238
+ if wait_for is not None:
1239
+ scrape_params['waitFor'] = wait_for
1240
+ if timeout is not None:
1241
+ scrape_params['timeout'] = timeout
1242
+ if location is not None:
1243
+ scrape_params['location'] = location.dict(exclude_none=True)
1244
+ if mobile is not None:
1245
+ scrape_params['mobile'] = mobile
1246
+ if skip_tls_verification is not None:
1247
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1248
+ if remove_base64_images is not None:
1249
+ scrape_params['removeBase64Images'] = remove_base64_images
1250
+ if block_ads is not None:
1251
+ scrape_params['blockAds'] = block_ads
1252
+ if proxy is not None:
1253
+ scrape_params['proxy'] = proxy
1254
+ if extract is not None:
1255
+ if hasattr(extract.schema, 'schema'):
1256
+ extract.schema = extract.schema.schema()
1257
+ scrape_params['extract'] = extract.dict(exclude_none=True)
1258
+ if json_options is not None:
1259
+ if hasattr(json_options.schema, 'schema'):
1260
+ json_options.schema = json_options.schema.schema()
1261
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
1262
+ if actions is not None:
1263
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1264
+ if agent is not None:
1265
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1266
+
1267
+ # Add any additional kwargs
1268
+ scrape_params.update(kwargs)
1269
+
1270
+ # Create final params object
1271
+ final_params = ScrapeParams(**scrape_params)
1272
+ params_dict = final_params.dict(exclude_none=True)
1273
+ params_dict['urls'] = urls
1274
+ params_dict['origin'] = f"python-sdk@{version}"
1275
+
1276
+ # Make request
1277
+ headers = self._prepare_headers(idempotency_key)
1278
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1279
+
1280
+ if response.status_code == 200:
1281
+ try:
1282
+ id = response.json().get('id')
1283
+ except:
1284
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1285
+ return self._monitor_job_status(id, headers, poll_interval)
1286
+ else:
1287
+ self._handle_error(response, 'start batch scrape job')
1288
+
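Blocking batch sketch (not part of the diff): like crawl_url(), batch_scrape_urls() polls the job until completion; the URLs are illustrative.

    batch = app.batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
        poll_interval=2,
    )
    print(batch.status, batch.completed, "/", batch.total)
    for doc in batch.data:
        print(doc.markdown[:200] if doc.markdown else None)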
1289
+ def async_batch_scrape_urls(
1290
+ self,
1291
+ urls: List[str],
1292
+ *,
1293
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1294
+ headers: Optional[Dict[str, str]] = None,
1295
+ include_tags: Optional[List[str]] = None,
1296
+ exclude_tags: Optional[List[str]] = None,
1297
+ only_main_content: Optional[bool] = None,
1298
+ wait_for: Optional[int] = None,
1299
+ timeout: Optional[int] = None,
1300
+ location: Optional[LocationConfig] = None,
1301
+ mobile: Optional[bool] = None,
1302
+ skip_tls_verification: Optional[bool] = None,
1303
+ remove_base64_images: Optional[bool] = None,
1304
+ block_ads: Optional[bool] = None,
1305
+ proxy: Optional[Literal["basic", "stealth"]] = None,
1306
+ extract: Optional[JsonConfig] = None,
1307
+ json_options: Optional[JsonConfig] = None,
1308
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1309
+ agent: Optional[AgentOptions] = None,
1310
+ idempotency_key: Optional[str] = None,
1311
+ **kwargs
1312
+ ) -> BatchScrapeResponse:
1313
+ """
1314
+ Initiate a batch scrape job asynchronously.
1315
+
1316
+ Args:
1317
+ urls (List[str]): URLs to scrape
1318
+ formats (Optional[List[Literal]]): Content formats to retrieve
1319
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1320
+ include_tags (Optional[List[str]]): HTML tags to include
1321
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1322
+ only_main_content (Optional[bool]): Extract main content only
1323
+ wait_for (Optional[int]): Wait time in milliseconds
1324
+ timeout (Optional[int]): Request timeout in milliseconds
1325
+ location (Optional[LocationConfig]): Location configuration
1326
+ mobile (Optional[bool]): Use mobile user agent
1327
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1328
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1329
+ block_ads (Optional[bool]): Block advertisements
1330
+ proxy (Optional[Literal]): Proxy type to use
1331
+ extract (Optional[JsonConfig]): Content extraction config
1332
+ json_options (Optional[JsonConfig]): JSON extraction config
1333
+ actions (Optional[List[Union]]): Actions to perform
1334
+ agent (Optional[AgentOptions]): Agent configuration
1335
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1336
+ **kwargs: Additional parameters to pass to the API
1337
+
1338
+ Returns:
1339
+ BatchScrapeResponse with:
1340
+ * success - Whether job started successfully
1341
+ * id - Unique identifier for the job
1342
+ * url - Status check URL
1343
+ * error - Error message if start failed
1344
+
1345
+ Raises:
1346
+ Exception: If job initiation fails
1347
+ """
1348
+ # Validate any additional kwargs
1349
+ self._validate_kwargs(kwargs, "async_batch_scrape_urls")
1350
+
1351
+ scrape_params = {}
1352
+
1353
+ # Add individual parameters
1354
+ if formats is not None:
1355
+ scrape_params['formats'] = formats
1356
+ if headers is not None:
1357
+ scrape_params['headers'] = headers
1358
+ if include_tags is not None:
1359
+ scrape_params['includeTags'] = include_tags
1360
+ if exclude_tags is not None:
1361
+ scrape_params['excludeTags'] = exclude_tags
1362
+ if only_main_content is not None:
1363
+ scrape_params['onlyMainContent'] = only_main_content
1364
+ if wait_for is not None:
1365
+ scrape_params['waitFor'] = wait_for
1366
+ if timeout is not None:
1367
+ scrape_params['timeout'] = timeout
1368
+ if location is not None:
1369
+ scrape_params['location'] = location.dict(exclude_none=True)
1370
+ if mobile is not None:
1371
+ scrape_params['mobile'] = mobile
1372
+ if skip_tls_verification is not None:
1373
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1374
+ if remove_base64_images is not None:
1375
+ scrape_params['removeBase64Images'] = remove_base64_images
1376
+ if block_ads is not None:
1377
+ scrape_params['blockAds'] = block_ads
1378
+ if proxy is not None:
1379
+ scrape_params['proxy'] = proxy
1380
+ if extract is not None:
1381
+ if hasattr(extract.schema, 'schema'):
1382
+ extract.schema = extract.schema.schema()
1383
+ scrape_params['extract'] = extract.dict(exclude_none=True)
1384
+ if json_options is not None:
1385
+ if hasattr(json_options.schema, 'schema'):
1386
+ json_options.schema = json_options.schema.schema()
1387
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
1388
+ if actions is not None:
1389
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1390
+ if agent is not None:
1391
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1392
+
1393
+ # Add any additional kwargs
1394
+ scrape_params.update(kwargs)
1395
+
1396
+ # Create final params object
1397
+ final_params = ScrapeParams(**scrape_params)
1398
+ params_dict = final_params.dict(exclude_none=True)
1399
+ params_dict['urls'] = urls
1400
+ params_dict['origin'] = f"python-sdk@{version}"
1401
+
1402
+ # Make request
1403
+ headers = self._prepare_headers(idempotency_key)
1404
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1405
+
1406
+ if response.status_code == 200:
1407
+ try:
1408
+ return BatchScrapeResponse(**response.json())
1409
+ except:
1410
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1411
+ else:
1412
+ self._handle_error(response, 'start batch scrape job')
1413
+
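Non-blocking batch sketch (not part of the diff): async_batch_scrape_urls() submits the job and returns a BatchScrapeResponse; the id is presumably consumed by a batch status method defined later in the file.

    job = app.async_batch_scrape_urls(["https://example.com", "https://example.org"])
    print(job.id)
    if job.invalidURLs:
        print("skipped:", job.invalidURLs)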
1414
+ def batch_scrape_urls_and_watch(
1415
+ self,
1416
+ urls: List[str],
1417
+ *,
1418
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1419
+ headers: Optional[Dict[str, str]] = None,
1420
+ include_tags: Optional[List[str]] = None,
1421
+ exclude_tags: Optional[List[str]] = None,
1422
+ only_main_content: Optional[bool] = None,
1423
+ wait_for: Optional[int] = None,
1424
+ timeout: Optional[int] = None,
1425
+ location: Optional[LocationConfig] = None,
1426
+ mobile: Optional[bool] = None,
1427
+ skip_tls_verification: Optional[bool] = None,
1428
+ remove_base64_images: Optional[bool] = None,
1429
+ block_ads: Optional[bool] = None,
1430
+ proxy: Optional[Literal["basic", "stealth"]] = None,
1431
+ extract: Optional[JsonConfig] = None,
1432
+ json_options: Optional[JsonConfig] = None,
1433
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1434
+ agent: Optional[AgentOptions] = None,
1435
+ idempotency_key: Optional[str] = None,
1436
+ **kwargs
1437
+ ) -> 'CrawlWatcher':
1438
+ """
1439
+ Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
1440
+
1441
+ Args:
1442
+ urls (List[str]): URLs to scrape
1443
+ formats (Optional[List[Literal]]): Content formats to retrieve
1444
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1445
+ include_tags (Optional[List[str]]): HTML tags to include
1446
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1447
+ only_main_content (Optional[bool]): Extract main content only
1448
+ wait_for (Optional[int]): Wait time in milliseconds
1449
+ timeout (Optional[int]): Request timeout in milliseconds
1450
+ location (Optional[LocationConfig]): Location configuration
1451
+ mobile (Optional[bool]): Use mobile user agent
1452
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1453
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1454
+ block_ads (Optional[bool]): Block advertisements
1455
+ proxy (Optional[Literal]): Proxy type to use
1456
+ extract (Optional[JsonConfig]): Content extraction config
1457
+ json_options (Optional[JsonConfig]): JSON extraction config
1458
+ actions (Optional[List[Union]]): Actions to perform
1459
+ agent (Optional[AgentOptions]): Agent configuration
1460
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1461
+ **kwargs: Additional parameters to pass to the API
1462
+
1463
+ Returns:
1464
+ CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
1465
+
1466
+ Raises:
1467
+ Exception: If batch scrape job fails to start
1468
+ """
1469
+ # Validate any additional kwargs
1470
+ self._validate_kwargs(kwargs, "batch_scrape_urls_and_watch")
1471
+
1472
+ scrape_params = {}
1473
+
1474
+ # Add individual parameters
1475
+ if formats is not None:
1476
+ scrape_params['formats'] = formats
1477
+ if headers is not None:
1478
+ scrape_params['headers'] = headers
1479
+ if include_tags is not None:
1480
+ scrape_params['includeTags'] = include_tags
1481
+ if exclude_tags is not None:
1482
+ scrape_params['excludeTags'] = exclude_tags
1483
+ if only_main_content is not None:
1484
+ scrape_params['onlyMainContent'] = only_main_content
1485
+ if wait_for is not None:
1486
+ scrape_params['waitFor'] = wait_for
1487
+ if timeout is not None:
1488
+ scrape_params['timeout'] = timeout
1489
+ if location is not None:
1490
+ scrape_params['location'] = location.dict(exclude_none=True)
1491
+ if mobile is not None:
1492
+ scrape_params['mobile'] = mobile
1493
+ if skip_tls_verification is not None:
1494
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1495
+ if remove_base64_images is not None:
1496
+ scrape_params['removeBase64Images'] = remove_base64_images
1497
+ if block_ads is not None:
1498
+ scrape_params['blockAds'] = block_ads
1499
+ if proxy is not None:
1500
+ scrape_params['proxy'] = proxy
1501
+ if extract is not None:
1502
+ if hasattr(extract.schema, 'schema'):
1503
+ extract.schema = extract.schema.schema()
1504
+ scrape_params['extract'] = extract.dict(exclude_none=True)
1505
+ if json_options is not None:
1506
+ if hasattr(json_options.schema, 'schema'):
1507
+ json_options.schema = json_options.schema.schema()
1508
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
1509
+ if actions is not None:
1510
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1511
+ if agent is not None:
1512
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1513
+
1514
+ # Add any additional kwargs
1515
+ scrape_params.update(kwargs)
1516
+
1517
+ # Create final params object
1518
+ final_params = ScrapeParams(**scrape_params)
1519
+ params_dict = final_params.dict(exclude_none=True)
1520
+ params_dict['urls'] = urls
1521
+ params_dict['origin'] = f"python-sdk@{version}"
1522
+
1523
+ # Make request
1524
+ headers = self._prepare_headers(idempotency_key)
1525
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1526
+
1527
+ if response.status_code == 200:
1528
+ try:
1529
+ crawl_response = BatchScrapeResponse(**response.json())
1530
+ if crawl_response.success and crawl_response.id:
1531
+ return CrawlWatcher(crawl_response.id, self)
1532
+ else:
1533
+ raise Exception("Batch scrape job failed to start")
1534
+ except (ValueError, TypeError, pydantic.ValidationError):  # parsing/validation errors only, so the failure raised above is not masked
1535
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1536
+ else:
1537
+ self._handle_error(response, 'start batch scrape job')
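+ # Illustrative sketch: the returned CrawlWatcher still needs its WebSocket opened, e.g. with
+ # asyncio.run(...). `app` and the URL list are assumptions.
+ #
+ #   watcher = app.batch_scrape_urls_and_watch(["https://example.com"], formats=["markdown"])
+ #   watcher.add_event_listener("done", lambda detail: print("finished with", len(detail["data"]), "documents"))
+ #   asyncio.run(watcher.connect())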
1538
+
1539
+ def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
1540
+ """
1541
+ Check the status of a batch scrape job using the Firecrawl API.
1542
+
1543
+ Args:
1544
+ id (str): The ID of the batch scrape job.
1545
+
1546
+ Returns:
1547
+ BatchScrapeStatusResponse: The status of the batch scrape job.
1548
+
1549
+ Raises:
1550
+ Exception: If the status check request fails.
1551
+ """
1552
+ endpoint = f'/v1/batch/scrape/{id}'
1553
+
1554
+ headers = self._prepare_headers()
1555
+ response = self._get_request(f'{self.api_url}{endpoint}', headers)
1556
+ if response.status_code == 200:
1557
+ try:
1558
+ status_data = response.json()
1559
+ except:
1560
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1561
+ if status_data['status'] == 'completed':
1562
+ if 'data' in status_data:
1563
+ data = status_data['data']
1564
+ while 'next' in status_data:
1565
+ if len(status_data['data']) == 0:
1566
+ break
1567
+ next_url = status_data.get('next')
1568
+ if not next_url:
1569
+ logger.warning("Expected 'next' URL is missing.")
1570
+ break
1571
+ try:
1572
+ status_response = self._get_request(next_url, headers)
1573
+ if status_response.status_code != 200:
1574
+ logger.error(f"Failed to fetch next page: {status_response.status_code}")
1575
+ break
1576
+ try:
1577
+ next_data = status_response.json()
1578
+ except:
1579
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1580
+ data.extend(next_data.get('data', []))
1581
+ status_data = next_data
1582
+ except Exception as e:
1583
+ logger.error(f"Error during pagination request: {e}")
1584
+ break
1585
+ status_data['data'] = data
1586
+
1587
+ return BatchScrapeStatusResponse(**{
1588
+ 'success': False if 'error' in status_data else True,
1589
+ 'status': status_data.get('status'),
1590
+ 'total': status_data.get('total'),
1591
+ 'completed': status_data.get('completed'),
1592
+ 'creditsUsed': status_data.get('creditsUsed'),
1593
+ 'expiresAt': status_data.get('expiresAt'),
1594
+ 'data': status_data.get('data'),
1595
+ 'next': status_data.get('next'),
1596
+ 'error': status_data.get('error')
1597
+ })
1598
+ else:
1599
+ self._handle_error(response, 'check batch scrape status')
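+ # Illustrative sketch, assuming `job_id` came from async_batch_scrape_urls on an `app` instance:
+ #
+ #   status = app.check_batch_scrape_status(job_id)
+ #   if status.status == "completed":
+ #       docs = status.data or []
+ #   else:
+ #       pass  # still running (or failed); inspect status.status / status.error and poll again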
1600
+
1601
+ def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
1602
+ """
1603
+ Returns information about batch scrape errors.
1604
+
1605
+ Args:
1606
+ id (str): The ID of the crawl job.
1607
+
1608
+ Returns:
1609
+ CrawlErrorsResponse: A response containing:
1610
+ * errors (List[Dict[str, str]]): List of errors with fields:
1611
+ * id (str): Error ID
1612
+ * timestamp (str): When the error occurred
1613
+ * url (str): URL that caused the error
1614
+ * error (str): Error message
1615
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1616
+
1617
+ Raises:
1618
+ Exception: If the error check request fails
1619
+ """
1620
+ headers = self._prepare_headers()
1621
+ response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
1622
+ if response.status_code == 200:
1623
+ try:
1624
+ return CrawlErrorsResponse(**response.json())
1625
+ except:
1626
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1627
+ else:
1628
+ self._handle_error(response, "check batch scrape errors")
1629
+
1630
+ def extract(
1631
+ self,
1632
+ urls: Optional[List[str]] = None,
1633
+ *,
1634
+ prompt: Optional[str] = None,
1635
+ schema: Optional[Any] = None,
1636
+ system_prompt: Optional[str] = None,
1637
+ allow_external_links: Optional[bool] = False,
1638
+ enable_web_search: Optional[bool] = False,
1639
+ show_sources: Optional[bool] = False,
1640
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1641
+ """
1642
+ Extract structured information from URLs.
1643
+
1644
+ Args:
1645
+ urls (Optional[List[str]]): URLs to extract from
1646
+ prompt (Optional[str]): Custom extraction prompt
1647
+ schema (Optional[Any]): JSON schema/Pydantic model
1648
+ system_prompt (Optional[str]): System context
1649
+ allow_external_links (Optional[bool]): Follow external links
1650
+ enable_web_search (Optional[bool]): Enable web search
1651
+ show_sources (Optional[bool]): Include source URLs
1652
+ agent (Optional[Dict[str, Any]]): Agent configuration
1653
+
1654
+ Returns:
1655
+ ExtractResponse[Any] with:
1656
+ * success (bool): Whether request succeeded
1657
+ * data (Optional[Any]): Extracted data matching schema
1658
+ * error (Optional[str]): Error message if any
1659
+
1660
+ Raises:
1661
+ ValueError: If prompt/schema missing or extraction fails
1662
+ """
1663
+ headers = self._prepare_headers()
1664
+
1665
+ if not prompt and not schema:
1666
+ raise ValueError("Either prompt or schema is required")
1667
+
1668
+ if not urls and not prompt:
1669
+ raise ValueError("Either urls or prompt is required")
1670
+
1671
+ if schema:
1672
+ if hasattr(schema, 'model_json_schema'):
1673
+ # Convert Pydantic model to JSON schema
1674
+ schema = schema.model_json_schema()
1675
+ # Otherwise assume it's already a JSON schema dict
1676
+
1677
+ request_data = {
1678
+ 'urls': urls or [],
1679
+ 'allowExternalLinks': allow_external_links,
1680
+ 'enableWebSearch': enable_web_search,
1681
+ 'showSources': show_sources,
1682
+ 'schema': schema,
1683
+ 'origin': f'python-sdk@{get_version()}'
1684
+ }
1685
+
1686
+ # Only add prompt and systemPrompt if they exist
1687
+ if prompt:
1688
+ request_data['prompt'] = prompt
1689
+ if system_prompt:
1690
+ request_data['systemPrompt'] = system_prompt
1691
+
1692
+ if agent:
1693
+ request_data['agent'] = agent
1694
+
1695
+ try:
1696
+ # Send the initial extract request
1697
+ response = self._post_request(
1698
+ f'{self.api_url}/v1/extract',
1699
+ request_data,
1700
+ headers
1701
+ )
1702
+ if response.status_code == 200:
1703
+ try:
1704
+ data = response.json()
1705
+ except:
1706
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1707
+ if data['success']:
1708
+ job_id = data.get('id')
1709
+ if not job_id:
1710
+ raise Exception('Job ID not returned from extract request.')
1711
+
1712
+ # Poll for the extract status
1713
+ while True:
1714
+ status_response = self._get_request(
1715
+ f'{self.api_url}/v1/extract/{job_id}',
1716
+ headers
1717
+ )
1718
+ if status_response.status_code == 200:
1719
+ try:
1720
+ status_data = status_response.json()
1721
+ except:
1722
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1723
+ if status_data['status'] == 'completed':
1724
+ return ExtractResponse(**status_data)
1725
+ elif status_data['status'] in ['failed', 'cancelled']:
1726
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
1727
+ else:
1728
+ self._handle_error(status_response, "extract-status")
1729
+
1730
+ time.sleep(2) # Polling interval
1731
+ else:
1732
+ raise Exception(f'Failed to extract. Error: {data["error"]}')
1733
+ else:
1734
+ self._handle_error(response, "extract")
1735
+ except Exception as e:
1736
+ raise ValueError(str(e), 500)
1737
+
1738
+ return ExtractResponse(success=False, error="Internal server error.")
1739
+
1740
+ def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
1741
+ """
1742
+ Retrieve the status of an extract job.
1743
+
1744
+ Args:
1745
+ job_id (str): The ID of the extract job.
1746
+
1747
+ Returns:
1748
+ ExtractResponse[Any]: The status of the extract job.
1749
+
1750
+ Raises:
1751
+ ValueError: If there is an error retrieving the status.
1752
+ """
1753
+ headers = self._prepare_headers()
1754
+ try:
1755
+ response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
1756
+ if response.status_code == 200:
1757
+ try:
1758
+ return ExtractResponse(**response.json())
1759
+ except:
1760
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1761
+ else:
1762
+ self._handle_error(response, "get extract status")
1763
+ except Exception as e:
1764
+ raise ValueError(str(e), 500)
1765
+
1766
+ def async_extract(
1767
+ self,
1768
+ urls: Optional[List[str]] = None,
1769
+ *,
1770
+ prompt: Optional[str] = None,
1771
+ schema: Optional[Any] = None,
1772
+ system_prompt: Optional[str] = None,
1773
+ allow_external_links: Optional[bool] = False,
1774
+ enable_web_search: Optional[bool] = False,
1775
+ show_sources: Optional[bool] = False,
1776
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1777
+ """
1778
+ Initiate an asynchronous extract job.
1779
+
1780
+ Args:
1781
+ urls (List[str]): URLs to extract information from
1782
+ prompt (Optional[str]): Custom extraction prompt
1783
+ schema (Optional[Any]): JSON schema/Pydantic model
1784
+ system_prompt (Optional[str]): System context
1785
+ allow_external_links (Optional[bool]): Follow external links
1786
+ enable_web_search (Optional[bool]): Enable web search
1787
+ show_sources (Optional[bool]): Include source URLs
1788
+ agent (Optional[Dict[str, Any]]): Agent configuration
1790
+
1791
+ Returns:
1792
+ ExtractResponse[Any] with:
1793
+ * success (bool): Whether request succeeded
1794
+ * data (Optional[Any]): Extracted data matching schema
1795
+ * error (Optional[str]): Error message if any
1796
+
1797
+ Raises:
1798
+ ValueError: If job initiation fails
1799
+ """
1800
+ headers = self._prepare_headers()
1801
+
1802
+ # Normalize the schema argument: accept either a Pydantic model class or a plain JSON schema dict
1803
+ if schema:
1804
+ if hasattr(schema, 'model_json_schema'):
1805
+ # Convert Pydantic model to JSON schema
1806
+ schema = schema.model_json_schema()
1807
+ # Otherwise assume it's already a JSON schema dict
1808
+
1809
+ request_data = {
1810
+ 'urls': urls,
1811
+ 'allowExternalLinks': allow_external_links,
1812
+ 'enableWebSearch': enable_web_search,
1813
+ 'showSources': show_sources,
1814
+ 'schema': schema,
1815
+ 'origin': f'python-sdk@{version}'
1816
+ }
1817
+
1818
+ if prompt:
1819
+ request_data['prompt'] = prompt
1820
+ if system_prompt:
1821
+ request_data['systemPrompt'] = system_prompt
1822
+ if agent:
1823
+ request_data['agent'] = agent
1824
+
1825
+ try:
1826
+ response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
1827
+ if response.status_code == 200:
1828
+ try:
1829
+ return ExtractResponse(**response.json())
1830
+ except:
1831
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1832
+ else:
1833
+ self._handle_error(response, "async extract")
1834
+ except Exception as e:
1835
+ raise ValueError(str(e), 500)
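+ # Illustrative sketch: async_extract only starts the job; poll it with get_extract_status.
+ # The URL and schema dict are placeholders, and the job id field is read defensively.
+ #
+ #   started = app.async_extract(["https://example.com"], schema={"type": "object", "properties": {"title": {"type": "string"}}})
+ #   job_id = getattr(started, "id", None)
+ #   if started.success and job_id:
+ #       status = app.get_extract_status(job_id)  # repeat until its status reports completion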
1836
+
1837
+ def generate_llms_text(
1838
+ self,
1839
+ url: str,
1840
+ *,
1841
+ max_urls: Optional[int] = None,
1842
+ show_full_text: Optional[bool] = None,
1843
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
1844
+ """
1845
+ Generate LLMs.txt for a given URL and poll until completion.
1846
+
1847
+ Args:
1848
+ url (str): Target URL to generate LLMs.txt from
1849
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
1850
+ show_full_text (Optional[bool]): Include full text in output (default: False)
1851
+ experimental_stream (Optional[bool]): Enable experimental streaming
1852
+
1853
+ Returns:
1854
+ GenerateLLMsTextStatusResponse with:
1855
+ * Generated LLMs.txt content
1856
+ * Full version if requested
1857
+ * Generation status
1858
+ * Success/error information
1859
+
1860
+ Raises:
1861
+ Exception: If generation fails
1862
+ """
1863
+ params = GenerateLLMsTextParams(
1864
+ maxUrls=max_urls,
1865
+ showFullText=show_full_text,
1866
+ __experimental_stream=experimental_stream
1867
+ )
1868
+
1869
+ response = self.async_generate_llms_text(
1870
+ url,
1871
+ max_urls=max_urls,
1872
+ show_full_text=show_full_text,
1873
+ experimental_stream=experimental_stream
1874
+ )
1875
+
1876
+ if not response.success or not response.id:
1877
+ return GenerateLLMsTextStatusResponse(
1878
+ success=False,
1879
+ error='Failed to start LLMs.txt generation',
1880
+ status='failed',
1881
+ expiresAt=''
1882
+ )
1883
+
1884
+ job_id = response.id
1885
+ while True:
1886
+ status = self.check_generate_llms_text_status(job_id)
1887
+
1888
+ if status.status == 'completed':
1889
+ return status
1890
+ elif status.status == 'failed':
1891
+ return status
1892
+ elif status.status != 'processing':
1893
+ return GenerateLLMsTextStatusResponse(
1894
+ success=False,
1895
+ error='LLMs.txt generation job terminated unexpectedly',
1896
+ status='failed',
1897
+ expiresAt=''
1898
+ )
1899
+
1900
+ time.sleep(2) # Polling interval
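+ # Illustrative sketch (the target URL is a placeholder): this helper blocks, polling every
+ # 2 seconds until the generation job reports "completed" or "failed".
+ #
+ #   result = app.generate_llms_text("https://example.com", max_urls=5, show_full_text=True)
+ #   if result.success:
+ #       print(result.status)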
1901
+
1902
+ def async_generate_llms_text(
1903
+ self,
1904
+ url: str,
1905
+ *,
1906
+ max_urls: Optional[int] = None,
1907
+ show_full_text: Optional[bool] = None,
1908
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
1909
+ """
1910
+ Initiate an asynchronous LLMs.txt generation operation.
1911
+
1912
+ Args:
1913
+ url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
1914
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
1915
+ show_full_text (Optional[bool]): Include full text in output (default: False)
1916
+ experimental_stream (Optional[bool]): Enable experimental streaming
1917
+
1918
+ Returns:
1919
+ GenerateLLMsTextResponse: A response containing:
1920
+ * success (bool): Whether the generation initiation was successful
1921
+ * id (str): The unique identifier for the generation job
1922
+ * error (str, optional): Error message if initiation failed
1923
+
1924
+ Raises:
1925
+ Exception: If the generation job initiation fails.
1926
+ """
1927
+ params = GenerateLLMsTextParams(
1928
+ maxUrls=max_urls,
1929
+ showFullText=show_full_text,
1930
+ __experimental_stream=experimental_stream
1931
+ )
1932
+
1933
+ headers = self._prepare_headers()
1934
+ json_data = {'url': url, **params.dict(exclude_none=True)}
1935
+ json_data['origin'] = f"python-sdk@{version}"
1936
+
1937
+ try:
1938
+ req = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
1939
+ response = req.json()
1940
+ print("json_data", json_data)
1941
+ print("response", response)
1942
+ if response.get('success'):
1943
+ try:
1944
+ return GenerateLLMsTextResponse(**response)
1945
+ except:
1946
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1947
+ else:
1948
+ self._handle_error(req, 'start LLMs.txt generation')  # pass the HTTP response object, not the parsed JSON
1949
+ except Exception as e:
1950
+ raise ValueError(str(e))
1951
+
1952
+ return GenerateLLMsTextResponse(
1953
+ success=False,
1954
+ error='Internal server error'
1955
+ )
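+ # Illustrative sketch of the non-blocking variant, paired with a manual status check
+ # (`app` and the URL are assumptions):
+ #
+ #   started = app.async_generate_llms_text("https://example.com")
+ #   if started.success and started.id:
+ #       status = app.check_generate_llms_text_status(started.id)  # "processing" -> "completed"/"failed"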
1956
+
1957
+ def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
1958
+ """
1959
+ Check the status of a LLMs.txt generation operation.
1960
+
1961
+ Args:
1962
+ id (str): The unique identifier of the LLMs.txt generation job to check status for.
1963
+
1964
+ Returns:
1965
+ GenerateLLMsTextStatusResponse: A response containing:
1966
+ * success (bool): Whether the generation was successful
1967
+ * status (str): Status of generation ("processing", "completed", "failed")
1968
+ * data (Dict[str, str], optional): Generated text with fields:
1969
+ * llmstxt (str): Generated LLMs.txt content
1970
+ * llmsfulltxt (str, optional): Full version if requested
1971
+ * error (str, optional): Error message if generation failed
1972
+ * expiresAt (str): When the generated data expires
1973
+
1974
+ Raises:
1975
+ Exception: If the status check fails.
1976
+ """
1977
+ headers = self._prepare_headers()
1978
+ try:
1979
+ response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers)
1980
+ if response.status_code == 200:
1981
+ try:
1982
+ json_data = response.json()
1983
+ return GenerateLLMsTextStatusResponse(**json_data)
1984
+ except Exception as e:
1985
+ raise Exception(f'Failed to parse Firecrawl response as GenerateLLMsTextStatusResponse: {str(e)}')
1986
+ elif response.status_code == 404:
1987
+ raise Exception('LLMs.txt generation job not found')
1988
+ else:
1989
+ self._handle_error(response, 'check LLMs.txt generation status')
1990
+ except Exception as e:
1991
+ raise ValueError(str(e))
1992
+
1993
+ return GenerateLLMsTextStatusResponse(success=False, error='Internal server error', status='failed', expiresAt='')
1994
+
1995
+ def _prepare_headers(
1996
+ self,
1997
+ idempotency_key: Optional[str] = None) -> Dict[str, str]:
1998
+ """
1999
+ Prepare the headers for API requests.
2000
+
2001
+ Args:
2002
+ idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
2003
+
2004
+ Returns:
2005
+ Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
2006
+ """
2007
+ if idempotency_key:
2008
+ return {
2009
+ 'Content-Type': 'application/json',
2010
+ 'Authorization': f'Bearer {self.api_key}',
2011
+ 'x-idempotency-key': idempotency_key
2012
+ }
2013
+
2014
+ return {
2015
+ 'Content-Type': 'application/json',
2016
+ 'Authorization': f'Bearer {self.api_key}',
2017
+ }
2018
+
2019
+ def _post_request(
2020
+ self,
2021
+ url: str,
2022
+ data: Dict[str, Any],
2023
+ headers: Dict[str, str],
2024
+ retries: int = 3,
2025
+ backoff_factor: float = 0.5) -> requests.Response:
2026
+ """
2027
+ Make a POST request with retries.
2028
+
2029
+ Args:
2030
+ url (str): The URL to send the POST request to.
2031
+ data (Dict[str, Any]): The JSON data to include in the POST request.
2032
+ headers (Dict[str, str]): The headers to include in the POST request.
2033
+ retries (int): Number of retries for the request.
2034
+ backoff_factor (float): Backoff factor for retries.
2035
+
2036
+ Returns:
2037
+ requests.Response: The response from the POST request.
2038
+
2039
+ Raises:
2040
+ requests.RequestException: If the request fails after the specified retries.
2041
+ """
2042
+ for attempt in range(retries):
2043
+ response = requests.post(url, headers=headers, json=data, timeout=((data["timeout"] + 5000) if "timeout" in data else None))
2044
+ if response.status_code == 502:
2045
+ time.sleep(backoff_factor * (2 ** attempt))
2046
+ else:
2047
+ return response
2048
+ return response
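+ # Worked example of the retry schedule above: only HTTP 502 responses are retried, and the
+ # sleep before retrying attempt n is backoff_factor * 2**n, so with the defaults
+ # (retries=3, backoff_factor=0.5) the delays are 0.5s, 1.0s and 2.0s before the last
+ # 502 response is returned as-is.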
2049
+
2050
+ def _get_request(
2051
+ self,
2052
+ url: str,
2053
+ headers: Dict[str, str],
2054
+ retries: int = 3,
2055
+ backoff_factor: float = 0.5) -> requests.Response:
2056
+ """
2057
+ Make a GET request with retries.
2058
+
2059
+ Args:
2060
+ url (str): The URL to send the GET request to.
2061
+ headers (Dict[str, str]): The headers to include in the GET request.
2062
+ retries (int): Number of retries for the request.
2063
+ backoff_factor (float): Backoff factor for retries.
2064
+
2065
+ Returns:
2066
+ requests.Response: The response from the GET request.
2067
+
2068
+ Raises:
2069
+ requests.RequestException: If the request fails after the specified retries.
2070
+ """
2071
+ for attempt in range(retries):
2072
+ response = requests.get(url, headers=headers)
2073
+ if response.status_code == 502:
2074
+ time.sleep(backoff_factor * (2 ** attempt))
2075
+ else:
2076
+ return response
2077
+ return response
2078
+
2079
+ def _delete_request(
2080
+ self,
2081
+ url: str,
2082
+ headers: Dict[str, str],
2083
+ retries: int = 3,
2084
+ backoff_factor: float = 0.5) -> requests.Response:
2085
+ """
2086
+ Make a DELETE request with retries.
2087
+
2088
+ Args:
2089
+ url (str): The URL to send the DELETE request to.
2090
+ headers (Dict[str, str]): The headers to include in the DELETE request.
2091
+ retries (int): Number of retries for the request.
2092
+ backoff_factor (float): Backoff factor for retries.
2093
+
2094
+ Returns:
2095
+ requests.Response: The response from the DELETE request.
2096
+
2097
+ Raises:
2098
+ requests.RequestException: If the request fails after the specified retries.
2099
+ """
2100
+ for attempt in range(retries):
2101
+ response = requests.delete(url, headers=headers)
2102
+ if response.status_code == 502:
2103
+ time.sleep(backoff_factor * (2 ** attempt))
2104
+ else:
2105
+ return response
2106
+ return response
2107
+
2108
+ def _monitor_job_status(
2109
+ self,
2110
+ id: str,
2111
+ headers: Dict[str, str],
2112
+ poll_interval: int) -> CrawlStatusResponse:
2113
+ """
2114
+ Monitor the status of a crawl job until completion.
2115
+
2116
+ Args:
2117
+ id (str): The ID of the crawl job.
2118
+ headers (Dict[str, str]): The headers to include in the status check requests.
2119
+ poll_interval (int): Seconds between status checks.
2120
+
2121
+ Returns:
2122
+ CrawlStatusResponse: The crawl results if the job is completed successfully.
2123
+
2124
+ Raises:
2125
+ Exception: If the job fails or an error occurs during status checks.
2126
+ """
2127
+ while True:
2128
+ api_url = f'{self.api_url}/v1/crawl/{id}'
2129
+
2130
+ status_response = self._get_request(api_url, headers)
2131
+ if status_response.status_code == 200:
2132
+ try:
2133
+ status_data = status_response.json()
2134
+ except:
2135
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
2136
+ if status_data['status'] == 'completed':
2137
+ if 'data' in status_data:
2138
+ data = status_data['data']
2139
+ while 'next' in status_data:
2140
+ if len(status_data['data']) == 0:
2141
+ break
2142
+ status_response = self._get_request(status_data['next'], headers)
2143
+ try:
2144
+ status_data = status_response.json()
2145
+ except:
2146
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
2147
+ data.extend(status_data.get('data', []))
2148
+ status_data['data'] = data
2149
+ return CrawlStatusResponse(**status_data)
2150
+ else:
2151
+ raise Exception('Crawl job completed but no data was returned')
2152
+ elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
2153
+ poll_interval = max(poll_interval, 2)
2154
+ time.sleep(poll_interval) # Wait for the specified interval before checking again
2155
+ else:
2156
+ raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
2157
+ else:
2158
+ self._handle_error(status_response, 'check crawl status')
2159
+
2160
+ def _handle_error(
2161
+ self,
2162
+ response: requests.Response,
2163
+ action: str) -> None:
2164
+ """
2165
+ Handle errors from API responses.
2166
+
2167
+ Args:
2168
+ response (requests.Response): The response object from the API request.
2169
+ action (str): Description of the action that was being performed.
2170
+
2171
+ Raises:
2172
+ Exception: An exception with a message containing the status code and error details from the response.
2173
+ """
2174
+ try:
2175
+ error_message = response.json().get('error', 'No error message provided.')
2176
+ error_details = response.json().get('details', 'No additional error details provided.')
2177
+ except:
2178
+ raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
2179
+
2180
+ message = self._get_error_message(response.status_code, action, error_message, error_details)
2181
+
2182
+ # Raise an HTTPError with the custom message and attach the response
2183
+ raise requests.exceptions.HTTPError(message, response=response)
2184
+
2185
+ def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2186
+ """
2187
+ Generate a standardized error message based on HTTP status code.
2188
+
2189
+ Args:
2190
+ status_code (int): The HTTP status code from the response
2191
+ action (str): Description of the action that was being performed
2192
+ error_message (str): The error message from the API response
2193
+ error_details (str): Additional error details from the API response
2194
+
2195
+ Returns:
2196
+ str: A formatted error message
2197
+ """
2198
+ if status_code == 402:
2199
+ return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
2200
+ elif status_code == 403:
2201
+ message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
2202
+ elif status_code == 408:
2203
+ return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
2204
+ elif status_code == 409:
2205
+ return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
2206
+ elif status_code == 500:
2207
+ return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
2208
+ else:
2209
+ return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
2210
+
2211
+ def deep_research(
2212
+ self,
2213
+ query: str,
2214
+ *,
2215
+ max_depth: Optional[int] = None,
2216
+ time_limit: Optional[int] = None,
2217
+ max_urls: Optional[int] = None,
2218
+ analysis_prompt: Optional[str] = None,
2219
+ system_prompt: Optional[str] = None,
2220
+ __experimental_stream_steps: Optional[bool] = None,
2221
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
2222
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
2223
+ """
2224
+ Initiates a deep research operation on a given query and polls until completion.
2225
+
2226
+ Args:
2227
+ query (str): Research query or topic to investigate
2228
+ max_depth (Optional[int]): Maximum depth of research exploration
2229
+ time_limit (Optional[int]): Time limit in seconds for research
2230
+ max_urls (Optional[int]): Maximum number of URLs to process
2231
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2232
+ system_prompt (Optional[str]): Custom system prompt
2233
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2234
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
2235
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
2236
+
2237
+ Returns:
2238
+ DeepResearchStatusResponse containing:
2239
+ * success (bool): Whether research completed successfully
2240
+ * status (str): Current state (processing/completed/failed)
2241
+ * error (Optional[str]): Error message if failed
2242
+ * id (str): Unique identifier for the research job
2243
+ * data (Any): Research findings and analysis
2244
+ * sources (List[Dict]): List of discovered sources
2245
+ * activities (List[Dict]): Research progress log
2246
+ * summaries (List[str]): Generated research summaries
2247
+
2248
+ Raises:
2249
+ Exception: If research fails
2250
+ """
2251
+ research_params = {}
2252
+ if max_depth is not None:
2253
+ research_params['maxDepth'] = max_depth
2254
+ if time_limit is not None:
2255
+ research_params['timeLimit'] = time_limit
2256
+ if max_urls is not None:
2257
+ research_params['maxUrls'] = max_urls
2258
+ if analysis_prompt is not None:
2259
+ research_params['analysisPrompt'] = analysis_prompt
2260
+ if system_prompt is not None:
2261
+ research_params['systemPrompt'] = system_prompt
2262
+ if __experimental_stream_steps is not None:
2263
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2264
+ research_params = DeepResearchParams(**research_params)
2265
+
2266
+ response = self.async_deep_research(
2267
+ query,
2268
+ max_depth=max_depth,
2269
+ time_limit=time_limit,
2270
+ max_urls=max_urls,
2271
+ analysis_prompt=analysis_prompt,
2272
+ system_prompt=system_prompt
2273
+ )
2274
+ if not response.get('success') or 'id' not in response:
2275
+ return response
2276
+
2277
+ job_id = response['id']
2278
+ last_activity_count = 0
2279
+ last_source_count = 0
2280
+
2281
+ while True:
2282
+ status = self.check_deep_research_status(job_id)
2283
+
2284
+ if on_activity and 'activities' in status:
2285
+ new_activities = status['activities'][last_activity_count:]
2286
+ for activity in new_activities:
2287
+ on_activity(activity)
2288
+ last_activity_count = len(status['activities'])
2289
+
2290
+ if on_source and 'sources' in status:
2291
+ new_sources = status['sources'][last_source_count:]
2292
+ for source in new_sources:
2293
+ on_source(source)
2294
+ last_source_count = len(status['sources'])
2295
+
2296
+ if status['status'] == 'completed':
2297
+ return status
2298
+ elif status['status'] == 'failed':
2299
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
2300
+ elif status['status'] != 'processing':
2301
+ break
2302
+
2303
+ time.sleep(2) # Polling interval
2304
+
2305
+ return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
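+ # Illustrative sketch: the activity callback receives plain dicts as documented above.
+ # The query and limits are placeholders.
+ #
+ #   def log_activity(activity):
+ #       print(activity.get("type"), activity.get("message"))
+ #
+ #   research = app.deep_research("history of the HTTP protocol", max_depth=2, time_limit=120,
+ #                                on_activity=log_activity)
+ #   if research.get("status") == "completed":
+ #       print(research.get("data"))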
2306
+
2307
+ def async_deep_research(
2308
+ self,
2309
+ query: str,
2310
+ *,
2311
+ max_depth: Optional[int] = None,
2312
+ time_limit: Optional[int] = None,
2313
+ max_urls: Optional[int] = None,
2314
+ analysis_prompt: Optional[str] = None,
2315
+ system_prompt: Optional[str] = None,
2316
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
2317
+ """
2318
+ Initiates an asynchronous deep research operation.
2319
+
2320
+ Args:
2321
+ query (str): Research query or topic to investigate
2322
+ max_depth (Optional[int]): Maximum depth of research exploration
2323
+ time_limit (Optional[int]): Time limit in seconds for research
2324
+ max_urls (Optional[int]): Maximum number of URLs to process
2325
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2326
+ system_prompt (Optional[str]): Custom system prompt
2327
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2328
+
2329
+ Returns:
2330
+ Dict[str, Any]: A response containing:
2331
+ * success (bool): Whether the research initiation was successful
2332
+ * id (str): The unique identifier for the research job
2333
+ * error (str, optional): Error message if initiation failed
2334
+
2335
+ Raises:
2336
+ Exception: If the research initiation fails.
2337
+ """
2338
+ research_params = {}
2339
+ if max_depth is not None:
2340
+ research_params['maxDepth'] = max_depth
2341
+ if time_limit is not None:
2342
+ research_params['timeLimit'] = time_limit
2343
+ if max_urls is not None:
2344
+ research_params['maxUrls'] = max_urls
2345
+ if analysis_prompt is not None:
2346
+ research_params['analysisPrompt'] = analysis_prompt
2347
+ if system_prompt is not None:
2348
+ research_params['systemPrompt'] = system_prompt
2349
+ if __experimental_stream_steps is not None:
2350
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2351
+ research_params = DeepResearchParams(**research_params)
2352
+
2353
+ headers = self._prepare_headers()
2354
+
2355
+ json_data = {'query': query, **research_params.dict(exclude_none=True)}
2356
+ json_data['origin'] = f"python-sdk@{version}"
2357
+
2358
+ # Handle json options schema if present
2359
+ if 'jsonOptions' in json_data:
2360
+ json_opts = json_data['jsonOptions']
2361
+ if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
2362
+ json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
2363
+
2364
+ try:
2365
+ response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
2366
+ if response.status_code == 200:
2367
+ try:
2368
+ return response.json()
2369
+ except:
2370
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2371
+ else:
2372
+ self._handle_error(response, 'start deep research')
2373
+ except Exception as e:
2374
+ raise ValueError(str(e))
2375
+
2376
+ return {'success': False, 'error': 'Internal server error'}
2377
+
2378
+ def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
2379
+ """
2380
+ Check the status of a deep research operation.
2381
+
2382
+ Args:
2383
+ id (str): The ID of the deep research operation.
2384
+
2385
+ Returns:
2386
+ DeepResearchStatusResponse containing:
2387
+
2388
+ Status:
2389
+ * success - Whether research completed successfully
2390
+ * status - Current state (processing/completed/failed)
2391
+ * error - Error message if failed
2392
+
2393
+ Results:
2394
+ * id - Unique identifier for the research job
2395
+ * data - Research findings and analysis
2396
+ * sources - List of discovered sources
2397
+ * activities - Research progress log
2398
+ * summaries - Generated research summaries
2399
+
2400
+ Raises:
2401
+ Exception: If the status check fails.
2402
+ """
2403
+ headers = self._prepare_headers()
2404
+ try:
2405
+ response = self._get_request(f'{self.api_url}/v1/deep-research/{id}', headers)
2406
+ if response.status_code == 200:
2407
+ try:
2408
+ return response.json()
2409
+ except:
2410
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2411
+ elif response.status_code == 404:
2412
+ raise Exception('Deep research job not found')
2413
+ else:
2414
+ self._handle_error(response, 'check deep research status')
2415
+ except Exception as e:
2416
+ raise ValueError(str(e))
2417
+
2418
+ return {'success': False, 'error': 'Internal server error'}
2419
+
2420
+ def _validate_kwargs(self, kwargs: Dict[str, Any], method_name: str) -> None:
2421
+ """
2422
+ Validate additional keyword arguments before they are passed to the API.
2423
+ This provides early validation before the Pydantic model validation.
2424
+
2425
+ Args:
2426
+ kwargs (Dict[str, Any]): Additional keyword arguments to validate
2427
+ method_name (str): Name of the method these kwargs are for
2428
+
2429
+ Raises:
2430
+ ValueError: If kwargs contain invalid or unsupported parameters
2431
+ """
2432
+ if not kwargs:
2433
+ return
2434
+
2435
+ # Known parameter mappings for each method
2436
+ method_params = {
2437
+ "scrape_url": {"formats", "include_tags", "exclude_tags", "only_main_content", "wait_for",
2438
+ "timeout", "location", "mobile", "skip_tls_verification", "remove_base64_images",
2439
+ "block_ads", "proxy", "extract", "json_options", "actions", "change_tracking_options"},
2440
+ "search": {"limit", "tbs", "filter", "lang", "country", "location", "timeout", "scrape_options"},
2441
+ "crawl_url": {"include_paths", "exclude_paths", "max_depth", "max_discovery_depth", "limit",
2442
+ "allow_backward_links", "allow_external_links", "ignore_sitemap", "scrape_options",
2443
+ "webhook", "deduplicate_similar_urls", "ignore_query_parameters", "regex_on_full_url"},
2444
+ "map_url": {"search", "ignore_sitemap", "include_subdomains", "sitemap_only", "limit", "timeout"},
2445
+ "batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2446
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2447
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2448
+ "actions", "agent", "webhook"},
2449
+ "async_batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2450
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2451
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2452
+ "actions", "agent", "webhook"},
2453
+ "batch_scrape_urls_and_watch": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2454
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2455
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2456
+ "actions", "agent", "webhook"}
2457
+ }
2458
+
2459
+ # Get allowed parameters for this method
2460
+ allowed_params = method_params.get(method_name, set())
2461
+
2462
+ # Check for unknown parameters
2463
+ unknown_params = set(kwargs.keys()) - allowed_params
2464
+ if unknown_params:
2465
+ raise ValueError(f"Unsupported parameter(s) for {method_name}: {', '.join(unknown_params)}. Please refer to the API documentation for the correct parameters.")
2466
+
2467
+ # Additional type validation can be added here if needed
2468
+ # For now, we rely on Pydantic models for detailed type validation
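+ # Illustrative sketch of the early validation (the bogus keyword is intentional):
+ #
+ #   try:
+ #       app.async_batch_scrape_urls(["https://example.com"], not_a_real_option=True)
+ #   except ValueError as e:
+ #       print(e)  # Unsupported parameter(s) for async_batch_scrape_urls: not_a_real_option. ...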
2469
+
2470
+ class CrawlWatcher:
2471
+ """
2472
+ A class to watch and handle crawl job events via WebSocket connection.
2473
+
2474
+ Attributes:
2475
+ id (str): The ID of the crawl job to watch
2476
+ app (FirecrawlApp): The FirecrawlApp instance
2477
+ data (List[Dict[str, Any]]): List of crawled documents/data
2478
+ status (str): Current status of the crawl job
2479
+ ws_url (str): WebSocket URL for the crawl job
2480
+ event_handlers (dict): Dictionary of event type to list of handler functions
2481
+ """
2482
+ def __init__(self, id: str, app: FirecrawlApp):
2483
+ self.id = id
2484
+ self.app = app
2485
+ self.data: List[Dict[str, Any]] = []
2486
+ self.status = "scraping"
2487
+ self.ws_url = f"{app.api_url.replace('http', 'ws')}/v1/crawl/{id}"
2488
+ self.event_handlers = {
2489
+ 'done': [],
2490
+ 'error': [],
2491
+ 'document': []
2492
+ }
2493
+
2494
+ async def connect(self) -> None:
2495
+ """
2496
+ Establishes WebSocket connection and starts listening for messages.
2497
+ """
2498
+ async with websockets.connect(
2499
+ self.ws_url,
2500
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
2501
+ ) as websocket:
2502
+ await self._listen(websocket)
2503
+
2504
+ async def _listen(self, websocket) -> None:
2505
+ """
2506
+ Listens for incoming WebSocket messages and handles them.
2507
+
2508
+ Args:
2509
+ websocket: The WebSocket connection object
2510
+ """
2511
+ async for message in websocket:
2512
+ msg = json.loads(message)
2513
+ await self._handle_message(msg)
2514
+
2515
+ def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
2516
+ """
2517
+ Adds an event handler function for a specific event type.
2518
+
2519
+ Args:
2520
+ event_type (str): Type of event to listen for ('done', 'error', or 'document')
2521
+ handler (Callable): Function to handle the event
2522
+ """
2523
+ if event_type in self.event_handlers:
2524
+ self.event_handlers[event_type].append(handler)
2525
+
2526
+ def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
2527
+ """
2528
+ Dispatches an event to all registered handlers for that event type.
2529
+
2530
+ Args:
2531
+ event_type (str): Type of event to dispatch
2532
+ detail (Dict[str, Any]): Event details/data to pass to handlers
2533
+ """
2534
+ if event_type in self.event_handlers:
2535
+ for handler in self.event_handlers[event_type]:
2536
+ handler(detail)
2537
+
2538
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
2539
+ """
2540
+ Handles incoming WebSocket messages based on their type.
2541
+
2542
+ Args:
2543
+ msg (Dict[str, Any]): The message to handle
2544
+ """
2545
+ if msg['type'] == 'done':
2546
+ self.status = 'completed'
2547
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
2548
+ elif msg['type'] == 'error':
2549
+ self.status = 'failed'
2550
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
2551
+ elif msg['type'] == 'catchup':
2552
+ self.status = msg['data']['status']
2553
+ self.data.extend(msg['data'].get('data', []))
2554
+ for doc in self.data:
2555
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
2556
+ elif msg['type'] == 'document':
2557
+ self.data.append(msg['data'])
2558
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
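+ # Illustrative sketch: a watcher can also be constructed directly from a known job id
+ # (`job_id` and `app` are assumptions):
+ #
+ #   watcher = CrawlWatcher(job_id, app)
+ #   watcher.add_event_listener("document", lambda d: print("received document for job", d["id"]))
+ #   watcher.add_event_listener("done", lambda d: print("finished with", len(d["data"]), "documents"))
+ #   asyncio.run(watcher.connect())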
2559
+
2560
+ class AsyncFirecrawlApp(FirecrawlApp):
2561
+ """
2562
+ Asynchronous version of FirecrawlApp that implements async methods using aiohttp.
2563
+ Provides non-blocking alternatives to all FirecrawlApp operations.
2564
+ """
2565
+
2566
+ async def _async_request(
2567
+ self,
2568
+ method: str,
2569
+ url: str,
2570
+ headers: Dict[str, str],
2571
+ data: Optional[Dict[str, Any]] = None,
2572
+ retries: int = 3,
2573
+ backoff_factor: float = 0.5) -> Dict[str, Any]:
2574
+ """
2575
+ Generic async request method with exponential backoff retry logic.
2576
+
2577
+ Args:
2578
+ method (str): The HTTP method to use (e.g., "GET" or "POST").
2579
+ url (str): The URL to send the request to.
2580
+ headers (Dict[str, str]): Headers to include in the request.
2581
+ data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
2582
+ retries (int): Maximum number of retry attempts (default: 3).
2583
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2584
+ Delay will be backoff_factor * (2 ** retry_count).
2585
+
2586
+ Returns:
2587
+ Dict[str, Any]: The parsed JSON response from the server.
2588
+
2589
+ Raises:
2590
+ aiohttp.ClientError: If the request fails after all retries.
2591
+ Exception: If max retries are exceeded or other errors occur.
2592
+ """
2593
+ async with aiohttp.ClientSession() as session:
2594
+ for attempt in range(retries):
2595
+ try:
2596
+ async with session.request(
2597
+ method=method, url=url, headers=headers, json=data
2598
+ ) as response:
2599
+ if response.status == 502:
2600
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2601
+ continue
2602
+ if response.status >= 300:
2603
+ await self._handle_error(response, f"make {method} request")
2604
+ return await response.json()
2605
+ except aiohttp.ClientError as e:
2606
+ if attempt == retries - 1:
2607
+ raise e
2608
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2609
+ raise Exception("Max retries exceeded")
2610
+
2611
+ async def _async_post_request(
2612
+ self, url: str, data: Dict[str, Any], headers: Dict[str, str],
2613
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2614
+ """
2615
+ Make an async POST request with exponential backoff retry logic.
2616
+
2617
+ Args:
2618
+ url (str): The URL to send the POST request to.
2619
+ data (Dict[str, Any]): The JSON data to include in the request body.
2620
+ headers (Dict[str, str]): Headers to include in the request.
2621
+ retries (int): Maximum number of retry attempts (default: 3).
2622
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2623
+ Delay will be backoff_factor * (2 ** retry_count).
2624
+
2625
+ Returns:
2626
+ Dict[str, Any]: The parsed JSON response from the server.
2627
+
2628
+ Raises:
2629
+ aiohttp.ClientError: If the request fails after all retries.
2630
+ Exception: If max retries are exceeded or other errors occur.
2631
+ """
2632
+ return await self._async_request("POST", url, headers, data, retries, backoff_factor)
2633
+
2634
+ async def _async_get_request(
2635
+ self, url: str, headers: Dict[str, str],
2636
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2637
+ """
2638
+ Make an async GET request with exponential backoff retry logic.
2639
+
2640
+ Args:
2641
+ url (str): The URL to send the GET request to.
2642
+ headers (Dict[str, str]): Headers to include in the request.
2643
+ retries (int): Maximum number of retry attempts (default: 3).
2644
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2645
+ Delay will be backoff_factor * (2 ** retry_count).
2646
+
2647
+ Returns:
2648
+ Dict[str, Any]: The parsed JSON response from the server.
2649
+
2650
+ Raises:
2651
+ aiohttp.ClientError: If the request fails after all retries.
2652
+ Exception: If max retries are exceeded or other errors occur.
2653
+ """
2654
+ return await self._async_request("GET", url, headers, None, retries, backoff_factor)
2655
+
2656
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
2657
+ """
2658
+ Handle errors from async API responses with detailed error messages.
2659
+
2660
+ Args:
2661
+ response (aiohttp.ClientResponse): The response object from the failed request
2662
+ action (str): Description of the action that was being attempted
2663
+
2664
+ Raises:
2665
+ aiohttp.ClientError: With a detailed error message based on the response status:
2666
+ - 402: Payment Required
2667
+ - 408: Request Timeout
2668
+ - 409: Conflict
2669
+ - 500: Internal Server Error
2670
+ - Other: Unexpected error with status code
2671
+ """
2672
+ try:
2673
+ error_data = await response.json()
2674
+ error_message = error_data.get('error', 'No error message provided.')
2675
+ error_details = error_data.get('details', 'No additional error details provided.')
2676
+ except:
2677
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
2678
+
2679
+ message = await self._get_async_error_message(response.status, action, error_message, error_details)
2680
+
2681
+ raise aiohttp.ClientError(message)
2682
+
2683
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2684
+ """
2685
+ Generate a standardized error message based on HTTP status code for async operations.
2686
+
2687
+ Args:
2688
+ status_code (int): The HTTP status code from the response
2689
+ action (str): Description of the action that was being performed
2690
+ error_message (str): The error message from the API response
2691
+ error_details (str): Additional error details from the API response
2692
+
2693
+ Returns:
2694
+ str: A formatted error message
2695
+ """
2696
+ return self._get_error_message(status_code, action, error_message, error_details)
2697
+
2698
+ async def crawl_url_and_watch(
2699
+ self,
2700
+ url: str,
2701
+ params: Optional[CrawlParams] = None,
2702
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2703
+ """
2704
+ Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket.
2705
+
2706
+ Args:
2707
+ url (str): Target URL to start crawling from
2708
+ params (Optional[CrawlParams]): See CrawlParams model for configuration:
2709
+ URL Discovery:
2710
+ * includePaths - Patterns of URLs to include
2711
+ * excludePaths - Patterns of URLs to exclude
2712
+ * maxDepth - Maximum crawl depth
2713
+ * maxDiscoveryDepth - Maximum depth for finding new URLs
2714
+ * limit - Maximum pages to crawl
2715
+
2716
+ Link Following:
2717
+ * allowBackwardLinks - Follow parent directory links
2718
+ * allowExternalLinks - Follow external domain links
2719
+ * ignoreSitemap - Skip sitemap.xml processing
2720
+
2721
+ Advanced:
2722
+ * scrapeOptions - Page scraping configuration
2723
+ * webhook - Notification webhook settings
2724
+ * deduplicateSimilarURLs - Remove similar URLs
2725
+ * ignoreQueryParameters - Ignore URL parameters
2726
+ * regexOnFullURL - Apply regex to full URLs
2727
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2728
+
2729
+ Returns:
2730
+ AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket
2731
+
2732
+ Raises:
2733
+ Exception: If crawl job fails to start
2734
+ """
2735
+ crawl_response = await self.async_crawl_url(url, params, idempotency_key)
2736
+ if crawl_response.get('success') and 'id' in crawl_response:
2737
+ return AsyncCrawlWatcher(crawl_response['id'], self)
2738
+ else:
2739
+ raise Exception("Crawl job failed to start")
2740
+
2741
+ async def batch_scrape_urls_and_watch(
2742
+ self,
2743
+ urls: List[str],
2744
+ params: Optional[ScrapeParams] = None,
2745
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2746
+ """
2747
+ Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress.
2748
+
2749
+ Args:
2750
+ urls (List[str]): List of URLs to scrape
2751
+ params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
2752
+
2753
+ Content Options:
2754
+ * formats - Content formats to retrieve
2755
+ * includeTags - HTML tags to include
2756
+ * excludeTags - HTML tags to exclude
2757
+ * onlyMainContent - Extract main content only
2758
+
2759
+ Request Options:
2760
+ * headers - Custom HTTP headers
2761
+ * timeout - Request timeout (ms)
2762
+ * mobile - Use mobile user agent
2763
+ * proxy - Proxy type
2764
+
2765
+ Extraction Options:
2766
+ * extract - Content extraction config
2767
+ * jsonOptions - JSON extraction config
2768
+ * actions - Actions to perform
2769
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2770
+
2771
+ Returns:
2772
+ AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket
2773
+
2774
+ Raises:
2775
+ Exception: If batch scrape job fails to start
2776
+ """
2777
+ batch_response = await self.async_batch_scrape_urls(urls, params, idempotency_key)
2778
+ if batch_response.get('success') and 'id' in batch_response:
2779
+ return AsyncCrawlWatcher(batch_response['id'], self)
2780
+ else:
2781
+ raise Exception("Batch scrape job failed to start")
2782
+
2783
+ async def scrape_url(
2784
+ self,
2785
+ url: str,
2786
+ *,
2787
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
2788
+ include_tags: Optional[List[str]] = None,
2789
+ exclude_tags: Optional[List[str]] = None,
2790
+ only_main_content: Optional[bool] = None,
2791
+ wait_for: Optional[int] = None,
2792
+ timeout: Optional[int] = None,
2793
+ location: Optional[LocationConfig] = None,
2794
+ mobile: Optional[bool] = None,
2795
+ skip_tls_verification: Optional[bool] = None,
2796
+ remove_base64_images: Optional[bool] = None,
2797
+ block_ads: Optional[bool] = None,
2798
+ proxy: Optional[Literal["basic", "stealth"]] = None,
2799
+ extract: Optional[JsonConfig] = None,
2800
+ json_options: Optional[JsonConfig] = None,
2801
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2802
+ **kwargs) -> ScrapeResponse[Any]:
2803
+ """
2804
+ Scrape a single URL asynchronously.
2805
+
2806
+ Args:
2807
+ url (str): Target URL to scrape
2808
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
2809
+ include_tags (Optional[List[str]]): HTML tags to include
2810
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
2811
+ only_main_content (Optional[bool]): Extract main content only
2812
+ wait_for (Optional[int]): Wait for a specific element to appear
2813
+ timeout (Optional[int]): Request timeout (ms)
2814
+ location (Optional[LocationConfig]): Location configuration
2815
+ mobile (Optional[bool]): Use mobile user agent
2816
+ skip_tls_verification (Optional[bool]): Skip TLS verification
2817
+ remove_base64_images (Optional[bool]): Remove base64 images
2818
+ block_ads (Optional[bool]): Block ads
2819
+ proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
2820
+ extract (Optional[JsonConfig]): Content extraction settings
2821
+ json_options (Optional[JsonConfig]): JSON extraction settings
2822
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
2823
+ **kwargs: Additional parameters to pass to the API
2824
+
2825
+ Returns:
2826
+ ScrapeResponse with:
2827
+ * success - Whether scrape was successful
2828
+ * markdown - Markdown content if requested
2829
+ * html - HTML content if requested
2830
+ * rawHtml - Raw HTML content if requested
2831
+ * links - Extracted links if requested
2832
+ * screenshot - Screenshot if requested
2833
+ * extract - Extracted data if requested
2834
+ * json - JSON data if requested
2835
+ * error - Error message if scrape failed
2836
+
2837
+ Raises:
2838
+ Exception: If scraping fails
2839
+ """
2840
+ # Validate any additional kwargs
2841
+ self._validate_kwargs(kwargs, "scrape_url")
2842
+
2843
+ headers = self._prepare_headers()
2844
+
2845
+ # Build scrape parameters
2846
+ scrape_params = {
2847
+ 'url': url,
2848
+ 'origin': f"python-sdk@{version}"
2849
+ }
2850
+
2851
+ # Add optional parameters if provided and not None
2852
+ if formats:
2853
+ scrape_params['formats'] = formats
2854
+ if include_tags:
2855
+ scrape_params['includeTags'] = include_tags
2856
+ if exclude_tags:
2857
+ scrape_params['excludeTags'] = exclude_tags
2858
+ if only_main_content is not None:
2859
+ scrape_params['onlyMainContent'] = only_main_content
2860
+ if wait_for:
2861
+ scrape_params['waitFor'] = wait_for
2862
+ if timeout:
2863
+ scrape_params['timeout'] = timeout
2864
+ if location:
2865
+ scrape_params['location'] = location.dict(exclude_none=True)
2866
+ if mobile is not None:
2867
+ scrape_params['mobile'] = mobile
2868
+ if skip_tls_verification is not None:
2869
+ scrape_params['skipTlsVerification'] = skip_tls_verification
2870
+ if remove_base64_images is not None:
2871
+ scrape_params['removeBase64Images'] = remove_base64_images
2872
+ if block_ads is not None:
2873
+ scrape_params['blockAds'] = block_ads
2874
+ if proxy:
2875
+ scrape_params['proxy'] = proxy
2876
+ if extract:
2877
+ extract_dict = extract.dict(exclude_none=True)
2878
+ if 'schema' in extract_dict and hasattr(extract.schema, 'schema'):
2879
+ extract_dict['schema'] = extract.schema.schema() # Ensure pydantic model schema is converted
2880
+ scrape_params['extract'] = extract_dict
2881
+ if json_options:
2882
+ json_options_dict = json_options.dict(exclude_none=True)
2883
+ if 'schema' in json_options_dict and hasattr(json_options.schema, 'schema'):
2884
+ json_options_dict['schema'] = json_options.schema.schema() # Ensure pydantic model schema is converted
2885
+ scrape_params['jsonOptions'] = json_options_dict
2886
+ if actions:
2887
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
2888
+
2889
+ # Make async request
2890
+ endpoint = f'/v1/scrape'
2891
+ response = await self._async_post_request(
2892
+ f'{self.api_url}{endpoint}',
2893
+ scrape_params,
2894
+ headers
2895
+ )
2896
+
2897
+ if response.get('success') and 'data' in response:
2898
+ return ScrapeResponse(**response['data'])
2899
+ elif "error" in response:
2900
+ raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
2901
+ else:
2902
+ # Use the response content directly if possible, otherwise a generic message
2903
+ error_content = response.get('error', str(response))
2904
+ raise Exception(f'Failed to scrape URL. Error: {error_content}')
2905
+
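For illustration, a minimal usage sketch of the async scrape method above; the import path, constructor signature, and API key value are assumptions for this sketch and are not taken from this diff:

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed export name

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # assumed constructor
        doc = await app.scrape_url(
            "https://example.com",
            formats=["markdown", "links"],
            only_main_content=True,
        )
        print(doc.markdown)  # markdown content, since "markdown" was requested
        print(doc.links)     # extracted links, since "links" was requested

    asyncio.run(main())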
2906
+ async def batch_scrape_urls(
2907
+ self,
2908
+ urls: List[str],
2909
+ *,
2910
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
2911
+ headers: Optional[Dict[str, str]] = None,
2912
+ include_tags: Optional[List[str]] = None,
2913
+ exclude_tags: Optional[List[str]] = None,
2914
+ only_main_content: Optional[bool] = None,
2915
+ wait_for: Optional[int] = None,
2916
+ timeout: Optional[int] = None,
2917
+ location: Optional[LocationConfig] = None,
2918
+ mobile: Optional[bool] = None,
2919
+ skip_tls_verification: Optional[bool] = None,
2920
+ remove_base64_images: Optional[bool] = None,
2921
+ block_ads: Optional[bool] = None,
2922
+ proxy: Optional[Literal["basic", "stealth"]] = None,
2923
+ extract: Optional[JsonConfig] = None,
2924
+ json_options: Optional[JsonConfig] = None,
2925
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2926
+ agent: Optional[AgentOptions] = None,
2927
+ poll_interval: Optional[int] = 2,
2928
+ idempotency_key: Optional[str] = None,
2929
+ **kwargs
2930
+ ) -> BatchScrapeStatusResponse:
2931
+ """
2932
+ Asynchronously scrape multiple URLs and monitor until completion.
2933
+
2934
+ Args:
2935
+ urls (List[str]): URLs to scrape
2936
+ formats (Optional[List[Literal]]): Content formats to retrieve
2937
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
2938
+ include_tags (Optional[List[str]]): HTML tags to include
2939
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
2940
+ only_main_content (Optional[bool]): Extract main content only
2941
+ wait_for (Optional[int]): Wait time in milliseconds
2942
+ timeout (Optional[int]): Request timeout in milliseconds
2943
+ location (Optional[LocationConfig]): Location configuration
2944
+ mobile (Optional[bool]): Use mobile user agent
2945
+ skip_tls_verification (Optional[bool]): Skip TLS verification
2946
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
2947
+ block_ads (Optional[bool]): Block advertisements
2948
+ proxy (Optional[Literal]): Proxy type to use
2949
+ extract (Optional[JsonConfig]): Content extraction config
2950
+ json_options (Optional[JsonConfig]): JSON extraction config
2951
+ actions (Optional[List[Union]]): Actions to perform
2952
+ agent (Optional[AgentOptions]): Agent configuration
2953
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
2954
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2955
+ **kwargs: Additional parameters to pass to the API
2956
+
2957
+ Returns:
2958
+ BatchScrapeStatusResponse with:
2959
+ * Scraping status and progress
2960
+ * Scraped content for each URL
2961
+ * Success/error information
2962
+
2963
+ Raises:
2964
+ Exception: If batch scrape fails
2965
+ """
2966
+ # Validate any additional kwargs
2967
+ self._validate_kwargs(kwargs, "batch_scrape_urls")
2968
+
2969
+ scrape_params = {}
2970
+
2971
+ # Add individual parameters
2972
+ if formats is not None:
2973
+ scrape_params['formats'] = formats
2974
+ if headers is not None:
2975
+ scrape_params['headers'] = headers
2976
+ if include_tags is not None:
2977
+ scrape_params['includeTags'] = include_tags
2978
+ if exclude_tags is not None:
2979
+ scrape_params['excludeTags'] = exclude_tags
2980
+ if only_main_content is not None:
2981
+ scrape_params['onlyMainContent'] = only_main_content
2982
+ if wait_for is not None:
2983
+ scrape_params['waitFor'] = wait_for
2984
+ if timeout is not None:
2985
+ scrape_params['timeout'] = timeout
2986
+ if location is not None:
2987
+ scrape_params['location'] = location.dict(exclude_none=True)
2988
+ if mobile is not None:
2989
+ scrape_params['mobile'] = mobile
2990
+ if skip_tls_verification is not None:
2991
+ scrape_params['skipTlsVerification'] = skip_tls_verification
2992
+ if remove_base64_images is not None:
2993
+ scrape_params['removeBase64Images'] = remove_base64_images
2994
+ if block_ads is not None:
2995
+ scrape_params['blockAds'] = block_ads
2996
+ if proxy is not None:
2997
+ scrape_params['proxy'] = proxy
2998
+ if extract is not None:
2999
+ if hasattr(extract.schema, 'schema'):
3000
+ extract.schema = extract.schema.schema()
3001
+ scrape_params['extract'] = extract.dict(exclude_none=True)
3002
+ if json_options is not None:
3003
+ if hasattr(json_options.schema, 'schema'):
3004
+ json_options.schema = json_options.schema.schema()
3005
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
3006
+ if actions is not None:
3007
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3008
+ if agent is not None:
3009
+ scrape_params['agent'] = agent.dict(exclude_none=True)
3010
+
3011
+ # Add any additional kwargs
3012
+ scrape_params.update(kwargs)
3013
+
3014
+ # Create final params object
3015
+ final_params = ScrapeParams(**scrape_params)
3016
+ params_dict = final_params.dict(exclude_none=True)
3017
+ params_dict['urls'] = urls
3018
+ params_dict['origin'] = f"python-sdk@{version}"
3019
+
3020
+ # Make request
3021
+ headers = self._prepare_headers(idempotency_key)
3022
+ response = await self._async_post_request(
3023
+ f'{self.api_url}/v1/batch/scrape',
3024
+ params_dict,
3025
+ headers
3026
+ )
3027
+
3028
+ if response.get('success'):
3029
+ try:
3030
+ id = response.get('id')
3031
+ except:
3032
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3033
+ return await self._async_monitor_job_status(id, headers, poll_interval)
3034
+ else:
3035
+ self._handle_error(response, 'start batch scrape job')
3036
+
3037
+
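A similar sketch for the waiting batch helper above, reusing an app instance built as in the earlier scrape sketch; attribute access on the returned documents assumes the documented response models:

    async def batch_example(app):
        # app: AsyncFirecrawlApp, constructed as in the earlier sketch
        status = await app.batch_scrape_urls(
            ["https://example.com", "https://example.org"],
            formats=["markdown"],
            poll_interval=2,
        )
        for doc in status.data or []:
            print((doc.markdown or "")[:80])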
3038
+ async def async_batch_scrape_urls(
3039
+ self,
3040
+ urls: List[str],
3041
+ *,
3042
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3043
+ headers: Optional[Dict[str, str]] = None,
3044
+ include_tags: Optional[List[str]] = None,
3045
+ exclude_tags: Optional[List[str]] = None,
3046
+ only_main_content: Optional[bool] = None,
3047
+ wait_for: Optional[int] = None,
3048
+ timeout: Optional[int] = None,
3049
+ location: Optional[LocationConfig] = None,
3050
+ mobile: Optional[bool] = None,
3051
+ skip_tls_verification: Optional[bool] = None,
3052
+ remove_base64_images: Optional[bool] = None,
3053
+ block_ads: Optional[bool] = None,
3054
+ proxy: Optional[Literal["basic", "stealth"]] = None,
3055
+ extract: Optional[JsonConfig] = None,
3056
+ json_options: Optional[JsonConfig] = None,
3057
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
3058
+ agent: Optional[AgentOptions] = None,
3059
+ idempotency_key: Optional[str] = None,
3060
+ **kwargs
3061
+ ) -> BatchScrapeResponse:
3062
+ """
3063
+ Initiate a batch scrape job asynchronously.
3064
+
3065
+ Args:
3066
+ urls (List[str]): URLs to scrape
3067
+ formats (Optional[List[Literal]]): Content formats to retrieve
3068
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
3069
+ include_tags (Optional[List[str]]): HTML tags to include
3070
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
3071
+ only_main_content (Optional[bool]): Extract main content only
3072
+ wait_for (Optional[int]): Wait time in milliseconds
3073
+ timeout (Optional[int]): Request timeout in milliseconds
3074
+ location (Optional[LocationConfig]): Location configuration
3075
+ mobile (Optional[bool]): Use mobile user agent
3076
+ skip_tls_verification (Optional[bool]): Skip TLS verification
3077
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
3078
+ block_ads (Optional[bool]): Block advertisements
3079
+ proxy (Optional[Literal]): Proxy type to use
3080
+ extract (Optional[JsonConfig]): Content extraction config
3081
+ json_options (Optional[JsonConfig]): JSON extraction config
3082
+ actions (Optional[List[Union]]): Actions to perform
3083
+ agent (Optional[AgentOptions]): Agent configuration
3084
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3085
+ **kwargs: Additional parameters to pass to the API
3086
+
3087
+ Returns:
3088
+ BatchScrapeResponse with:
3089
+ * success - Whether job started successfully
3090
+ * id - Unique identifier for the job
3091
+ * url - Status check URL
3092
+ * error - Error message if start failed
3093
+
3094
+ Raises:
3095
+ Exception: If job initiation fails
3096
+ """
3097
+ # Validate any additional kwargs
3098
+ self._validate_kwargs(kwargs, "async_batch_scrape_urls")
3099
+
3100
+ scrape_params = {}
3101
+
3102
+ # Add individual parameters
3103
+ if formats is not None:
3104
+ scrape_params['formats'] = formats
3105
+ if headers is not None:
3106
+ scrape_params['headers'] = headers
3107
+ if include_tags is not None:
3108
+ scrape_params['includeTags'] = include_tags
3109
+ if exclude_tags is not None:
3110
+ scrape_params['excludeTags'] = exclude_tags
3111
+ if only_main_content is not None:
3112
+ scrape_params['onlyMainContent'] = only_main_content
3113
+ if wait_for is not None:
3114
+ scrape_params['waitFor'] = wait_for
3115
+ if timeout is not None:
3116
+ scrape_params['timeout'] = timeout
3117
+ if location is not None:
3118
+ scrape_params['location'] = location.dict(exclude_none=True)
3119
+ if mobile is not None:
3120
+ scrape_params['mobile'] = mobile
3121
+ if skip_tls_verification is not None:
3122
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3123
+ if remove_base64_images is not None:
3124
+ scrape_params['removeBase64Images'] = remove_base64_images
3125
+ if block_ads is not None:
3126
+ scrape_params['blockAds'] = block_ads
3127
+ if proxy is not None:
3128
+ scrape_params['proxy'] = proxy
3129
+ if extract is not None:
3130
+ if hasattr(extract.schema, 'schema'):
3131
+ extract.schema = extract.schema.schema()
3132
+ scrape_params['extract'] = extract.dict(exclude_none=True)
3133
+ if json_options is not None:
3134
+ if hasattr(json_options.schema, 'schema'):
3135
+ json_options.schema = json_options.schema.schema()
3136
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
3137
+ if actions is not None:
3138
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3139
+ if agent is not None:
3140
+ scrape_params['agent'] = agent.dict(exclude_none=True)
3141
+
3142
+ # Add any additional kwargs
3143
+ scrape_params.update(kwargs)
3144
+
3145
+ # Create final params object
3146
+ final_params = ScrapeParams(**scrape_params)
3147
+ params_dict = final_params.dict(exclude_none=True)
3148
+ params_dict['urls'] = urls
3149
+ params_dict['origin'] = f"python-sdk@{version}"
3150
+
3151
+ # Make request
3152
+ headers = self._prepare_headers(idempotency_key)
3153
+ response = await self._async_post_request(
3154
+ f'{self.api_url}/v1/batch/scrape',
3155
+ params_dict,
3156
+ headers
3157
+ )
3158
+
3159
+ if response.get('success'):
3160
+ try:
3161
+ return BatchScrapeResponse(**response)
3162
+ except:
3163
+ raise Exception('Failed to parse Firecrawl response as JSON.')
3164
+ else:
3165
+ self._handle_error(response, 'start batch scrape job')
3166
+
3167
+ async def crawl_url(
3168
+ self,
3169
+ url: str,
3170
+ *,
3171
+ include_paths: Optional[List[str]] = None,
3172
+ exclude_paths: Optional[List[str]] = None,
3173
+ max_depth: Optional[int] = None,
3174
+ max_discovery_depth: Optional[int] = None,
3175
+ limit: Optional[int] = None,
3176
+ allow_backward_links: Optional[bool] = None,
3177
+ allow_external_links: Optional[bool] = None,
3178
+ ignore_sitemap: Optional[bool] = None,
3179
+ scrape_options: Optional[ScrapeOptions] = None,
3180
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3181
+ deduplicate_similar_urls: Optional[bool] = None,
3182
+ ignore_query_parameters: Optional[bool] = None,
3183
+ regex_on_full_url: Optional[bool] = None,
3184
+ poll_interval: Optional[int] = 2,
3185
+ idempotency_key: Optional[str] = None,
3186
+ **kwargs
3187
+ ) -> CrawlStatusResponse:
3188
+ """
3189
+ Crawl a website starting from a URL.
3190
+
3191
+ Args:
3192
+ url (str): Target URL to start crawling from
3193
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3194
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3195
+ max_depth (Optional[int]): Maximum crawl depth
3196
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3197
+ limit (Optional[int]): Maximum pages to crawl
3198
+ allow_backward_links (Optional[bool]): Follow parent directory links
3199
+ allow_external_links (Optional[bool]): Follow external domain links
3200
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3201
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3202
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3203
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3204
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3205
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3206
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
3207
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3208
+ **kwargs: Additional parameters to pass to the API
3209
+
3210
+ Returns:
3211
+ CrawlStatusResponse with:
3212
+ * Crawling status and progress
3213
+ * Crawled page contents
3214
+ * Success/error information
3215
+
3216
+ Raises:
3217
+ Exception: If crawl fails
3218
+ """
3219
+ # Validate any additional kwargs
3220
+ self._validate_kwargs(kwargs, "crawl_url")
3221
+
3222
+ crawl_params = {}
3223
+
3224
+ # Add individual parameters
3225
+ if include_paths is not None:
3226
+ crawl_params['includePaths'] = include_paths
3227
+ if exclude_paths is not None:
3228
+ crawl_params['excludePaths'] = exclude_paths
3229
+ if max_depth is not None:
3230
+ crawl_params['maxDepth'] = max_depth
3231
+ if max_discovery_depth is not None:
3232
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3233
+ if limit is not None:
3234
+ crawl_params['limit'] = limit
3235
+ if allow_backward_links is not None:
3236
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3237
+ if allow_external_links is not None:
3238
+ crawl_params['allowExternalLinks'] = allow_external_links
3239
+ if ignore_sitemap is not None:
3240
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3241
+ if scrape_options is not None:
3242
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3243
+ if webhook is not None:
3244
+ crawl_params['webhook'] = webhook
3245
+ if deduplicate_similar_urls is not None:
3246
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3247
+ if ignore_query_parameters is not None:
3248
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3249
+ if regex_on_full_url is not None:
3250
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3251
+
3252
+ # Add any additional kwargs
3253
+ crawl_params.update(kwargs)
3254
+
3255
+ # Create final params object
3256
+ final_params = CrawlParams(**crawl_params)
3257
+ params_dict = final_params.dict(exclude_none=True)
3258
+ params_dict['url'] = url
3259
+ params_dict['origin'] = f"python-sdk@{version}"
3260
+ # Make request
3261
+ headers = self._prepare_headers(idempotency_key)
3262
+ response = await self._async_post_request(
3263
+ f'{self.api_url}/v1/crawl', params_dict, headers)
3264
+
3265
+ if response.get('success'):
3266
+ try:
3267
+ id = response.get('id')
3268
+ except:
3269
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3270
+ return await self._async_monitor_job_status(id, headers, poll_interval)
3271
+ else:
3272
+ self._handle_error(response, 'start crawl job')
3273
+
3274
+
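A sketch of the waiting crawl helper above, under the same assumptions as the earlier sketches:

    async def crawl_example(app):
        # app: AsyncFirecrawlApp, constructed as in the earlier sketch
        result = await app.crawl_url("https://example.com", limit=5, max_depth=2)
        print(result.status, f"{result.completed}/{result.total} pages")
        for doc in result.data or []:
            print((doc.markdown or "")[:80])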
3275
+ async def async_crawl_url(
3276
+ self,
3277
+ url: str,
3278
+ *,
3279
+ include_paths: Optional[List[str]] = None,
3280
+ exclude_paths: Optional[List[str]] = None,
3281
+ max_depth: Optional[int] = None,
3282
+ max_discovery_depth: Optional[int] = None,
3283
+ limit: Optional[int] = None,
3284
+ allow_backward_links: Optional[bool] = None,
3285
+ allow_external_links: Optional[bool] = None,
3286
+ ignore_sitemap: Optional[bool] = None,
3287
+ scrape_options: Optional[ScrapeOptions] = None,
3288
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3289
+ deduplicate_similar_urls: Optional[bool] = None,
3290
+ ignore_query_parameters: Optional[bool] = None,
3291
+ regex_on_full_url: Optional[bool] = None,
3292
+ poll_interval: Optional[int] = 2,
3293
+ idempotency_key: Optional[str] = None,
3294
+ **kwargs
3295
+ ) -> CrawlResponse:
3296
+ """
3297
+ Start an asynchronous crawl job.
3298
+
3299
+ Args:
3300
+ url (str): Target URL to start crawling from
3301
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3302
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3303
+ max_depth (Optional[int]): Maximum crawl depth
3304
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3305
+ limit (Optional[int]): Maximum pages to crawl
3306
+ allow_backward_links (Optional[bool]): Follow parent directory links
3307
+ allow_external_links (Optional[bool]): Follow external domain links
3308
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3309
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3310
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3311
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3312
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3313
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3314
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3315
+ **kwargs: Additional parameters to pass to the API
3316
+
3317
+ Returns:
3318
+ CrawlResponse with:
3319
+ * success - Whether crawl started successfully
3320
+ * id - Unique identifier for the crawl job
3321
+ * url - Status check URL for the crawl
3322
+ * error - Error message if start failed
3323
+
3324
+ Raises:
3325
+ Exception: If crawl initiation fails
3326
+ """
3327
+ crawl_params = {}
3328
+
3329
+ # Add individual parameters
3330
+ if include_paths is not None:
3331
+ crawl_params['includePaths'] = include_paths
3332
+ if exclude_paths is not None:
3333
+ crawl_params['excludePaths'] = exclude_paths
3334
+ if max_depth is not None:
3335
+ crawl_params['maxDepth'] = max_depth
3336
+ if max_discovery_depth is not None:
3337
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3338
+ if limit is not None:
3339
+ crawl_params['limit'] = limit
3340
+ if allow_backward_links is not None:
3341
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3342
+ if allow_external_links is not None:
3343
+ crawl_params['allowExternalLinks'] = allow_external_links
3344
+ if ignore_sitemap is not None:
3345
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3346
+ if scrape_options is not None:
3347
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3348
+ if webhook is not None:
3349
+ crawl_params['webhook'] = webhook
3350
+ if deduplicate_similar_urls is not None:
3351
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3352
+ if ignore_query_parameters is not None:
3353
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3354
+ if regex_on_full_url is not None:
3355
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3356
+
3357
+ # Add any additional kwargs
3358
+ crawl_params.update(kwargs)
3359
+
3360
+ # Create final params object
3361
+ final_params = CrawlParams(**crawl_params)
3362
+ params_dict = final_params.dict(exclude_none=True)
3363
+ params_dict['url'] = url
3364
+ params_dict['origin'] = f"python-sdk@{version}"
3365
+
3366
+ # Make request
3367
+ headers = self._prepare_headers(idempotency_key)
3368
+ response = await self._async_post_request(
3369
+ f'{self.api_url}/v1/crawl',
3370
+ params_dict,
3371
+ headers
3372
+ )
3373
+
3374
+ if response.get('success'):
3375
+ try:
3376
+ return CrawlResponse(**response)
3377
+ except:
3378
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3379
+ else:
3380
+ self._handle_error(response, 'start crawl job')
3381
+
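A sketch of the fire-and-forget crawl flow: start a job with the method above, then inspect it with check_crawl_status (defined below). Same assumptions as the earlier sketches:

    async def crawl_job_example(app):
        started = await app.async_crawl_url("https://example.com", limit=10)
        print("crawl job id:", started.id)
        status = await app.check_crawl_status(started.id)
        print(status.status, f"{status.completed}/{status.total}")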
3382
+ async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
3383
+ """
3384
+ Check the status and results of an asynchronous crawl job.
3385
+
3386
+ Args:
3387
+ id (str): Unique identifier for the crawl job
3388
+
3389
+ Returns:
3390
+ CrawlStatusResponse containing:
3391
+ Status Information:
3392
+ * status - Current state (scraping/completed/failed/cancelled)
3393
+ * completed - Number of pages crawled
3394
+ * total - Total pages to crawl
3395
+ * creditsUsed - API credits consumed
3396
+ * expiresAt - Data expiration timestamp
3397
+
3398
+ Results:
3399
+ * data - List of crawled documents
3400
+ * next - URL for next page of results (if paginated)
3401
+ * success - Whether status check succeeded
3402
+ * error - Error message if failed
3403
+
3404
+ Raises:
3405
+ Exception: If status check fails
3406
+ """
3407
+ headers = self._prepare_headers()
3408
+ endpoint = f'/v1/crawl/{id}'
3409
+
3410
+ status_data = await self._async_get_request(
3411
+ f'{self.api_url}{endpoint}',
3412
+ headers
3413
+ )
3414
+
3415
+ if status_data.get('status') == 'completed':
3416
+ if 'data' in status_data:
3417
+ data = status_data['data']
3418
+ while 'next' in status_data:
3419
+ if len(status_data['data']) == 0:
3420
+ break
3421
+ next_url = status_data.get('next')
3422
+ if not next_url:
3423
+ logger.warning("Expected 'next' URL is missing.")
3424
+ break
3425
+ next_data = await self._async_get_request(next_url, headers)
3426
+ data.extend(next_data.get('data', []))
3427
+ status_data = next_data
3428
+ status_data['data'] = data
3429
+ # Create CrawlStatusResponse object from status data
3430
+ response = CrawlStatusResponse(
3431
+ status=status_data.get('status'),
3432
+ total=status_data.get('total'),
3433
+ completed=status_data.get('completed'),
3434
+ creditsUsed=status_data.get('creditsUsed'),
3435
+ expiresAt=status_data.get('expiresAt'),
3436
+ data=status_data.get('data'),
3437
+ success=False if 'error' in status_data else True
3438
+ )
3439
+
3440
+ if 'error' in status_data:
3441
+ response.error = status_data.get('error')
3442
+
3443
+ if 'next' in status_data:
3444
+ response.next = status_data.get('next')
3445
+
3446
+ return response
3447
+
3448
+ async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
3449
+ """
3450
+ Monitor the status of an asynchronous job until completion.
3451
+
3452
+ Args:
3453
+ id (str): The ID of the job to monitor
3454
+ headers (Dict[str, str]): Headers to include in status check requests
3455
+ poll_interval (int): Seconds between status checks (default: 2)
3456
+
3457
+ Returns:
3458
+ CrawlStatusResponse: The job results if completed successfully
3459
+
3460
+ Raises:
3461
+ Exception: If the job fails or an error occurs during status checks
3462
+ """
3463
+ while True:
3464
+ status_data = await self._async_get_request(
3465
+ f'{self.api_url}/v1/crawl/{id}',
3466
+ headers
3467
+ )
3468
+
3469
+ if status_data.get('status') == 'completed':
3470
+ if 'data' in status_data:
3471
+ data = status_data['data']
3472
+ while 'next' in status_data:
3473
+ if len(status_data['data']) == 0:
3474
+ break
3475
+ next_url = status_data.get('next')
3476
+ if not next_url:
3477
+ logger.warning("Expected 'next' URL is missing.")
3478
+ break
3479
+ next_data = await self._async_get_request(next_url, headers)
3480
+ data.extend(next_data.get('data', []))
3481
+ status_data = next_data
3482
+ status_data['data'] = data
3483
+ return CrawlStatusResponse(**status_data)
3484
+ else:
3485
+ raise Exception('Job completed but no data was returned')
3486
+ elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
3487
+ await asyncio.sleep(max(poll_interval, 2))
3488
+ else:
3489
+ raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
3490
+
3491
+ async def map_url(
3492
+ self,
3493
+ url: str,
3494
+ *,
3495
+ search: Optional[str] = None,
3496
+ ignore_sitemap: Optional[bool] = None,
3497
+ include_subdomains: Optional[bool] = None,
3498
+ sitemap_only: Optional[bool] = None,
3499
+ limit: Optional[int] = None,
3500
+ timeout: Optional[int] = None,
3501
+ params: Optional[MapParams] = None) -> MapResponse:
3502
+ """
3503
+ Asynchronously map and discover links from a URL.
3504
+
3505
+ Args:
3506
+ url (str): Target URL to map
3507
+ search (Optional[str]): Filter pattern for discovered URLs
3508
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3509
+ include_subdomains (Optional[bool]): Include links on subdomains
3510
+ sitemap_only (Optional[bool]): Only return URLs found in sitemap.xml
3511
+ limit (Optional[int]): Maximum URLs to return
3512
+ timeout (Optional[int]): Request timeout in milliseconds
3513
+ params (Optional[MapParams]): Additional mapping parameters merged with the keyword arguments
3517
+
3518
+ Returns:
3519
+ MapResponse with:
3520
+ * Discovered URLs
3521
+ * Success/error status
3522
+
3523
+ Raises:
3524
+ Exception: If mapping fails
3525
+ """
3526
+ map_params = {}
3527
+ if params:
3528
+ map_params.update(params.dict(exclude_none=True))
3529
+
3530
+ # Add individual parameters
3531
+ if search is not None:
3532
+ map_params['search'] = search
3533
+ if ignore_sitemap is not None:
3534
+ map_params['ignoreSitemap'] = ignore_sitemap
3535
+ if include_subdomains is not None:
3536
+ map_params['includeSubdomains'] = include_subdomains
3537
+ if sitemap_only is not None:
3538
+ map_params['sitemapOnly'] = sitemap_only
3539
+ if limit is not None:
3540
+ map_params['limit'] = limit
3541
+ if timeout is not None:
3542
+ map_params['timeout'] = timeout
3543
+
3544
+ # Create final params object
3545
+ final_params = MapParams(**map_params)
3546
+ params_dict = final_params.dict(exclude_none=True)
3547
+ params_dict['url'] = url
3548
+ params_dict['origin'] = f"python-sdk@{version}"
3549
+
3550
+ # Make request
3551
+ endpoint = f'/v1/map'
3552
+ response = await self._async_post_request(
3553
+ f'{self.api_url}{endpoint}',
3554
+ params_dict,
3555
+ headers={"Authorization": f"Bearer {self.api_key}"}
3556
+ )
3557
+
3558
+ if response.get('success') and 'links' in response:
3559
+ return MapResponse(**response)
3560
+ elif 'error' in response:
3561
+ raise Exception(f'Failed to map URL. Error: {response["error"]}')
3562
+ else:
3563
+ raise Exception(f'Failed to map URL. Error: {response}')
3564
+
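A sketch of the map helper above, assuming MapResponse exposes the discovered URLs as a links field, consistent with the success check in the method body:

    async def map_example(app):
        mapped = await app.map_url("https://example.com", limit=100, include_subdomains=True)
        for link in mapped.links or []:
            print(link)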
3565
+ async def extract(
3566
+ self,
3567
+ urls: Optional[List[str]] = None,
3568
+ *,
3569
+ prompt: Optional[str] = None,
3570
+ schema: Optional[Any] = None,
3571
+ system_prompt: Optional[str] = None,
3572
+ allow_external_links: Optional[bool] = False,
3573
+ enable_web_search: Optional[bool] = False,
3574
+ show_sources: Optional[bool] = False,
3575
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3576
+
3577
+ """
3578
+ Asynchronously extract structured information from URLs.
3579
+
3580
+ Args:
3581
+ urls (Optional[List[str]]): URLs to extract from
3582
+ prompt (Optional[str]): Custom extraction prompt
3583
+ schema (Optional[Any]): JSON schema/Pydantic model
3584
+ system_prompt (Optional[str]): System context
3585
+ allow_external_links (Optional[bool]): Follow external links
3586
+ enable_web_search (Optional[bool]): Enable web search
3587
+ show_sources (Optional[bool]): Include source URLs
3588
+ agent (Optional[Dict[str, Any]]): Agent configuration
3589
+
3590
+ Returns:
3591
+ ExtractResponse with:
3592
+ * Structured data matching schema
3593
+ * Source information if requested
3594
+ * Success/error status
3595
+
3596
+ Raises:
3597
+ ValueError: If neither prompt nor schema is provided; Exception: If the extraction fails
3598
+ """
3599
+ headers = self._prepare_headers()
3600
+
3601
+ if not prompt and not schema:
3602
+ raise ValueError("Either prompt or schema is required")
3603
+
3604
+ if not urls and not prompt:
3605
+ raise ValueError("Either urls or prompt is required")
3606
+
3607
+ if schema:
3608
+ if hasattr(schema, 'model_json_schema'):
3609
+ # Convert Pydantic model to JSON schema
3610
+ schema = schema.model_json_schema()
3611
+ # Otherwise assume it's already a JSON schema dict
3612
+
3613
+ request_data = {
3614
+ 'urls': urls or [],
3615
+ 'allowExternalLinks': allow_external_links,
3616
+ 'enableWebSearch': enable_web_search,
3617
+ 'showSources': show_sources,
3618
+ 'schema': schema,
3619
+ 'origin': f'python-sdk@{get_version()}'
3620
+ }
3621
+
3622
+ # Only add prompt and systemPrompt if they exist
3623
+ if prompt:
3624
+ request_data['prompt'] = prompt
3625
+ if system_prompt:
3626
+ request_data['systemPrompt'] = system_prompt
3627
+
3628
+ if agent:
3629
+ request_data['agent'] = agent
3630
+
3631
+ response = await self._async_post_request(
3632
+ f'{self.api_url}/v1/extract',
3633
+ request_data,
3634
+ headers
3635
+ )
3636
+
3637
+ if response.get('success'):
3638
+ job_id = response.get('id')
3639
+ if not job_id:
3640
+ raise Exception('Job ID not returned from extract request.')
3641
+
3642
+ while True:
3643
+ status_data = await self._async_get_request(
3644
+ f'{self.api_url}/v1/extract/{job_id}',
3645
+ headers
3646
+ )
3647
+
3648
+ if status_data['status'] == 'completed':
3649
+ return ExtractResponse(**status_data)
3650
+ elif status_data['status'] in ['failed', 'cancelled']:
3651
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
3652
+
3653
+ await asyncio.sleep(2)
3654
+ else:
3655
+ raise Exception(f'Failed to extract. Error: {response.get("error")}')
3656
+
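A sketch of the extract helper above with a Pydantic schema; the model fields and URL are illustrative only:

    from pydantic import BaseModel

    class PricingPlan(BaseModel):
        plan_name: str
        monthly_price: str

    async def extract_example(app):
        result = await app.extract(
            urls=["https://example.com/pricing"],
            prompt="Extract the name and monthly price of the cheapest plan",
            schema=PricingPlan,
        )
        print(result.data)  # structured data matching the schema, per the docstring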
3657
+ async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
3658
+ """
3659
+ Check the status of an asynchronous batch scrape job.
3660
+
3661
+ Args:
3662
+ id (str): The ID of the batch scrape job
3663
+
3664
+ Returns:
3665
+ BatchScrapeStatusResponse containing:
3666
+ Status Information:
3667
+ * status - Current state (scraping/completed/failed/cancelled)
3668
+ * completed - Number of URLs scraped
3669
+ * total - Total URLs to scrape
3670
+ * creditsUsed - API credits consumed
3671
+ * expiresAt - Data expiration timestamp
3672
+
3673
+ Results:
3674
+ * data - List of scraped documents
3675
+ * next - URL for next page of results (if paginated)
3676
+ * success - Whether status check succeeded
3677
+ * error - Error message if failed
3678
+
3679
+ Raises:
3680
+ Exception: If status check fails
3681
+ """
3682
+ headers = self._prepare_headers()
3683
+ endpoint = f'/v1/batch/scrape/{id}'
3684
+
3685
+ status_data = await self._async_get_request(
3686
+ f'{self.api_url}{endpoint}',
3687
+ headers
3688
+ )
3689
+
3690
+ if status_data['status'] == 'completed':
3691
+ if 'data' in status_data:
3692
+ data = status_data['data']
3693
+ while 'next' in status_data:
3694
+ if len(status_data['data']) == 0:
3695
+ break
3696
+ next_url = status_data.get('next')
3697
+ if not next_url:
3698
+ logger.warning("Expected 'next' URL is missing.")
3699
+ break
3700
+ next_data = await self._async_get_request(next_url, headers)
3701
+ data.extend(next_data.get('data', []))
3702
+ status_data = next_data
3703
+ status_data['data'] = data
3704
+
3705
+ response = BatchScrapeStatusResponse(
3706
+ status=status_data.get('status'),
3707
+ total=status_data.get('total'),
3708
+ completed=status_data.get('completed'),
3709
+ creditsUsed=status_data.get('creditsUsed'),
3710
+ expiresAt=status_data.get('expiresAt'),
3711
+ data=status_data.get('data'),
3712
+ success=False if 'error' in status_data else True
3713
+ )
3714
+
3715
+ if 'error' in status_data:
3716
+ response.error = status_data.get('error')
3717
+
3718
+ if 'next' in status_data:
3719
+ response.next = status_data.get('next')
3720
+
3721
+ return response
3724
+
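A sketch combining async_batch_scrape_urls with the status check above, under the same assumptions as the earlier sketches:

    async def batch_job_example(app):
        started = await app.async_batch_scrape_urls(
            ["https://example.com", "https://example.org"],
            formats=["markdown"],
        )
        status = await app.check_batch_scrape_status(started.id)
        print(status.status, f"{status.completed}/{status.total}")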
3725
+ async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
3726
+ """
3727
+ Get information about errors from an asynchronous batch scrape job.
3728
+
3729
+ Args:
3730
+ id (str): The ID of the batch scrape job
3731
+
3732
+ Returns:
3733
+ CrawlErrorsResponse containing:
3734
+ errors (List[Dict[str, str]]): List of errors with fields:
3735
+ * id (str): Error ID
3736
+ * timestamp (str): When the error occurred
3737
+ * url (str): URL that caused the error
3738
+ * error (str): Error message
3739
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3740
+
3741
+ Raises:
3742
+ Exception: If error check fails
3743
+ """
3744
+ headers = self._prepare_headers()
3745
+ return await self._async_get_request(
3746
+ f'{self.api_url}/v1/batch/scrape/{id}/errors',
3747
+ headers
3748
+ )
3749
+
3750
+ async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
3751
+ """
3752
+ Get information about errors from an asynchronous crawl job.
3753
+
3754
+ Args:
3755
+ id (str): The ID of the crawl job
3756
+
3757
+ Returns:
3758
+ CrawlErrorsResponse containing:
3759
+ * errors (List[Dict[str, str]]): List of errors with fields:
3760
+ - id (str): Error ID
3761
+ - timestamp (str): When the error occurred
3762
+ - url (str): URL that caused the error
3763
+ - error (str): Error message
3764
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3765
+
3766
+ Raises:
3767
+ Exception: If error check fails
3768
+ """
3769
+ headers = self._prepare_headers()
3770
+ return await self._async_get_request(
3771
+ f'{self.api_url}/v1/crawl/{id}/errors',
3772
+ headers
3773
+ )
3774
+
3775
+ async def cancel_crawl(self, id: str) -> Dict[str, Any]:
3776
+ """
3777
+ Cancel an asynchronous crawl job.
3778
+
3779
+ Args:
3780
+ id (str): The ID of the crawl job to cancel
3781
+
3782
+ Returns:
3783
+ Dict[str, Any] containing:
3784
+ * success (bool): Whether cancellation was successful
3785
+ * error (str, optional): Error message if cancellation failed
3786
+
3787
+ Raises:
3788
+ Exception: If cancellation fails
3789
+ """
3790
+ headers = self._prepare_headers()
3791
+ async with aiohttp.ClientSession() as session:
3792
+ async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
3793
+ return await response.json()
3794
+
3795
+ async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
3796
+ """
3797
+ Check the status of an asynchronous extraction job.
3798
+
3799
+ Args:
3800
+ job_id (str): The ID of the extraction job
3801
+
3802
+ Returns:
3803
+ ExtractResponse[Any] with:
3804
+ * success (bool): Whether request succeeded
3805
+ * data (Optional[Any]): Extracted data matching schema
3806
+ * error (Optional[str]): Error message if any
3807
+ * warning (Optional[str]): Warning message if any
3808
+ * sources (Optional[List[str]]): Source URLs if requested
3809
+
3810
+ Raises:
3811
+ ValueError: If status check fails
3812
+ """
3813
+ headers = self._prepare_headers()
3814
+ try:
3815
+ return await self._async_get_request(
3816
+ f'{self.api_url}/v1/extract/{job_id}',
3817
+ headers
3818
+ )
3819
+ except Exception as e:
3820
+ raise ValueError(str(e))
3821
+
3822
+ async def async_extract(
3823
+ self,
3824
+ urls: Optional[List[str]] = None,
3825
+ *,
3826
+ prompt: Optional[str] = None,
3827
+ schema: Optional[Any] = None,
3828
+ system_prompt: Optional[str] = None,
3829
+ allow_external_links: Optional[bool] = False,
3830
+ enable_web_search: Optional[bool] = False,
3831
+ show_sources: Optional[bool] = False,
3832
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3833
+ """
3834
+ Initiate an asynchronous extraction job without waiting for completion.
3835
+
3836
+ Args:
3837
+ urls (Optional[List[str]]): URLs to extract from
3838
+ prompt (Optional[str]): Custom extraction prompt
3839
+ schema (Optional[Any]): JSON schema/Pydantic model
3840
+ system_prompt (Optional[str]): System context
3841
+ allow_external_links (Optional[bool]): Follow external links
3842
+ enable_web_search (Optional[bool]): Enable web search
3843
+ show_sources (Optional[bool]): Include source URLs
3844
+ agent (Optional[Dict[str, Any]]): Agent configuration
3846
+
3847
+ Returns:
3848
+ ExtractResponse[Any] with:
3849
+ * success (bool): Whether request succeeded
3850
+ * data (Optional[Any]): Extracted data matching schema
3851
+ * error (Optional[str]): Error message if any
3852
+
3853
+ Raises:
3854
+ ValueError: If job initiation fails
3855
+ """
3856
+ headers = self._prepare_headers()
3857
+
3858
+ if not prompt and not schema:
3859
+ raise ValueError("Either prompt or schema is required")
3860
+
3861
+ if not urls and not prompt:
3862
+ raise ValueError("Either urls or prompt is required")
3863
+
3864
+ if schema:
3865
+ if hasattr(schema, 'model_json_schema'):
3866
+ schema = schema.model_json_schema()
3867
+
3868
+ request_data = {
3869
+ 'urls': urls or [],
3870
+ 'allowExternalLinks': allow_external_links,
3871
+ 'enableWebSearch': enable_web_search,
3872
+ 'showSources': show_sources,
3873
+ 'schema': schema,
3874
+ 'origin': f'python-sdk@{version}'
3875
+ }
3876
+
3877
+ if prompt:
3878
+ request_data['prompt'] = prompt
3879
+ if system_prompt:
3880
+ request_data['systemPrompt'] = system_prompt
3881
+ if agent:
3882
+ request_data['agent'] = agent
3883
+
3884
+ try:
3885
+ return await self._async_post_request(
3886
+ f'{self.api_url}/v1/extract',
3887
+ request_data,
3888
+ headers
3889
+ )
3890
+ except Exception as e:
3891
+ raise ValueError(str(e))
3892
+
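A sketch of the non-blocking extract flow: start a job with async_extract, then poll get_extract_status (defined above). Both calls return the parsed API payload as a dict here, so dict access is used:

    async def extract_job_example(app):
        job = await app.async_extract(
            urls=["https://example.com"],
            prompt="Summarize the key product features",
        )
        job_id = job.get("id")
        status = await app.get_extract_status(job_id)
        print(status.get("status"), status.get("data"))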
3893
+ async def generate_llms_text(
3894
+ self,
3895
+ url: str,
3896
+ *,
3897
+ max_urls: Optional[int] = None,
3898
+ show_full_text: Optional[bool] = None,
3899
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
3900
+ """
3901
+ Generate LLMs.txt for a given URL and monitor until completion.
3902
+
3903
+ Args:
3904
+ url (str): Target URL to generate LLMs.txt from
3905
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
3906
+ show_full_text (Optional[bool]): Include full text in output (default: False)
3907
+ experimental_stream (Optional[bool]): Enable experimental streaming
3908
+
3909
+ Returns:
3910
+ GenerateLLMsTextStatusResponse containing:
3911
+ * success (bool): Whether generation completed successfully
3912
+ * status (str): Status of generation (processing/completed/failed)
3913
+ * data (Dict[str, str], optional): Generated text with fields:
3914
+ - llmstxt (str): Generated LLMs.txt content
3915
+ - llmsfulltxt (str, optional): Full version if requested
3916
+ * error (str, optional): Error message if generation failed
3917
+ * expiresAt (str): When the generated data expires
3918
+
3919
+ Raises:
3920
+ Exception: If generation fails
3921
+ """
3922
+ params = {}
3923
+ if max_urls is not None:
3924
+ params['maxUrls'] = max_urls
3925
+ if show_full_text is not None:
3926
+ params['showFullText'] = show_full_text
3927
+ if experimental_stream is not None:
3928
+ params['__experimental_stream'] = experimental_stream
3929
+
3930
+ response = await self.async_generate_llms_text(
3931
+ url,
3932
+ max_urls=max_urls,
3933
+ show_full_text=show_full_text,
3934
+ experimental_stream=experimental_stream
3935
+ )
3936
+ if not response.get('success') or 'id' not in response:
3937
+ return response
3938
+
3939
+ job_id = response['id']
3940
+ while True:
3941
+ status = await self.check_generate_llms_text_status(job_id)
3942
+
3943
+ if status['status'] == 'completed':
3944
+ return status
3945
+ elif status['status'] == 'failed':
3946
+ raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
3947
+ elif status['status'] != 'processing':
3948
+ break
3949
+
3950
+ await asyncio.sleep(2)
3951
+
3952
+ return GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly')
3953
+
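A sketch of the LLMs.txt helper above; dict-style access mirrors the status payload it returns:

    async def llmstxt_example(app):
        result = await app.generate_llms_text("https://example.com", max_urls=5)
        if result.get("success"):
            print(result.get("data", {}).get("llmstxt"))
        else:
            print("generation failed:", result.get("error"))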
3954
+ async def async_generate_llms_text(
3955
+ self,
3956
+ url: str,
3957
+ *,
3958
+ max_urls: Optional[int] = None,
3959
+ show_full_text: Optional[bool] = None,
3960
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
3961
+ """
3962
+ Initiate an asynchronous LLMs.txt generation job without waiting for completion.
3963
+
3964
+ Args:
3965
+ url (str): Target URL to generate LLMs.txt from
3966
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
3967
+ show_full_text (Optional[bool]): Include full text in output (default: False)
3968
+ experimental_stream (Optional[bool]): Enable experimental streaming
3969
+
3970
+ Returns:
3971
+ GenerateLLMsTextResponse containing:
3972
+ * success (bool): Whether job started successfully
3973
+ * id (str): Unique identifier for the job
3974
+ * error (str, optional): Error message if start failed
3975
+
3976
+ Raises:
3977
+ ValueError: If job initiation fails
3978
+ """
3979
+ params = {}
3980
+ if max_urls is not None:
3981
+ params['maxUrls'] = max_urls
3982
+ if show_full_text is not None:
3983
+ params['showFullText'] = show_full_text
3984
+ if experimental_stream is not None:
3985
+ params['__experimental_stream'] = experimental_stream
3986
+
3987
+ params = GenerateLLMsTextParams(
3988
+ maxUrls=max_urls,
3989
+ showFullText=show_full_text,
3990
+ __experimental_stream=experimental_stream
3991
+ )
3992
+
3993
+ headers = self._prepare_headers()
3994
+ json_data = {'url': url, **params.dict(exclude_none=True)}
3995
+ json_data['origin'] = f"python-sdk@{version}"
3996
+
3997
+ try:
3998
+ return await self._async_post_request(
3999
+ f'{self.api_url}/v1/llmstxt',
4000
+ json_data,
4001
+ headers
4002
+ )
4003
+ except Exception as e:
4004
+ raise ValueError(str(e))
4005
+
4006
+ async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
4007
+ """
4008
+ Check the status of an asynchronous LLMs.txt generation job.
4009
+
4010
+ Args:
4011
+ id (str): The ID of the generation job
4012
+
4013
+ Returns:
4014
+ GenerateLLMsTextStatusResponse containing:
4015
+ * success (bool): Whether generation completed successfully
4016
+ * status (str): Status of generation (processing/completed/failed)
4017
+ * data (Dict[str, str], optional): Generated text with fields:
4018
+ - llmstxt (str): Generated LLMs.txt content
4019
+ - llmsfulltxt (str, optional): Full version if requested
4020
+ * error (str, optional): Error message if generation failed
4021
+ * expiresAt (str): When the generated data expires
4022
+
4023
+ Raises:
4024
+ ValueError: If status check fails
4025
+ """
4026
+ headers = self._prepare_headers()
4027
+ try:
4028
+ return await self._async_get_request(
4029
+ f'{self.api_url}/v1/llmstxt/{id}',
4030
+ headers
4031
+ )
4032
+ except Exception as e:
4033
+ raise ValueError(str(e))
4034
+
4035
+ async def deep_research(
4036
+ self,
4037
+ query: str,
4038
+ *,
4039
+ max_depth: Optional[int] = None,
4040
+ time_limit: Optional[int] = None,
4041
+ max_urls: Optional[int] = None,
4042
+ analysis_prompt: Optional[str] = None,
4043
+ system_prompt: Optional[str] = None,
4044
+ __experimental_stream_steps: Optional[bool] = None,
4045
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
4046
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
4047
+ """
4048
+ Initiates a deep research operation on a given query and polls until completion.
4049
+
4050
+ Args:
4051
+ query (str): Research query or topic to investigate
4052
+ max_depth (Optional[int]): Maximum depth of research exploration
4053
+ time_limit (Optional[int]): Time limit in seconds for research
4054
+ max_urls (Optional[int]): Maximum number of URLs to process
4055
+ analysis_prompt (Optional[str]): Custom prompt for analysis
4056
+ system_prompt (Optional[str]): Custom system prompt
4057
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4058
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
4059
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
4060
+
4061
+ Returns:
4062
+ DeepResearchStatusResponse containing:
4063
+ * success (bool): Whether research completed successfully
4064
+ * status (str): Current state (processing/completed/failed)
4065
+ * error (Optional[str]): Error message if failed
4066
+ * id (str): Unique identifier for the research job
4067
+ * data (Any): Research findings and analysis
4068
+ * sources (List[Dict]): List of discovered sources
4069
+ * activities (List[Dict]): Research progress log
4070
+ * summaries (List[str]): Generated research summaries
4071
+
4072
+ Raises:
4073
+ Exception: If research fails
4074
+ """
4075
+ research_params = {}
4076
+ if max_depth is not None:
4077
+ research_params['maxDepth'] = max_depth
4078
+ if time_limit is not None:
4079
+ research_params['timeLimit'] = time_limit
4080
+ if max_urls is not None:
4081
+ research_params['maxUrls'] = max_urls
4082
+ if analysis_prompt is not None:
4083
+ research_params['analysisPrompt'] = analysis_prompt
4084
+ if system_prompt is not None:
4085
+ research_params['systemPrompt'] = system_prompt
4086
+ if __experimental_stream_steps is not None:
4087
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
4088
+ research_params = DeepResearchParams(**research_params)
4089
+
4090
+ response = await self.async_deep_research(
4091
+ query,
4092
+ max_depth=max_depth,
4093
+ time_limit=time_limit,
4094
+ max_urls=max_urls,
4095
+ analysis_prompt=analysis_prompt,
4096
+ system_prompt=system_prompt
4097
+ )
4098
+ if not response.get('success') or 'id' not in response:
4099
+ return response
4100
+
4101
+ job_id = response['id']
4102
+ last_activity_count = 0
4103
+ last_source_count = 0
4104
+
4105
+ while True:
4106
+ status = await self.check_deep_research_status(job_id)
4107
+
4108
+ if on_activity and 'activities' in status:
4109
+ new_activities = status['activities'][last_activity_count:]
4110
+ for activity in new_activities:
4111
+ on_activity(activity)
4112
+ last_activity_count = len(status['activities'])
4113
+
4114
+ if on_source and 'sources' in status:
4115
+ new_sources = status['sources'][last_source_count:]
4116
+ for source in new_sources:
4117
+ on_source(source)
4118
+ last_source_count = len(status['sources'])
4119
+
4120
+ if status['status'] == 'completed':
4121
+ return status
4122
+ elif status['status'] == 'failed':
4123
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
4124
+ elif status['status'] != 'processing':
4125
+ break
4126
+
4127
+ await asyncio.sleep(2)
4128
+
4129
+ return DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')
4130
+
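A sketch of the polling research helper above with a progress callback; the query and limits are illustrative:

    def log_activity(activity):
        # activity carries {type, status, message, timestamp, depth} per the docstring
        print(f"[{activity.get('type')}] {activity.get('message')}")

    async def research_example(app):
        result = await app.deep_research(
            "state of open-source web crawling frameworks",
            max_depth=3,
            time_limit=120,
            max_urls=20,
            on_activity=log_activity,
        )
        print(result.get("data"))  # final analysis, per the docstring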
4131
+ async def async_deep_research(
4132
+ self,
4133
+ query: str,
4134
+ *,
4135
+ max_depth: Optional[int] = None,
4136
+ time_limit: Optional[int] = None,
4137
+ max_urls: Optional[int] = None,
4138
+ analysis_prompt: Optional[str] = None,
4139
+ system_prompt: Optional[str] = None,
4140
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
4141
+ """
4142
+ Initiates an asynchronous deep research operation.
4143
+
4144
+ Args:
4145
+ query (str): Research query or topic to investigate
4146
+ max_depth (Optional[int]): Maximum depth of research exploration
4147
+ time_limit (Optional[int]): Time limit in seconds for research
4148
+ max_urls (Optional[int]): Maximum number of URLs to process
4149
+ analysis_prompt (Optional[str]): Custom prompt for analysis
4150
+ system_prompt (Optional[str]): Custom system prompt
4151
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4152
+
4153
+ Returns:
4154
+ Dict[str, Any]: A response containing:
4155
+ * success (bool): Whether the research initiation was successful
4156
+ * id (str): The unique identifier for the research job
4157
+ * error (str, optional): Error message if initiation failed
4158
+
4159
+ Raises:
4160
+ Exception: If the research initiation fails.
4161
+ """
4162
+ research_params = {}
4163
+ if max_depth is not None:
4164
+ research_params['maxDepth'] = max_depth
4165
+ if time_limit is not None:
4166
+ research_params['timeLimit'] = time_limit
4167
+ if max_urls is not None:
4168
+ research_params['maxUrls'] = max_urls
4169
+ if analysis_prompt is not None:
4170
+ research_params['analysisPrompt'] = analysis_prompt
4171
+ if system_prompt is not None:
4172
+ research_params['systemPrompt'] = system_prompt
4173
+ if __experimental_stream_steps is not None:
4174
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
4175
+ research_params = DeepResearchParams(**research_params)
4176
+
4177
+ headers = self._prepare_headers()
4178
+
4179
+ json_data = {'query': query, **research_params.dict(exclude_none=True)}
4180
+ json_data['origin'] = f"python-sdk@{version}"
4181
+
4182
+ try:
4183
+ return await self._async_post_request(
4184
+ f'{self.api_url}/v1/deep-research',
4185
+ json_data,
4186
+ headers
4187
+ )
4188
+ except Exception as e:
4189
+ raise ValueError(str(e))
4190
+
4191
+ async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
4192
+ """
4193
+ Check the status of a deep research operation.
4194
+
4195
+ Args:
4196
+ id (str): The ID of the deep research operation.
4197
+
4198
+ Returns:
4199
+ DeepResearchResponse containing:
4200
+
4201
+ Status:
4202
+ * success - Whether research completed successfully
4203
+ * status - Current state (processing/completed/failed)
4204
+ * error - Error message if failed
4205
+
4206
+ Results:
4207
+ * id - Unique identifier for the research job
4208
+ * data - Research findings and analysis
4209
+ * sources - List of discovered sources
4210
+ * activities - Research progress log
4211
+ * summaries - Generated research summaries
4212
+
4213
+ Raises:
4214
+ Exception: If the status check fails.
4215
+ """
4216
+ headers = self._prepare_headers()
4217
+ try:
4218
+ return await self._async_get_request(
4219
+ f'{self.api_url}/v1/deep-research/{id}',
4220
+ headers
4221
+ )
4222
+ except Exception as e:
4223
+ raise ValueError(str(e))
4224
+
4225
+ async def search(
4226
+ self,
4227
+ query: str,
4228
+ *,
4229
+ limit: Optional[int] = None,
4230
+ tbs: Optional[str] = None,
4231
+ filter: Optional[str] = None,
4232
+ lang: Optional[str] = None,
4233
+ country: Optional[str] = None,
4234
+ location: Optional[str] = None,
4235
+ timeout: Optional[int] = None,
4236
+ scrape_options: Optional[ScrapeOptions] = None,
4237
+ params: Optional[Union[Dict[str, Any], SearchParams]] = None,
4238
+ **kwargs) -> SearchResponse:
4239
+ """
4240
+ Asynchronously search for content using Firecrawl.
4241
+
4242
+ Args:
4243
+ query (str): Search query string
4244
+ limit (Optional[int]): Max results (default: 5)
4245
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
4246
+ filter (Optional[str]): Custom result filter
4247
+ lang (Optional[str]): Language code (default: "en")
4248
+ country (Optional[str]): Country code (default: "us")
4249
+ location (Optional[str]): Geo-targeting
4250
+ timeout (Optional[int]): Request timeout in milliseconds
4251
+ scrape_options (Optional[ScrapeOptions]): Result scraping configuration
4252
+ params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
4253
+ **kwargs: Additional keyword arguments for future compatibility
4254
+
4255
+ Returns:
4256
+ SearchResponse: Response containing:
4257
+ * success (bool): Whether request succeeded
4258
+ * data (List[FirecrawlDocument]): Search results
4259
+ * warning (Optional[str]): Warning message if any
4260
+ * error (Optional[str]): Error message if any
4261
+
4262
+ Raises:
4263
+ Exception: If search fails or response cannot be parsed
4264
+ """
4265
+ # Build search parameters
4266
+ search_params = {}
4267
+ if params:
4268
+ if isinstance(params, dict):
4269
+ search_params.update(params)
4270
+ else:
4271
+ search_params.update(params.dict(exclude_none=True))
4272
+
4273
+ # Add individual parameters
4274
+ if limit is not None:
4275
+ search_params['limit'] = limit
4276
+ if tbs is not None:
4277
+ search_params['tbs'] = tbs
4278
+ if filter is not None:
4279
+ search_params['filter'] = filter
4280
+ if lang is not None:
4281
+ search_params['lang'] = lang
4282
+ if country is not None:
4283
+ search_params['country'] = country
4284
+ if location is not None:
4285
+ search_params['location'] = location
4286
+ if timeout is not None:
4287
+ search_params['timeout'] = timeout
4288
+ if scrape_options is not None:
4289
+ search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
4290
+
4291
+ # Add any additional kwargs
4292
+ search_params.update(kwargs)
4293
+
4294
+ # Create final params object
4295
+ final_params = SearchParams(query=query, **search_params)
4296
+ params_dict = final_params.dict(exclude_none=True)
4297
+ params_dict['origin'] = f"python-sdk@{version}"
4298
+
4299
+ return await self._async_post_request(
4300
+ f"{self.api_url}/v1/search",
4301
+ params_dict,
4302
+ {"Authorization": f"Bearer {self.api_key}"}
4303
+ )
4304
+
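A sketch of the search helper above. This method returns the parsed API payload directly, so dict access is used; the field names on each result item are assumptions:

    async def search_example(app):
        results = await app.search("firecrawl python sdk", limit=3, lang="en")
        for item in results.get("data", []):
            print(item.get("url"), "-", item.get("title"))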
4305
+ class AsyncCrawlWatcher(CrawlWatcher):
4306
+ """
4307
+ Async version of CrawlWatcher that properly handles async operations.
4308
+ """
4309
+ def __init__(self, id: str, app: AsyncFirecrawlApp):
4310
+ super().__init__(id, app)
4311
+
4312
+ async def connect(self) -> None:
4313
+ """
4314
+ Establishes async WebSocket connection and starts listening for messages.
4315
+ """
4316
+ async with websockets.connect(
4317
+ self.ws_url,
4318
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
4319
+ ) as websocket:
4320
+ await self._listen(websocket)
4321
+
4322
+ async def _listen(self, websocket) -> None:
4323
+ """
4324
+ Listens for incoming WebSocket messages and handles them asynchronously.
4325
+
4326
+ Args:
4327
+ websocket: The WebSocket connection object
4328
+ """
4329
+ async for message in websocket:
4330
+ msg = json.loads(message)
4331
+ await self._handle_message(msg)
4332
+
4333
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
4334
+ """
4335
+ Handles incoming WebSocket messages based on their type asynchronously.
4336
+
4337
+ Args:
4338
+ msg (Dict[str, Any]): The message to handle
4339
+ """
4340
+ if msg['type'] == 'done':
4341
+ self.status = 'completed'
4342
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
4343
+ elif msg['type'] == 'error':
4344
+ self.status = 'failed'
4345
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
4346
+ elif msg['type'] == 'catchup':
4347
+ self.status = msg['data']['status']
4348
+ self.data.extend(msg['data'].get('data', []))
4349
+ for doc in self.data:
4350
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
4351
+ elif msg['type'] == 'document':
4352
+ self.data.append(msg['data'])
4353
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
4354
+
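A rough sketch of driving the AsyncCrawlWatcher above. The constructor and connect() appear in this diff; how event handlers are registered comes from the base CrawlWatcher class, which is outside this diff and therefore assumed:

    async def watch_crawl(app):
        started = await app.async_crawl_url("https://example.com", limit=10)
        watcher = AsyncCrawlWatcher(started.id, app)
        # Handlers for 'document', 'done' and 'error' are fired via dispatch_event();
        # the registration API (e.g. an add_event_listener helper) is assumed here.
        await watcher.connect()  # streams messages until the crawl completes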
4355
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
4356
+ """
4357
+ Handle errors from async API responses.
4358
+ """
4359
+ try:
4360
+ error_data = await response.json()
4361
+ error_message = error_data.get('error', 'No error message provided.')
4362
+ error_details = error_data.get('details', 'No additional error details provided.')
4363
+ except:
4364
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
4365
+
4366
+ # Use the app's method to get the error message
4367
+ message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
4368
+
4369
+ raise aiohttp.ClientError(message)
4370
+
4371
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
4372
+ """
4373
+ Generate a standardized error message based on HTTP status code for async operations.
4374
+
4375
+ Args:
4376
+ status_code (int): The HTTP status code from the response
4377
+ action (str): Description of the action that was being performed
4378
+ error_message (str): The error message from the API response
4379
+ error_details (str): Additional error details from the API response
4380
+
4381
+ Returns:
4382
+ str: A formatted error message
4383
+ """
4384
+ return self._get_error_message(status_code, action, error_message, error_details)