firecrawl-2.1.0-py3-none-any.whl → firecrawl-2.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of firecrawl has been flagged as potentially problematic.
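For context, here is a minimal usage sketch of the API surface added in this file, based on the `FirecrawlApp` class and the `scrape_url`/`crawl_url` signatures shown in the diff below. The API key and target URLs are placeholders, not values from the package.

# Illustrative sketch only; exercises FirecrawlApp as defined in the diff below.
# The API key and URLs are placeholders.
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # falls back to the FIRECRAWL_API_KEY env var

# Scrape a single page as markdown; scrape_url returns a ScrapeResponse
doc = app.scrape_url("https://example.com", formats=["markdown"])
print(doc.markdown)

# Start a crawl and wait for completion; crawl_url polls until the job finishes
status = app.crawl_url("https://example.com", limit=5, poll_interval=2)
print(status.completed, "/", status.total, "pages crawled")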

@@ -0,0 +1,4291 @@
1
+ """
2
+ FirecrawlApp Module
3
+
4
+ This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
5
+ It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
6
+ and check the status of these jobs. The module uses requests for HTTP communication
7
+ and handles retries for certain HTTP status codes.
8
+
9
+ Classes:
10
+ - FirecrawlApp: Main class for interacting with the Firecrawl API.
11
+ """
12
+ import logging
13
+ import os
14
+ import time
15
+ from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
16
+ import json
17
+ from datetime import datetime
18
+ import re
19
+ import warnings
20
+ import requests
21
+ import pydantic
22
+ import websockets
23
+ import aiohttp
24
+ import asyncio
25
+ from pydantic import Field
26
+
27
+ # Suppress Pydantic warnings about attribute shadowing
28
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
29
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
30
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
31
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
32
+
33
+
34
+ def get_version():
35
+ try:
36
+ from pathlib import Path
37
+ package_path = os.path.dirname(__file__)
38
+ version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
39
+ version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
40
+ if version_match:
41
+ return version_match.group(1).strip()
42
+ except Exception:
43
+ print("Failed to get version from __init__.py")
44
+ return None
45
+
46
+ version = get_version()
47
+
48
+ logger : logging.Logger = logging.getLogger("firecrawl")
49
+
50
+ T = TypeVar('T')
51
+
52
+ # class FirecrawlDocumentMetadata(pydantic.BaseModel):
53
+ # """Metadata for a Firecrawl document."""
54
+ # title: Optional[str] = None
55
+ # description: Optional[str] = None
56
+ # language: Optional[str] = None
57
+ # keywords: Optional[str] = None
58
+ # robots: Optional[str] = None
59
+ # ogTitle: Optional[str] = None
60
+ # ogDescription: Optional[str] = None
61
+ # ogUrl: Optional[str] = None
62
+ # ogImage: Optional[str] = None
63
+ # ogAudio: Optional[str] = None
64
+ # ogDeterminer: Optional[str] = None
65
+ # ogLocale: Optional[str] = None
66
+ # ogLocaleAlternate: Optional[List[str]] = None
67
+ # ogSiteName: Optional[str] = None
68
+ # ogVideo: Optional[str] = None
69
+ # dctermsCreated: Optional[str] = None
70
+ # dcDateCreated: Optional[str] = None
71
+ # dcDate: Optional[str] = None
72
+ # dctermsType: Optional[str] = None
73
+ # dcType: Optional[str] = None
74
+ # dctermsAudience: Optional[str] = None
75
+ # dctermsSubject: Optional[str] = None
76
+ # dcSubject: Optional[str] = None
77
+ # dcDescription: Optional[str] = None
78
+ # dctermsKeywords: Optional[str] = None
79
+ # modifiedTime: Optional[str] = None
80
+ # publishedTime: Optional[str] = None
81
+ # articleTag: Optional[str] = None
82
+ # articleSection: Optional[str] = None
83
+ # sourceURL: Optional[str] = None
84
+ # statusCode: Optional[int] = None
85
+ # error: Optional[str] = None
86
+
87
+ class AgentOptions(pydantic.BaseModel):
88
+ """Configuration for the agent."""
89
+ model: Literal["FIRE-1"] = "FIRE-1"
90
+ prompt: Optional[str] = None
91
+
92
+ class AgentOptionsExtract(pydantic.BaseModel):
93
+ """Configuration for the agent in extract operations."""
94
+ model: Literal["FIRE-1"] = "FIRE-1"
95
+
96
+ class ActionsResult(pydantic.BaseModel):
97
+ """Result of actions performed during scraping."""
98
+ screenshots: List[str]
99
+
100
+ class ChangeTrackingData(pydantic.BaseModel):
101
+ """
102
+ Data for the change tracking format.
103
+ """
104
+ previousScrapeAt: Optional[str] = None
105
+ changeStatus: str # "new" | "same" | "changed" | "removed"
106
+ visibility: str # "visible" | "hidden"
107
+ diff: Optional[Dict[str, Any]] = None
108
+ json: Optional[Any] = None
109
+
110
+ class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
111
+ """Document retrieved or processed by Firecrawl."""
112
+ url: Optional[str] = None
113
+ markdown: Optional[str] = None
114
+ html: Optional[str] = None
115
+ rawHtml: Optional[str] = None
116
+ links: Optional[List[str]] = None
117
+ extract: Optional[T] = None
118
+ json: Optional[T] = None
119
+ screenshot: Optional[str] = None
120
+ metadata: Optional[Any] = None
121
+ actions: Optional[ActionsResult] = None
122
+ title: Optional[str] = None # v1 search only
123
+ description: Optional[str] = None # v1 search only
124
+ changeTracking: Optional[ChangeTrackingData] = None
125
+
126
+ class LocationConfig(pydantic.BaseModel):
127
+ """Location configuration for scraping."""
128
+ country: Optional[str] = None
129
+ languages: Optional[List[str]] = None
130
+
131
+ class WebhookConfig(pydantic.BaseModel):
132
+ """Configuration for webhooks."""
133
+ url: str
134
+ headers: Optional[Dict[str, str]] = None
135
+ metadata: Optional[Dict[str, str]] = None
136
+ events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
137
+
138
+ class ScrapeOptions(pydantic.BaseModel):
139
+ """Parameters for scraping operations."""
140
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
141
+ headers: Optional[Dict[str, str]] = None
142
+ includeTags: Optional[List[str]] = None
143
+ excludeTags: Optional[List[str]] = None
144
+ onlyMainContent: Optional[bool] = None
145
+ waitFor: Optional[int] = None
146
+ timeout: Optional[int] = None
147
+ location: Optional[LocationConfig] = None
148
+ mobile: Optional[bool] = None
149
+ skipTlsVerification: Optional[bool] = None
150
+ removeBase64Images: Optional[bool] = None
151
+ blockAds: Optional[bool] = None
152
+ proxy: Optional[Literal["basic", "stealth"]] = None
153
+
154
+ class WaitAction(pydantic.BaseModel):
155
+ """Wait action to perform during scraping."""
156
+ type: Literal["wait"]
157
+ milliseconds: int
158
+ selector: Optional[str] = None
159
+
160
+ class ScreenshotAction(pydantic.BaseModel):
161
+ """Screenshot action to perform during scraping."""
162
+ type: Literal["screenshot"]
163
+ fullPage: Optional[bool] = None
164
+
165
+ class ClickAction(pydantic.BaseModel):
166
+ """Click action to perform during scraping."""
167
+ type: Literal["click"]
168
+ selector: str
169
+
170
+ class WriteAction(pydantic.BaseModel):
171
+ """Write action to perform during scraping."""
172
+ type: Literal["write"]
173
+ text: str
174
+
175
+ class PressAction(pydantic.BaseModel):
176
+ """Press action to perform during scraping."""
177
+ type: Literal["press"]
178
+ key: str
179
+
180
+ class ScrollAction(pydantic.BaseModel):
181
+ """Scroll action to perform during scraping."""
182
+ type: Literal["scroll"]
183
+ direction: Literal["up", "down"]
184
+ selector: Optional[str] = None
185
+
186
+ class ScrapeAction(pydantic.BaseModel):
187
+ """Scrape action to perform during scraping."""
188
+ type: Literal["scrape"]
189
+
190
+ class ExecuteJavascriptAction(pydantic.BaseModel):
191
+ """Execute javascript action to perform during scraping."""
192
+ type: Literal["executeJavascript"]
193
+ script: str
194
+
195
+
196
+ class ExtractAgent(pydantic.BaseModel):
197
+ """Configuration for the agent in extract operations."""
198
+ model: Literal["FIRE-1"] = "FIRE-1"
199
+
200
+ class JsonConfig(pydantic.BaseModel):
201
+ """Configuration for extraction."""
202
+ prompt: Optional[str] = None
203
+ schema: Optional[Any] = None
204
+ systemPrompt: Optional[str] = None
205
+ agent: Optional[ExtractAgent] = None
206
+
207
+ class ScrapeParams(ScrapeOptions):
208
+ """Parameters for scraping operations."""
209
+ extract: Optional[JsonConfig] = None
210
+ jsonOptions: Optional[JsonConfig] = None
211
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
212
+ agent: Optional[AgentOptions] = None
213
+
214
+ class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
215
+ """Response from scraping operations."""
216
+ success: bool = True
217
+ warning: Optional[str] = None
218
+ error: Optional[str] = None
219
+
220
+ class BatchScrapeResponse(pydantic.BaseModel):
221
+ """Response from batch scrape operations."""
222
+ id: Optional[str] = None
223
+ url: Optional[str] = None
224
+ success: bool = True
225
+ error: Optional[str] = None
226
+ invalidURLs: Optional[List[str]] = None
227
+
228
+ class BatchScrapeStatusResponse(pydantic.BaseModel):
229
+ """Response from batch scrape status checks."""
230
+ success: bool = True
231
+ status: Literal["scraping", "completed", "failed", "cancelled"]
232
+ completed: int
233
+ total: int
234
+ creditsUsed: int
235
+ expiresAt: datetime
236
+ next: Optional[str] = None
237
+ data: List[FirecrawlDocument]
238
+
239
+ class CrawlParams(pydantic.BaseModel):
240
+ """Parameters for crawling operations."""
241
+ includePaths: Optional[List[str]] = None
242
+ excludePaths: Optional[List[str]] = None
243
+ maxDepth: Optional[int] = None
244
+ maxDiscoveryDepth: Optional[int] = None
245
+ limit: Optional[int] = None
246
+ allowBackwardLinks: Optional[bool] = None
247
+ allowExternalLinks: Optional[bool] = None
248
+ ignoreSitemap: Optional[bool] = None
249
+ scrapeOptions: Optional[ScrapeOptions] = None
250
+ webhook: Optional[Union[str, WebhookConfig]] = None
251
+ deduplicateSimilarURLs: Optional[bool] = None
252
+ ignoreQueryParameters: Optional[bool] = None
253
+ regexOnFullURL: Optional[bool] = None
254
+
255
+ class CrawlResponse(pydantic.BaseModel):
256
+ """Response from crawling operations."""
257
+ id: Optional[str] = None
258
+ url: Optional[str] = None
259
+ success: bool = True
260
+ error: Optional[str] = None
261
+
262
+ class CrawlStatusResponse(pydantic.BaseModel):
263
+ """Response from crawl status checks."""
264
+ success: bool = True
265
+ status: Literal["scraping", "completed", "failed", "cancelled"]
266
+ completed: int
267
+ total: int
268
+ creditsUsed: int
269
+ expiresAt: datetime
270
+ next: Optional[str] = None
271
+ data: List[FirecrawlDocument]
272
+
273
+ class CrawlErrorsResponse(pydantic.BaseModel):
274
+ """Response from crawl/batch scrape error monitoring."""
275
+ errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str}
276
+ robotsBlocked: List[str]
277
+
278
+ class MapParams(pydantic.BaseModel):
279
+ """Parameters for mapping operations."""
280
+ search: Optional[str] = None
281
+ ignoreSitemap: Optional[bool] = None
282
+ includeSubdomains: Optional[bool] = None
283
+ sitemapOnly: Optional[bool] = None
284
+ limit: Optional[int] = None
285
+ timeout: Optional[int] = None
286
+
287
+ class MapResponse(pydantic.BaseModel):
288
+ """Response from mapping operations."""
289
+ success: bool = True
290
+ links: Optional[List[str]] = None
291
+ error: Optional[str] = None
292
+
293
+ class ExtractParams(pydantic.BaseModel):
294
+ """Parameters for extracting information from URLs."""
295
+ prompt: Optional[str] = None
296
+ schema: Optional[Any] = None
297
+ systemPrompt: Optional[str] = None
298
+ allowExternalLinks: Optional[bool] = None
299
+ enableWebSearch: Optional[bool] = None
300
+ includeSubdomains: Optional[bool] = None
301
+ origin: Optional[str] = None
302
+ showSources: Optional[bool] = None
303
+ scrapeOptions: Optional[ScrapeOptions] = None
304
+
305
+ class ExtractResponse(pydantic.BaseModel, Generic[T]):
306
+ """Response from extract operations."""
307
+ success: bool = True
308
+ data: Optional[T] = None
309
+ error: Optional[str] = None
310
+ warning: Optional[str] = None
311
+ sources: Optional[List[str]] = None
312
+
313
+ class SearchParams(pydantic.BaseModel):
314
+ query: str
315
+ limit: Optional[int] = 5
316
+ tbs: Optional[str] = None
317
+ filter: Optional[str] = None
318
+ lang: Optional[str] = "en"
319
+ country: Optional[str] = "us"
320
+ location: Optional[str] = None
321
+ origin: Optional[str] = "api"
322
+ timeout: Optional[int] = 60000
323
+ scrapeOptions: Optional[ScrapeOptions] = None
324
+
325
+ class SearchResponse(pydantic.BaseModel):
326
+ """Response from search operations."""
327
+ success: bool = True
328
+ data: List[FirecrawlDocument]
329
+ warning: Optional[str] = None
330
+ error: Optional[str] = None
331
+
332
+ class GenerateLLMsTextParams(pydantic.BaseModel):
333
+ """
334
+ Parameters for the LLMs.txt generation operation.
335
+ """
336
+ maxUrls: Optional[int] = 10
337
+ showFullText: Optional[bool] = False
338
+ __experimental_stream: Optional[bool] = None
339
+
340
+ class DeepResearchParams(pydantic.BaseModel):
341
+ """
342
+ Parameters for the deep research operation.
343
+ """
344
+ maxDepth: Optional[int] = 7
345
+ timeLimit: Optional[int] = 270
346
+ maxUrls: Optional[int] = 20
347
+ analysisPrompt: Optional[str] = None
348
+ systemPrompt: Optional[str] = None
349
+ __experimental_streamSteps: Optional[bool] = None
350
+
351
+ class DeepResearchResponse(pydantic.BaseModel):
352
+ """
353
+ Response from the deep research operation.
354
+ """
355
+ success: bool
356
+ id: str
357
+ error: Optional[str] = None
358
+
359
+ class DeepResearchStatusResponse(pydantic.BaseModel):
360
+ """
361
+ Status response from the deep research operation.
362
+ """
363
+ success: bool
364
+ data: Optional[Dict[str, Any]] = None
365
+ status: str
366
+ error: Optional[str] = None
367
+ expiresAt: str
368
+ currentDepth: int
369
+ maxDepth: int
370
+ activities: List[Dict[str, Any]]
371
+ sources: List[Dict[str, Any]]
372
+ summaries: List[str]
373
+
374
+ class GenerateLLMsTextResponse(pydantic.BaseModel):
375
+ """Response from LLMs.txt generation operations."""
376
+ success: bool = True
377
+ id: str
378
+ error: Optional[str] = None
379
+
380
+ class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
381
+ llmstxt: str
382
+ llmsfulltxt: Optional[str] = None
383
+
384
+ class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
385
+ """Status response from LLMs.txt generation operations."""
386
+ success: bool = True
387
+ data: Optional[GenerateLLMsTextStatusResponseData] = None
388
+ status: Literal["processing", "completed", "failed"]
389
+ error: Optional[str] = None
390
+ expiresAt: str
391
+
392
+ class SearchResponse(pydantic.BaseModel):
393
+ """
394
+ Response from the search operation.
395
+ """
396
+ success: bool
397
+ data: List[Dict[str, Any]]
398
+ warning: Optional[str] = None
399
+ error: Optional[str] = None
400
+
401
+ class ExtractParams(pydantic.BaseModel):
402
+ """
403
+ Parameters for the extract operation.
404
+ """
405
+ prompt: Optional[str] = None
406
+ schema: Optional[Any] = pydantic.Field(None, alias='schema')
407
+ system_prompt: Optional[str] = None
408
+ allow_external_links: Optional[bool] = False
409
+ enable_web_search: Optional[bool] = False
410
+ # Just for backwards compatibility
411
+ enableWebSearch: Optional[bool] = False
412
+ show_sources: Optional[bool] = False
413
+ agent: Optional[Dict[str, Any]] = None
414
+
415
+ class ExtractResponse(pydantic.BaseModel, Generic[T]):
416
+ """
417
+ Response from the extract operation.
418
+ """
419
+ success: bool
420
+ data: Optional[T] = None
421
+ error: Optional[str] = None
422
+
423
+ class FirecrawlApp:
424
+ def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
425
+ """
426
+ Initialize the FirecrawlApp instance with API key, API URL.
427
+
428
+ Args:
429
+ api_key (Optional[str]): API key for authenticating with the Firecrawl API.
430
+ api_url (Optional[str]): Base URL for the Firecrawl API.
431
+ """
432
+ self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
433
+ self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
434
+
435
+ # Only require API key when using cloud service
436
+ if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
437
+ logger.warning("No API key provided for cloud service")
438
+ raise ValueError('No API key provided')
439
+
440
+ logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
441
+
442
+ def scrape_url(
443
+ self,
444
+ url: str,
445
+ *,
446
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
447
+ include_tags: Optional[List[str]] = None,
448
+ exclude_tags: Optional[List[str]] = None,
449
+ only_main_content: Optional[bool] = None,
450
+ wait_for: Optional[int] = None,
451
+ timeout: Optional[int] = None,
452
+ location: Optional[LocationConfig] = None,
453
+ mobile: Optional[bool] = None,
454
+ skip_tls_verification: Optional[bool] = None,
455
+ remove_base64_images: Optional[bool] = None,
456
+ block_ads: Optional[bool] = None,
457
+ proxy: Optional[Literal["basic", "stealth"]] = None,
458
+ extract: Optional[JsonConfig] = None,
459
+ json_options: Optional[JsonConfig] = None,
460
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
461
+ **kwargs) -> ScrapeResponse[Any]:
462
+ """
463
+ Scrape and extract content from a URL.
464
+
465
+ Args:
466
+ url (str): Target URL to scrape
467
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
468
+ include_tags (Optional[List[str]]): HTML tags to include
469
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
470
+ only_main_content (Optional[bool]): Extract main content only
471
+ wait_for (Optional[int]): Wait for a specific element to appear
472
+ timeout (Optional[int]): Request timeout (ms)
473
+ location (Optional[LocationConfig]): Location configuration
474
+ mobile (Optional[bool]): Use mobile user agent
475
+ skip_tls_verification (Optional[bool]): Skip TLS verification
476
+ remove_base64_images (Optional[bool]): Remove base64 images
477
+ block_ads (Optional[bool]): Block ads
478
+ proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
479
+ extract (Optional[JsonConfig]): Content extraction settings
480
+ json_options (Optional[JsonConfig]): JSON extraction settings
481
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
482
+
483
+
484
+ Returns:
485
+ ScrapeResponse with:
486
+ * Requested content formats
487
+ * Page metadata
488
+ * Extraction results
489
+ * Success/error status
490
+
491
+ Raises:
492
+ Exception: If scraping fails
493
+ """
494
+ headers = self._prepare_headers()
495
+
496
+ # Build scrape parameters
497
+ scrape_params = {
498
+ 'url': url,
499
+ 'origin': f"python-sdk@{version}"
500
+ }
501
+
502
+ # Add optional parameters if provided
503
+ if formats:
504
+ scrape_params['formats'] = formats
505
+ if include_tags:
506
+ scrape_params['includeTags'] = include_tags
507
+ if exclude_tags:
508
+ scrape_params['excludeTags'] = exclude_tags
509
+ if only_main_content is not None:
510
+ scrape_params['onlyMainContent'] = only_main_content
511
+ if wait_for:
512
+ scrape_params['waitFor'] = wait_for
513
+ if timeout:
514
+ scrape_params['timeout'] = timeout
515
+ if location:
516
+ scrape_params['location'] = location.dict(exclude_none=True)
517
+ if mobile is not None:
518
+ scrape_params['mobile'] = mobile
519
+ if skip_tls_verification is not None:
520
+ scrape_params['skipTlsVerification'] = skip_tls_verification
521
+ if remove_base64_images is not None:
522
+ scrape_params['removeBase64Images'] = remove_base64_images
523
+ if block_ads is not None:
524
+ scrape_params['blockAds'] = block_ads
525
+ if proxy:
526
+ scrape_params['proxy'] = proxy
527
+ if extract:
528
+ if hasattr(extract.schema, 'schema'):
529
+ extract.schema = extract.schema.schema()
530
+ scrape_params['extract'] = extract.dict(exclude_none=True)
531
+ if json_options:
532
+ if hasattr(json_options.schema, 'schema'):
533
+ json_options.schema = json_options.schema.schema()
534
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
535
+ if actions:
536
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
537
+ scrape_params.update(kwargs)
538
+
539
+ # Make request
540
+ response = requests.post(
541
+ f'{self.api_url}/v1/scrape',
542
+ headers=headers,
543
+ json=scrape_params,
544
+ timeout=(timeout + 5000 if timeout else None)
545
+ )
546
+
547
+ if response.status_code == 200:
548
+ try:
549
+ response_json = response.json()
550
+ if response_json.get('success') and 'data' in response_json:
551
+ return ScrapeResponse(**response_json['data'])
552
+ elif "error" in response_json:
553
+ raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
554
+ else:
555
+ raise Exception(f'Failed to scrape URL. Error: {response_json}')
556
+ except ValueError:
557
+ raise Exception('Failed to parse Firecrawl response as JSON.')
558
+ else:
559
+ self._handle_error(response, 'scrape URL')
560
+
561
+ def search(
562
+ self,
563
+ query: str,
564
+ *,
565
+ limit: Optional[int] = None,
566
+ tbs: Optional[str] = None,
567
+ filter: Optional[str] = None,
568
+ lang: Optional[str] = None,
569
+ country: Optional[str] = None,
570
+ location: Optional[str] = None,
571
+ timeout: Optional[int] = None,
572
+ scrape_options: Optional[ScrapeOptions] = None,
573
+ params: Optional[Union[Dict[str, Any], SearchParams]] = None,
574
+ **kwargs) -> SearchResponse:
575
+ """
576
+ Search for content using Firecrawl.
577
+
578
+ Args:
579
+ query (str): Search query string
580
+ limit (Optional[int]): Max results (default: 5)
581
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
582
+ filter (Optional[str]): Custom result filter
583
+ lang (Optional[str]): Language code (default: "en")
584
+ country (Optional[str]): Country code (default: "us")
585
+ location (Optional[str]): Geo-targeting
586
+ timeout (Optional[int]): Request timeout in milliseconds
587
+ scrape_options (Optional[ScrapeOptions]): Result scraping configuration
588
+ params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
589
+ **kwargs: Additional keyword arguments for future compatibility
590
+
591
+ Returns:
592
+ SearchResponse: Response containing:
593
+ * success (bool): Whether request succeeded
594
+ * data (List[FirecrawlDocument]): Search results
595
+ * warning (Optional[str]): Warning message if any
596
+ * error (Optional[str]): Error message if any
597
+
598
+ Raises:
599
+ Exception: If search fails or response cannot be parsed
600
+ """
601
+ # Build search parameters
602
+ search_params = {}
603
+ if params:
604
+ if isinstance(params, dict):
605
+ search_params.update(params)
606
+ else:
607
+ search_params.update(params.dict(exclude_none=True))
608
+
609
+ # Add individual parameters
610
+ if limit is not None:
611
+ search_params['limit'] = limit
612
+ if tbs is not None:
613
+ search_params['tbs'] = tbs
614
+ if filter is not None:
615
+ search_params['filter'] = filter
616
+ if lang is not None:
617
+ search_params['lang'] = lang
618
+ if country is not None:
619
+ search_params['country'] = country
620
+ if location is not None:
621
+ search_params['location'] = location
622
+ if timeout is not None:
623
+ search_params['timeout'] = timeout
624
+ if scrape_options is not None:
625
+ search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
626
+
627
+ # Add any additional kwargs
628
+ search_params.update(kwargs)
629
+
630
+ # Create final params object
631
+ final_params = SearchParams(query=query, **search_params)
632
+ params_dict = final_params.dict(exclude_none=True)
633
+ params_dict['origin'] = f"python-sdk@{version}"
634
+
635
+ # Make request
636
+ response = requests.post(
637
+ f"{self.api_url}/v1/search",
638
+ headers={"Authorization": f"Bearer {self.api_key}"},
639
+ json=params_dict
640
+ )
641
+
642
+ if response.status_code == 200:
643
+ try:
644
+ response_json = response.json()
645
+ if response_json.get('success') and 'data' in response_json:
646
+ return SearchResponse(**response_json)
647
+ elif "error" in response_json:
648
+ raise Exception(f'Search failed. Error: {response_json["error"]}')
649
+ else:
650
+ raise Exception(f'Search failed. Error: {response_json}')
651
+ except ValueError:
652
+ raise Exception('Failed to parse Firecrawl response as JSON.')
653
+ else:
654
+ self._handle_error(response, 'search')
655
+
656
+ def crawl_url(
657
+ self,
658
+ url: str,
659
+ *,
660
+ include_paths: Optional[List[str]] = None,
661
+ exclude_paths: Optional[List[str]] = None,
662
+ max_depth: Optional[int] = None,
663
+ max_discovery_depth: Optional[int] = None,
664
+ limit: Optional[int] = None,
665
+ allow_backward_links: Optional[bool] = None,
666
+ allow_external_links: Optional[bool] = None,
667
+ ignore_sitemap: Optional[bool] = None,
668
+ scrape_options: Optional[ScrapeOptions] = None,
669
+ webhook: Optional[Union[str, WebhookConfig]] = None,
670
+ deduplicate_similar_urls: Optional[bool] = None,
671
+ ignore_query_parameters: Optional[bool] = None,
672
+ regex_on_full_url: Optional[bool] = None,
673
+ poll_interval: Optional[int] = 2,
674
+ idempotency_key: Optional[str] = None,
675
+ **kwargs
676
+ ) -> CrawlStatusResponse:
677
+ """
678
+ Crawl a website starting from a URL.
679
+
680
+ Args:
681
+ url (str): Target URL to start crawling from
682
+ include_paths (Optional[List[str]]): Patterns of URLs to include
683
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
684
+ max_depth (Optional[int]): Maximum crawl depth
685
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
686
+ limit (Optional[int]): Maximum pages to crawl
687
+ allow_backward_links (Optional[bool]): Follow parent directory links
688
+ allow_external_links (Optional[bool]): Follow external domain links
689
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
690
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
691
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
692
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
693
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
694
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
695
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
696
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
697
+ **kwargs: Additional parameters to pass to the API
698
+
699
+ Returns:
700
+ CrawlStatusResponse with:
701
+ * Crawling status and progress
702
+ * Crawled page contents
703
+ * Success/error information
704
+
705
+ Raises:
706
+ Exception: If crawl fails
707
+ """
708
+ crawl_params = {}
709
+
710
+ # Add individual parameters
711
+ if include_paths is not None:
712
+ crawl_params['includePaths'] = include_paths
713
+ if exclude_paths is not None:
714
+ crawl_params['excludePaths'] = exclude_paths
715
+ if max_depth is not None:
716
+ crawl_params['maxDepth'] = max_depth
717
+ if max_discovery_depth is not None:
718
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
719
+ if limit is not None:
720
+ crawl_params['limit'] = limit
721
+ if allow_backward_links is not None:
722
+ crawl_params['allowBackwardLinks'] = allow_backward_links
723
+ if allow_external_links is not None:
724
+ crawl_params['allowExternalLinks'] = allow_external_links
725
+ if ignore_sitemap is not None:
726
+ crawl_params['ignoreSitemap'] = ignore_sitemap
727
+ if scrape_options is not None:
728
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
729
+ if webhook is not None:
730
+ crawl_params['webhook'] = webhook
731
+ if deduplicate_similar_urls is not None:
732
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
733
+ if ignore_query_parameters is not None:
734
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
735
+ if regex_on_full_url is not None:
736
+ crawl_params['regexOnFullURL'] = regex_on_full_url
737
+
738
+ # Add any additional kwargs
739
+ crawl_params.update(kwargs)
740
+
741
+ # Create final params object
742
+ final_params = CrawlParams(**crawl_params)
743
+ params_dict = final_params.dict(exclude_none=True)
744
+ params_dict['url'] = url
745
+ params_dict['origin'] = f"python-sdk@{version}"
746
+
747
+ # Make request
748
+ headers = self._prepare_headers(idempotency_key)
749
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
750
+
751
+ if response.status_code == 200:
752
+ try:
753
+ id = response.json().get('id')
754
+ except:
755
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
756
+ return self._monitor_job_status(id, headers, poll_interval)
757
+ else:
758
+ self._handle_error(response, 'start crawl job')
759
+
760
+ def async_crawl_url(
761
+ self,
762
+ url: str,
763
+ *,
764
+ include_paths: Optional[List[str]] = None,
765
+ exclude_paths: Optional[List[str]] = None,
766
+ max_depth: Optional[int] = None,
767
+ max_discovery_depth: Optional[int] = None,
768
+ limit: Optional[int] = None,
769
+ allow_backward_links: Optional[bool] = None,
770
+ allow_external_links: Optional[bool] = None,
771
+ ignore_sitemap: Optional[bool] = None,
772
+ scrape_options: Optional[ScrapeOptions] = None,
773
+ webhook: Optional[Union[str, WebhookConfig]] = None,
774
+ deduplicate_similar_urls: Optional[bool] = None,
775
+ ignore_query_parameters: Optional[bool] = None,
776
+ regex_on_full_url: Optional[bool] = None,
777
+ idempotency_key: Optional[str] = None,
778
+ **kwargs
779
+ ) -> CrawlResponse:
780
+ """
781
+ Start an asynchronous crawl job.
782
+
783
+ Args:
784
+ url (str): Target URL to start crawling from
785
+ include_paths (Optional[List[str]]): Patterns of URLs to include
786
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
787
+ max_depth (Optional[int]): Maximum crawl depth
788
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
789
+ limit (Optional[int]): Maximum pages to crawl
790
+ allow_backward_links (Optional[bool]): Follow parent directory links
791
+ allow_external_links (Optional[bool]): Follow external domain links
792
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
793
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
794
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
795
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
796
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
797
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
798
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
799
+ **kwargs: Additional parameters to pass to the API
800
+
801
+ Returns:
802
+ CrawlResponse with:
803
+ * success - Whether crawl started successfully
804
+ * id - Unique identifier for the crawl job
805
+ * url - Status check URL for the crawl
806
+ * error - Error message if start failed
807
+
808
+ Raises:
809
+ Exception: If crawl initiation fails
810
+ """
811
+ crawl_params = {}
812
+
813
+ # Add individual parameters
814
+ if include_paths is not None:
815
+ crawl_params['includePaths'] = include_paths
816
+ if exclude_paths is not None:
817
+ crawl_params['excludePaths'] = exclude_paths
818
+ if max_depth is not None:
819
+ crawl_params['maxDepth'] = max_depth
820
+ if max_discovery_depth is not None:
821
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
822
+ if limit is not None:
823
+ crawl_params['limit'] = limit
824
+ if allow_backward_links is not None:
825
+ crawl_params['allowBackwardLinks'] = allow_backward_links
826
+ if allow_external_links is not None:
827
+ crawl_params['allowExternalLinks'] = allow_external_links
828
+ if ignore_sitemap is not None:
829
+ crawl_params['ignoreSitemap'] = ignore_sitemap
830
+ if scrape_options is not None:
831
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
832
+ if webhook is not None:
833
+ crawl_params['webhook'] = webhook
834
+ if deduplicate_similar_urls is not None:
835
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
836
+ if ignore_query_parameters is not None:
837
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
838
+ if regex_on_full_url is not None:
839
+ crawl_params['regexOnFullURL'] = regex_on_full_url
840
+
841
+ # Add any additional kwargs
842
+ crawl_params.update(kwargs)
843
+
844
+ # Create final params object
845
+ final_params = CrawlParams(**crawl_params)
846
+ params_dict = final_params.dict(exclude_none=True)
847
+ params_dict['url'] = url
848
+ params_dict['origin'] = f"python-sdk@{version}"
849
+
850
+ # Make request
851
+ headers = self._prepare_headers(idempotency_key)
852
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
853
+
854
+ if response.status_code == 200:
855
+ try:
856
+ return CrawlResponse(**response.json())
857
+ except:
858
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
859
+ else:
860
+ self._handle_error(response, 'start crawl job')
861
+
862
+ def check_crawl_status(self, id: str) -> CrawlStatusResponse:
863
+ """
864
+ Check the status and results of a crawl job.
865
+
866
+ Args:
867
+ id: Unique identifier for the crawl job
868
+
869
+ Returns:
870
+ CrawlStatusResponse containing:
871
+
872
+ Status Information:
873
+ * status - Current state (scraping/completed/failed/cancelled)
874
+ * completed - Number of pages crawled
875
+ * total - Total pages to crawl
876
+ * creditsUsed - API credits consumed
877
+ * expiresAt - Data expiration timestamp
878
+
879
+ Results:
880
+ * data - List of crawled documents
881
+ * next - URL for next page of results (if paginated)
882
+ * success - Whether status check succeeded
883
+ * error - Error message if failed
884
+
885
+ Raises:
886
+ Exception: If status check fails
887
+ """
888
+ endpoint = f'/v1/crawl/{id}'
889
+
890
+ headers = self._prepare_headers()
891
+ response = self._get_request(f'{self.api_url}{endpoint}', headers)
892
+ if response.status_code == 200:
893
+ try:
894
+ status_data = response.json()
895
+ except:
896
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
897
+ if status_data['status'] == 'completed':
898
+ if 'data' in status_data:
899
+ data = status_data['data']
900
+ while 'next' in status_data:
901
+ if len(status_data['data']) == 0:
902
+ break
903
+ next_url = status_data.get('next')
904
+ if not next_url:
905
+ logger.warning("Expected 'next' URL is missing.")
906
+ break
907
+ try:
908
+ status_response = self._get_request(next_url, headers)
909
+ if status_response.status_code != 200:
910
+ logger.error(f"Failed to fetch next page: {status_response.status_code}")
911
+ break
912
+ try:
913
+ next_data = status_response.json()
914
+ except:
915
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
916
+ data.extend(next_data.get('data', []))
917
+ status_data = next_data
918
+ except Exception as e:
919
+ logger.error(f"Error during pagination request: {e}")
920
+ break
921
+ status_data['data'] = data
922
+
923
+ response = {
924
+ 'status': status_data.get('status'),
925
+ 'total': status_data.get('total'),
926
+ 'completed': status_data.get('completed'),
927
+ 'creditsUsed': status_data.get('creditsUsed'),
928
+ 'expiresAt': status_data.get('expiresAt'),
929
+ 'data': status_data.get('data')
930
+ }
931
+
932
+ if 'error' in status_data:
933
+ response['error'] = status_data['error']
934
+
935
+ if 'next' in status_data:
936
+ response['next'] = status_data['next']
937
+
938
+ return CrawlStatusResponse(
939
+ success=False if 'error' in status_data else True,
940
+ **response
941
+ )
942
+ else:
943
+ self._handle_error(response, 'check crawl status')
944
+
945
+ def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
946
+ """
947
+ Returns information about crawl errors.
948
+
949
+ Args:
950
+ id (str): The ID of the crawl job
951
+
952
+ Returns:
953
+ CrawlErrorsResponse containing:
954
+ * errors (List[Dict[str, str]]): List of errors with fields:
955
+ - id (str): Error ID
956
+ - timestamp (str): When the error occurred
957
+ - url (str): URL that caused the error
958
+ - error (str): Error message
959
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
960
+
961
+ Raises:
962
+ Exception: If error check fails
963
+ """
964
+ headers = self._prepare_headers()
965
+ response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
966
+ if response.status_code == 200:
967
+ try:
968
+ return CrawlErrorsResponse(**response.json())
969
+ except:
970
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
971
+ else:
972
+ self._handle_error(response, "check crawl errors")
973
+
974
+ def cancel_crawl(self, id: str) -> Dict[str, Any]:
975
+ """
976
+ Cancel an asynchronous crawl job.
977
+
978
+ Args:
979
+ id (str): The ID of the crawl job to cancel
980
+
981
+ Returns:
982
+ Dict[str, Any] containing:
983
+ * success (bool): Whether cancellation was successful
984
+ * error (str, optional): Error message if cancellation failed
985
+
986
+ Raises:
987
+ Exception: If cancellation fails
988
+ """
989
+ headers = self._prepare_headers()
990
+ response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
991
+ if response.status_code == 200:
992
+ try:
993
+ return response.json()
994
+ except:
995
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
996
+ else:
997
+ self._handle_error(response, "cancel crawl job")
998
+
999
+ def crawl_url_and_watch(
1000
+ self,
1001
+ url: str,
1002
+ *,
1003
+ include_paths: Optional[List[str]] = None,
1004
+ exclude_paths: Optional[List[str]] = None,
1005
+ max_depth: Optional[int] = None,
1006
+ max_discovery_depth: Optional[int] = None,
1007
+ limit: Optional[int] = None,
1008
+ allow_backward_links: Optional[bool] = None,
1009
+ allow_external_links: Optional[bool] = None,
1010
+ ignore_sitemap: Optional[bool] = None,
1011
+ scrape_options: Optional[ScrapeOptions] = None,
1012
+ webhook: Optional[Union[str, WebhookConfig]] = None,
1013
+ deduplicate_similar_urls: Optional[bool] = None,
1014
+ ignore_query_parameters: Optional[bool] = None,
1015
+ regex_on_full_url: Optional[bool] = None,
1016
+ idempotency_key: Optional[str] = None,
1017
+ **kwargs
1018
+ ) -> 'CrawlWatcher':
1019
+ """
1020
+ Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
1021
+
1022
+ Args:
1023
+ url (str): Target URL to start crawling from
1024
+ include_paths (Optional[List[str]]): Patterns of URLs to include
1025
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
1026
+ max_depth (Optional[int]): Maximum crawl depth
1027
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
1028
+ limit (Optional[int]): Maximum pages to crawl
1029
+ allow_backward_links (Optional[bool]): Follow parent directory links
1030
+ allow_external_links (Optional[bool]): Follow external domain links
1031
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1032
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
1033
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
1034
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
1035
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
1036
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
1037
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1038
+ **kwargs: Additional parameters to pass to the API
1039
+
1040
+ Returns:
1041
+ CrawlWatcher: An instance to monitor the crawl job via WebSocket
1042
+
1043
+ Raises:
1044
+ Exception: If crawl job fails to start
1045
+ """
1046
+ crawl_response = self.async_crawl_url(
1047
+ url,
1048
+ include_paths=include_paths,
1049
+ exclude_paths=exclude_paths,
1050
+ max_depth=max_depth,
1051
+ max_discovery_depth=max_discovery_depth,
1052
+ limit=limit,
1053
+ allow_backward_links=allow_backward_links,
1054
+ allow_external_links=allow_external_links,
1055
+ ignore_sitemap=ignore_sitemap,
1056
+ scrape_options=scrape_options,
1057
+ webhook=webhook,
1058
+ deduplicate_similar_urls=deduplicate_similar_urls,
1059
+ ignore_query_parameters=ignore_query_parameters,
1060
+ regex_on_full_url=regex_on_full_url,
1061
+ idempotency_key=idempotency_key,
1062
+ **kwargs
1063
+ )
1064
+ if crawl_response.success and crawl_response.id:
1065
+ return CrawlWatcher(crawl_response.id, self)
1066
+ else:
1067
+ raise Exception("Crawl job failed to start")
1068
+
1069
+ def map_url(
1070
+ self,
1071
+ url: str,
1072
+ *,
1073
+ search: Optional[str] = None,
1074
+ ignore_sitemap: Optional[bool] = None,
1075
+ include_subdomains: Optional[bool] = None,
1076
+ sitemap_only: Optional[bool] = None,
1077
+ limit: Optional[int] = None,
1078
+ timeout: Optional[int] = None,
1079
+ params: Optional[MapParams] = None) -> MapResponse:
1080
+ """
1081
+ Map and discover links from a URL.
1082
+
1083
+ Args:
1084
+ url (str): Target URL to map
1085
+ search (Optional[str]): Filter pattern for URLs
1086
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1087
+ include_subdomains (Optional[bool]): Include subdomain links
1088
+ sitemap_only (Optional[bool]): Only use sitemap.xml
1089
+ limit (Optional[int]): Maximum URLs to return
1090
+ timeout (Optional[int]): Request timeout in milliseconds
1091
+ params (Optional[MapParams]): Additional mapping parameters
1092
+
1093
+ Returns:
1094
+ MapResponse: Response containing:
1095
+ * success (bool): Whether request succeeded
1096
+ * links (List[str]): Discovered URLs
1097
+ * error (Optional[str]): Error message if any
1098
+
1099
+ Raises:
1100
+ Exception: If mapping fails or response cannot be parsed
1101
+ """
1102
+ # Build map parameters
1103
+ map_params = {}
1104
+ if params:
1105
+ map_params.update(params.dict(exclude_none=True))
1106
+
1107
+ # Add individual parameters
1108
+ if search is not None:
1109
+ map_params['search'] = search
1110
+ if ignore_sitemap is not None:
1111
+ map_params['ignoreSitemap'] = ignore_sitemap
1112
+ if include_subdomains is not None:
1113
+ map_params['includeSubdomains'] = include_subdomains
1114
+ if sitemap_only is not None:
1115
+ map_params['sitemapOnly'] = sitemap_only
1116
+ if limit is not None:
1117
+ map_params['limit'] = limit
1118
+ if timeout is not None:
1119
+ map_params['timeout'] = timeout
1120
+
1121
+ # Create final params object
1122
+ final_params = MapParams(**map_params)
1123
+ params_dict = final_params.dict(exclude_none=True)
1124
+ params_dict['url'] = url
1125
+ params_dict['origin'] = f"python-sdk@{version}"
1126
+
1127
+ # Make request
1128
+ response = requests.post(
1129
+ f"{self.api_url}/v1/map",
1130
+ headers={"Authorization": f"Bearer {self.api_key}"},
1131
+ json=params_dict
1132
+ )
1133
+
1134
+ if response.status_code == 200:
1135
+ try:
1136
+ response_json = response.json()
1137
+ if response_json.get('success') and 'links' in response_json:
1138
+ return MapResponse(**response_json)
1139
+ elif "error" in response_json:
1140
+ raise Exception(f'Map failed. Error: {response_json["error"]}')
1141
+ else:
1142
+ raise Exception(f'Map failed. Error: {response_json}')
1143
+ except ValueError:
1144
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1145
+ else:
1146
+ self._handle_error(response, 'map')
1147
+
1148
+ def batch_scrape_urls(
1149
+ self,
1150
+ urls: List[str],
1151
+ *,
1152
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1153
+ headers: Optional[Dict[str, str]] = None,
1154
+ include_tags: Optional[List[str]] = None,
1155
+ exclude_tags: Optional[List[str]] = None,
1156
+ only_main_content: Optional[bool] = None,
1157
+ wait_for: Optional[int] = None,
1158
+ timeout: Optional[int] = None,
1159
+ location: Optional[LocationConfig] = None,
1160
+ mobile: Optional[bool] = None,
1161
+ skip_tls_verification: Optional[bool] = None,
1162
+ remove_base64_images: Optional[bool] = None,
1163
+ block_ads: Optional[bool] = None,
1164
+ proxy: Optional[Literal["basic", "stealth"]] = None,
1165
+ extract: Optional[JsonConfig] = None,
1166
+ json_options: Optional[JsonConfig] = None,
1167
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1168
+ agent: Optional[AgentOptions] = None,
1169
+ poll_interval: Optional[int] = 2,
1170
+ idempotency_key: Optional[str] = None,
1171
+ **kwargs
1172
+ ) -> BatchScrapeStatusResponse:
1173
+ """
1174
+ Batch scrape multiple URLs and monitor until completion.
1175
+
1176
+ Args:
1177
+ urls (List[str]): URLs to scrape
1178
+ formats (Optional[List[Literal]]): Content formats to retrieve
1179
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1180
+ include_tags (Optional[List[str]]): HTML tags to include
1181
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1182
+ only_main_content (Optional[bool]): Extract main content only
1183
+ wait_for (Optional[int]): Wait time in milliseconds
1184
+ timeout (Optional[int]): Request timeout in milliseconds
1185
+ location (Optional[LocationConfig]): Location configuration
1186
+ mobile (Optional[bool]): Use mobile user agent
1187
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1188
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1189
+ block_ads (Optional[bool]): Block advertisements
1190
+ proxy (Optional[Literal]): Proxy type to use
1191
+ extract (Optional[JsonConfig]): Content extraction config
1192
+ json_options (Optional[JsonConfig]): JSON extraction config
1193
+ actions (Optional[List[Union]]): Actions to perform
1194
+ agent (Optional[AgentOptions]): Agent configuration
1195
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
1196
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1197
+ **kwargs: Additional parameters to pass to the API
1198
+
1199
+ Returns:
1200
+ BatchScrapeStatusResponse with:
1201
+ * Scraping status and progress
1202
+ * Scraped content for each URL
1203
+ * Success/error information
1204
+
1205
+ Raises:
1206
+ Exception: If batch scrape fails
1207
+ """
1208
+ scrape_params = {}
1209
+
1210
+ # Add individual parameters
1211
+ if formats is not None:
1212
+ scrape_params['formats'] = formats
1213
+ if headers is not None:
1214
+ scrape_params['headers'] = headers
1215
+ if include_tags is not None:
1216
+ scrape_params['includeTags'] = include_tags
1217
+ if exclude_tags is not None:
1218
+ scrape_params['excludeTags'] = exclude_tags
1219
+ if only_main_content is not None:
1220
+ scrape_params['onlyMainContent'] = only_main_content
1221
+ if wait_for is not None:
1222
+ scrape_params['waitFor'] = wait_for
1223
+ if timeout is not None:
1224
+ scrape_params['timeout'] = timeout
1225
+ if location is not None:
1226
+ scrape_params['location'] = location.dict(exclude_none=True)
1227
+ if mobile is not None:
1228
+ scrape_params['mobile'] = mobile
1229
+ if skip_tls_verification is not None:
1230
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1231
+ if remove_base64_images is not None:
1232
+ scrape_params['removeBase64Images'] = remove_base64_images
1233
+ if block_ads is not None:
1234
+ scrape_params['blockAds'] = block_ads
1235
+ if proxy is not None:
1236
+ scrape_params['proxy'] = proxy
1237
+ if extract is not None:
1238
+ if hasattr(extract.schema, 'schema'):
1239
+ extract.schema = extract.schema.schema()
1240
+ scrape_params['extract'] = extract.dict(exclude_none=True)
1241
+ if json_options is not None:
1242
+ if hasattr(json_options.schema, 'schema'):
1243
+ json_options.schema = json_options.schema.schema()
1244
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
1245
+ if actions is not None:
1246
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1247
+ if agent is not None:
1248
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1249
+
1250
+ # Add any additional kwargs
1251
+ scrape_params.update(kwargs)
1252
+
1253
+ # Create final params object
1254
+ final_params = ScrapeParams(**scrape_params)
1255
+ params_dict = final_params.dict(exclude_none=True)
1256
+ params_dict['urls'] = urls
1257
+ params_dict['origin'] = f"python-sdk@{version}"
1258
+
1259
+ # Make request
1260
+ headers = self._prepare_headers(idempotency_key)
1261
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1262
+
1263
+ if response.status_code == 200:
1264
+ try:
1265
+ id = response.json().get('id')
1266
+ except:
1267
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1268
+ return self._monitor_job_status(id, headers, poll_interval)
1269
+ else:
1270
+ self._handle_error(response, 'start batch scrape job')
1271
+
1272
+ def async_batch_scrape_urls(
1273
+ self,
1274
+ urls: List[str],
1275
+ *,
1276
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1277
+ headers: Optional[Dict[str, str]] = None,
1278
+ include_tags: Optional[List[str]] = None,
1279
+ exclude_tags: Optional[List[str]] = None,
1280
+ only_main_content: Optional[bool] = None,
1281
+ wait_for: Optional[int] = None,
1282
+ timeout: Optional[int] = None,
1283
+ location: Optional[LocationConfig] = None,
1284
+ mobile: Optional[bool] = None,
1285
+ skip_tls_verification: Optional[bool] = None,
1286
+ remove_base64_images: Optional[bool] = None,
1287
+ block_ads: Optional[bool] = None,
1288
+ proxy: Optional[Literal["basic", "stealth"]] = None,
1289
+ extract: Optional[JsonConfig] = None,
1290
+ json_options: Optional[JsonConfig] = None,
1291
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1292
+ agent: Optional[AgentOptions] = None,
1293
+ idempotency_key: Optional[str] = None,
1294
+ **kwargs
1295
+ ) -> BatchScrapeResponse:
1296
+ """
1297
+ Initiate a batch scrape job asynchronously.
1298
+
1299
+ Args:
1300
+ urls (List[str]): URLs to scrape
1301
+ formats (Optional[List[Literal]]): Content formats to retrieve
1302
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1303
+ include_tags (Optional[List[str]]): HTML tags to include
1304
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1305
+ only_main_content (Optional[bool]): Extract main content only
1306
+ wait_for (Optional[int]): Wait time in milliseconds
1307
+ timeout (Optional[int]): Request timeout in milliseconds
1308
+ location (Optional[LocationConfig]): Location configuration
1309
+ mobile (Optional[bool]): Use mobile user agent
1310
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1311
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1312
+ block_ads (Optional[bool]): Block advertisements
1313
+ proxy (Optional[Literal]): Proxy type to use
1314
+ extract (Optional[JsonConfig]): Content extraction config
1315
+ json_options (Optional[JsonConfig]): JSON extraction config
1316
+ actions (Optional[List[Union]]): Actions to perform
1317
+ agent (Optional[AgentOptions]): Agent configuration
1318
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1319
+ **kwargs: Additional parameters to pass to the API
1320
+
1321
+ Returns:
1322
+ BatchScrapeResponse with:
1323
+ * success - Whether job started successfully
1324
+ * id - Unique identifier for the job
1325
+ * url - Status check URL
1326
+ * error - Error message if start failed
1327
+
1328
+ Raises:
1329
+ Exception: If job initiation fails
1330
+ """
1331
+ scrape_params = {}
1332
+
1333
+ # Add individual parameters
1334
+ if formats is not None:
1335
+ scrape_params['formats'] = formats
1336
+ if headers is not None:
1337
+ scrape_params['headers'] = headers
1338
+ if include_tags is not None:
1339
+ scrape_params['includeTags'] = include_tags
1340
+ if exclude_tags is not None:
1341
+ scrape_params['excludeTags'] = exclude_tags
1342
+ if only_main_content is not None:
1343
+ scrape_params['onlyMainContent'] = only_main_content
1344
+ if wait_for is not None:
1345
+ scrape_params['waitFor'] = wait_for
1346
+ if timeout is not None:
1347
+ scrape_params['timeout'] = timeout
1348
+ if location is not None:
1349
+ scrape_params['location'] = location.dict(exclude_none=True)
1350
+ if mobile is not None:
1351
+ scrape_params['mobile'] = mobile
1352
+ if skip_tls_verification is not None:
1353
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1354
+ if remove_base64_images is not None:
1355
+ scrape_params['removeBase64Images'] = remove_base64_images
1356
+ if block_ads is not None:
1357
+ scrape_params['blockAds'] = block_ads
1358
+ if proxy is not None:
1359
+ scrape_params['proxy'] = proxy
1360
+ if extract is not None:
1361
+ if hasattr(extract.schema, 'schema'):
1362
+ extract.schema = extract.schema.schema()
1363
+ scrape_params['extract'] = extract.dict(exclude_none=True)
1364
+ if json_options is not None:
1365
+ if hasattr(json_options.schema, 'schema'):
1366
+ json_options.schema = json_options.schema.schema()
1367
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
1368
+ if actions is not None:
1369
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1370
+ if agent is not None:
1371
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1372
+
1373
+ # Add any additional kwargs
1374
+ scrape_params.update(kwargs)
1375
+
1376
+ # Create final params object
1377
+ final_params = ScrapeParams(**scrape_params)
1378
+ params_dict = final_params.dict(exclude_none=True)
1379
+ params_dict['urls'] = urls
1380
+ params_dict['origin'] = f"python-sdk@{version}"
1381
+
1382
+ # Make request
1383
+ headers = self._prepare_headers(idempotency_key)
1384
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1385
+
1386
+ if response.status_code == 200:
1387
+ try:
1388
+ return BatchScrapeResponse(**response.json())
1389
+ except:
1390
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1391
+ else:
1392
+ self._handle_error(response, 'start batch scrape job')
1393
+
1394
+ def batch_scrape_urls_and_watch(
1395
+ self,
1396
+ urls: List[str],
1397
+ *,
1398
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1399
+ headers: Optional[Dict[str, str]] = None,
1400
+ include_tags: Optional[List[str]] = None,
1401
+ exclude_tags: Optional[List[str]] = None,
1402
+ only_main_content: Optional[bool] = None,
1403
+ wait_for: Optional[int] = None,
1404
+ timeout: Optional[int] = None,
1405
+ location: Optional[LocationConfig] = None,
1406
+ mobile: Optional[bool] = None,
1407
+ skip_tls_verification: Optional[bool] = None,
1408
+ remove_base64_images: Optional[bool] = None,
1409
+ block_ads: Optional[bool] = None,
1410
+ proxy: Optional[Literal["basic", "stealth"]] = None,
1411
+ extract: Optional[JsonConfig] = None,
1412
+ json_options: Optional[JsonConfig] = None,
1413
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1414
+ agent: Optional[AgentOptions] = None,
1415
+ idempotency_key: Optional[str] = None,
1416
+ **kwargs
1417
+ ) -> 'CrawlWatcher':
1418
+ """
1419
+ Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
1420
+
1421
+ Args:
1422
+ urls (List[str]): URLs to scrape
1423
+ formats (Optional[List[Literal]]): Content formats to retrieve
1424
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1425
+ include_tags (Optional[List[str]]): HTML tags to include
1426
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1427
+ only_main_content (Optional[bool]): Extract main content only
1428
+ wait_for (Optional[int]): Wait time in milliseconds
1429
+ timeout (Optional[int]): Request timeout in milliseconds
1430
+ location (Optional[LocationConfig]): Location configuration
1431
+ mobile (Optional[bool]): Use mobile user agent
1432
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1433
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1434
+ block_ads (Optional[bool]): Block advertisements
1435
+ proxy (Optional[Literal]): Proxy type to use
1436
+ extract (Optional[JsonConfig]): Content extraction config
1437
+ json_options (Optional[JsonConfig]): JSON extraction config
1438
+ actions (Optional[List[Union]]): Actions to perform
1439
+ agent (Optional[AgentOptions]): Agent configuration
1440
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1441
+ **kwargs: Additional parameters to pass to the API
1442
+
1443
+ Returns:
1444
+ CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
1445
+
1446
+ Raises:
1447
+ Exception: If batch scrape job fails to start
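+
+ Example (illustrative sketch, not a guaranteed-working snippet; assumes a
+ valid API key, that FirecrawlApp is exported from the package root, and a
+ reachable Firecrawl API):
+
+ import asyncio
+ from firecrawl import FirecrawlApp
+
+ app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
+ watcher = app.batch_scrape_urls_and_watch(["https://example.com"], formats=["markdown"])
+ watcher.add_event_listener("done", lambda detail: print(detail["status"]))
+ asyncio.run(watcher.connect())  # stream events until the job finishes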
1448
+ """
1449
+ scrape_params = {}
1450
+
1451
+ # Add individual parameters
1452
+ if formats is not None:
1453
+ scrape_params['formats'] = formats
1454
+ if headers is not None:
1455
+ scrape_params['headers'] = headers
1456
+ if include_tags is not None:
1457
+ scrape_params['includeTags'] = include_tags
1458
+ if exclude_tags is not None:
1459
+ scrape_params['excludeTags'] = exclude_tags
1460
+ if only_main_content is not None:
1461
+ scrape_params['onlyMainContent'] = only_main_content
1462
+ if wait_for is not None:
1463
+ scrape_params['waitFor'] = wait_for
1464
+ if timeout is not None:
1465
+ scrape_params['timeout'] = timeout
1466
+ if location is not None:
1467
+ scrape_params['location'] = location.dict(exclude_none=True)
1468
+ if mobile is not None:
1469
+ scrape_params['mobile'] = mobile
1470
+ if skip_tls_verification is not None:
1471
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1472
+ if remove_base64_images is not None:
1473
+ scrape_params['removeBase64Images'] = remove_base64_images
1474
+ if block_ads is not None:
1475
+ scrape_params['blockAds'] = block_ads
1476
+ if proxy is not None:
1477
+ scrape_params['proxy'] = proxy
1478
+ if extract is not None:
1479
+ if hasattr(extract.schema, 'schema'):
1480
+ extract.schema = extract.schema.schema()
1481
+ scrape_params['extract'] = extract.dict(exclude_none=True)
1482
+ if json_options is not None:
1483
+ if hasattr(json_options.schema, 'schema'):
1484
+ json_options.schema = json_options.schema.schema()
1485
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
1486
+ if actions is not None:
1487
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1488
+ if agent is not None:
1489
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1490
+
1491
+ # Add any additional kwargs
1492
+ scrape_params.update(kwargs)
1493
+
1494
+ # Create final params object
1495
+ final_params = ScrapeParams(**scrape_params)
1496
+ params_dict = final_params.dict(exclude_none=True)
1497
+ params_dict['urls'] = urls
1498
+ params_dict['origin'] = f"python-sdk@{version}"
1499
+
1500
+ # Make request
1501
+ headers = self._prepare_headers(idempotency_key)
1502
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1503
+
1504
+ if response.status_code == 200:
1505
+ try:
1506
+ crawl_response = BatchScrapeResponse(**response.json())
1507
+ except:
1508
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1509
+ if crawl_response.success and crawl_response.id:
1510
+ return CrawlWatcher(crawl_response.id, self)
1511
+ else:
1512
+ raise Exception("Batch scrape job failed to start")
1513
+ else:
1514
+ self._handle_error(response, 'start batch scrape job')
1515
+
1516
+ def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
1517
+ """
1518
+ Check the status of a batch scrape job using the Firecrawl API.
1519
+
1520
+ Args:
1521
+ id (str): The ID of the batch scrape job.
1522
+
1523
+ Returns:
1524
+ BatchScrapeStatusResponse: The status of the batch scrape job.
1525
+
1526
+ Raises:
1527
+ Exception: If the status check request fails.
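+
+ Example (illustrative; `app` is an already-constructed FirecrawlApp and the
+ job ID below is a placeholder returned by async_batch_scrape_urls):
+
+ status = app.check_batch_scrape_status("batch-job-id")
+ print(status.status, status.completed, status.total)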
1528
+ """
1529
+ endpoint = f'/v1/batch/scrape/{id}'
1530
+
1531
+ headers = self._prepare_headers()
1532
+ response = self._get_request(f'{self.api_url}{endpoint}', headers)
1533
+ if response.status_code == 200:
1534
+ try:
1535
+ status_data = response.json()
1536
+ except:
1537
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1538
+ if status_data['status'] == 'completed':
1539
+ if 'data' in status_data:
1540
+ data = status_data['data']
1541
+ while 'next' in status_data:
1542
+ if len(status_data['data']) == 0:
1543
+ break
1544
+ next_url = status_data.get('next')
1545
+ if not next_url:
1546
+ logger.warning("Expected 'next' URL is missing.")
1547
+ break
1548
+ try:
1549
+ status_response = self._get_request(next_url, headers)
1550
+ if status_response.status_code != 200:
1551
+ logger.error(f"Failed to fetch next page: {status_response.status_code}")
1552
+ break
1553
+ try:
1554
+ next_data = status_response.json()
1555
+ except:
1556
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1557
+ data.extend(next_data.get('data', []))
1558
+ status_data = next_data
1559
+ except Exception as e:
1560
+ logger.error(f"Error during pagination request: {e}")
1561
+ break
1562
+ status_data['data'] = data
1563
+
1564
+ return BatchScrapeStatusResponse(**{
1565
+ 'success': False if 'error' in status_data else True,
1566
+ 'status': status_data.get('status'),
1567
+ 'total': status_data.get('total'),
1568
+ 'completed': status_data.get('completed'),
1569
+ 'creditsUsed': status_data.get('creditsUsed'),
1570
+ 'expiresAt': status_data.get('expiresAt'),
1571
+ 'data': status_data.get('data'),
1572
+ 'next': status_data.get('next'),
1573
+ 'error': status_data.get('error')
1574
+ })
1575
+ else:
1576
+ self._handle_error(response, 'check batch scrape status')
1577
+
1578
+ def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
1579
+ """
1580
+ Returns information about batch scrape errors.
1581
+
1582
+ Args:
1583
+ id (str): The ID of the crawl job.
1584
+
1585
+ Returns:
1586
+ CrawlErrorsResponse: A response containing:
1587
+ * errors (List[Dict[str, str]]): List of errors with fields:
1588
+ * id (str): Error ID
1589
+ * timestamp (str): When the error occurred
1590
+ * url (str): URL that caused the error
1591
+ * error (str): Error message
1592
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1593
+
1594
+ Raises:
1595
+ Exception: If the error check request fails
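+
+ Example (illustrative; the job ID is a placeholder, `app` is assumed to be a
+ configured FirecrawlApp instance, and field names follow the response
+ description above):
+
+ report = app.check_batch_scrape_errors("batch-job-id")
+ print(report.errors)
+ print(report.robotsBlocked)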
1596
+ """
1597
+ headers = self._prepare_headers()
1598
+ response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
1599
+ if response.status_code == 200:
1600
+ try:
1601
+ return CrawlErrorsResponse(**response.json())
1602
+ except:
1603
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1604
+ else:
1605
+ self._handle_error(response, "check batch scrape errors")
1606
+
1607
+ def extract(
1608
+ self,
1609
+ urls: Optional[List[str]] = None,
1610
+ *,
1611
+ prompt: Optional[str] = None,
1612
+ schema: Optional[Any] = None,
1613
+ system_prompt: Optional[str] = None,
1614
+ allow_external_links: Optional[bool] = False,
1615
+ enable_web_search: Optional[bool] = False,
1616
+ show_sources: Optional[bool] = False,
1617
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1618
+ """
1619
+ Extract structured information from URLs.
1620
+
1621
+ Args:
1622
+ urls (Optional[List[str]]): URLs to extract from
1623
+ prompt (Optional[str]): Custom extraction prompt
1624
+ schema (Optional[Any]): JSON schema/Pydantic model
1625
+ system_prompt (Optional[str]): System context
1626
+ allow_external_links (Optional[bool]): Follow external links
1627
+ enable_web_search (Optional[bool]): Enable web search
1628
+ show_sources (Optional[bool]): Include source URLs
1629
+ agent (Optional[Dict[str, Any]]): Agent configuration
1630
+
1631
+ Returns:
1632
+ ExtractResponse[Any] with:
1633
+ * success (bool): Whether request succeeded
1634
+ * data (Optional[Any]): Extracted data matching schema
1635
+ * error (Optional[str]): Error message if any
1636
+
1637
+ Raises:
1638
+ ValueError: If prompt/schema missing or extraction fails
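+
+ Example (illustrative sketch; assumes a configured FirecrawlApp instance named
+ `app` and uses a hypothetical Pydantic schema):
+
+ from pydantic import BaseModel
+
+ class Article(BaseModel):
+     title: str
+     summary: str
+
+ result = app.extract(
+     ["https://example.com/post"],
+     prompt="Extract the article title and a one-sentence summary",
+     schema=Article,
+ )
+ if result.success:
+     print(result.data)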
1639
+ """
1640
+ headers = self._prepare_headers()
1641
+
1642
+ if not prompt and not schema:
1643
+ raise ValueError("Either prompt or schema is required")
1644
+
1645
+ if not urls and not prompt:
1646
+ raise ValueError("Either urls or prompt is required")
1647
+
1648
+ if schema:
1649
+ if hasattr(schema, 'model_json_schema'):
1650
+ # Convert Pydantic model to JSON schema
1651
+ schema = schema.model_json_schema()
1652
+ # Otherwise assume it's already a JSON schema dict
1653
+
1654
+ request_data = {
1655
+ 'urls': urls or [],
1656
+ 'allowExternalLinks': allow_external_links,
1657
+ 'enableWebSearch': enable_web_search,
1658
+ 'showSources': show_sources,
1659
+ 'schema': schema,
1660
+ 'origin': f'python-sdk@{get_version()}'
1661
+ }
1662
+
1663
+ # Only add prompt and systemPrompt if they exist
1664
+ if prompt:
1665
+ request_data['prompt'] = prompt
1666
+ if system_prompt:
1667
+ request_data['systemPrompt'] = system_prompt
1668
+
1669
+ if agent:
1670
+ request_data['agent'] = agent
1671
+
1672
+ try:
1673
+ # Send the initial extract request
1674
+ response = self._post_request(
1675
+ f'{self.api_url}/v1/extract',
1676
+ request_data,
1677
+ headers
1678
+ )
1679
+ if response.status_code == 200:
1680
+ try:
1681
+ data = response.json()
1682
+ except:
1683
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1684
+ if data['success']:
1685
+ job_id = data.get('id')
1686
+ if not job_id:
1687
+ raise Exception('Job ID not returned from extract request.')
1688
+
1689
+ # Poll for the extract status
1690
+ while True:
1691
+ status_response = self._get_request(
1692
+ f'{self.api_url}/v1/extract/{job_id}',
1693
+ headers
1694
+ )
1695
+ if status_response.status_code == 200:
1696
+ try:
1697
+ status_data = status_response.json()
1698
+ except:
1699
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1700
+ if status_data['status'] == 'completed':
1701
+ return ExtractResponse(**status_data)
1702
+ elif status_data['status'] in ['failed', 'cancelled']:
1703
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
1704
+ else:
1705
+ self._handle_error(status_response, "extract-status")
1706
+
1707
+ time.sleep(2) # Polling interval
1708
+ else:
1709
+ raise Exception(f'Failed to extract. Error: {data["error"]}')
1710
+ else:
1711
+ self._handle_error(response, "extract")
1712
+ except Exception as e:
1713
+ raise ValueError(str(e), 500)
1714
+
1715
+ return ExtractResponse(success=False, error="Internal server error.")
1716
+
1717
+ def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
1718
+ """
1719
+ Retrieve the status of an extract job.
1720
+
1721
+ Args:
1722
+ job_id (str): The ID of the extract job.
1723
+
1724
+ Returns:
1725
+ ExtractResponse[Any]: The status of the extract job.
1726
+
1727
+ Raises:
1728
+ ValueError: If there is an error retrieving the status.
1729
+ """
1730
+ headers = self._prepare_headers()
1731
+ try:
1732
+ response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
1733
+ if response.status_code == 200:
1734
+ try:
1735
+ return ExtractResponse(**response.json())
1736
+ except:
1737
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1738
+ else:
1739
+ self._handle_error(response, "get extract status")
1740
+ except Exception as e:
1741
+ raise ValueError(str(e), 500)
1742
+
1743
+ def async_extract(
1744
+ self,
1745
+ urls: Optional[List[str]] = None,
1746
+ *,
1747
+ prompt: Optional[str] = None,
1748
+ schema: Optional[Any] = None,
1749
+ system_prompt: Optional[str] = None,
1750
+ allow_external_links: Optional[bool] = False,
1751
+ enable_web_search: Optional[bool] = False,
1752
+ show_sources: Optional[bool] = False,
1753
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1754
+ """
1755
+ Initiate an asynchronous extract job.
1756
+
1757
+ Args:
1758
+ urls (List[str]): URLs to extract information from
1759
+ prompt (Optional[str]): Custom extraction prompt
1760
+ schema (Optional[Any]): JSON schema/Pydantic model
1761
+ system_prompt (Optional[str]): System context
1762
+ allow_external_links (Optional[bool]): Follow external links
1763
+ enable_web_search (Optional[bool]): Enable web search
1764
+ show_sources (Optional[bool]): Include source URLs
1765
+ agent (Optional[Dict[str, Any]]): Agent configuration
1767
+
1768
+ Returns:
1769
+ ExtractResponse[Any] with:
1770
+ * success (bool): Whether request succeeded
1771
+ * data (Optional[Any]): Extracted data matching schema
1772
+ * error (Optional[str]): Error message if any
1773
+
1774
+ Raises:
1775
+ ValueError: If job initiation fails
1776
+ """
1777
+ headers = self._prepare_headers()
1778
+
1780
+ if schema:
1781
+ if hasattr(schema, 'model_json_schema'):
1782
+ # Convert Pydantic model to JSON schema
1783
+ schema = schema.model_json_schema()
1784
+ # Otherwise assume it's already a JSON schema dict
1785
+
1786
+ request_data = {
1787
+ 'urls': urls,
1788
+ 'allowExternalLinks': allow_external_links,
1789
+ 'enableWebSearch': enable_web_search,
1790
+ 'showSources': show_sources,
1791
+ 'schema': schema,
1792
+ 'origin': f'python-sdk@{version}'
1793
+ }
1794
+
1795
+ if prompt:
1796
+ request_data['prompt'] = prompt
1797
+ if system_prompt:
1798
+ request_data['systemPrompt'] = system_prompt
1799
+ if agent:
1800
+ request_data['agent'] = agent
1801
+
1802
+ try:
1803
+ response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
1804
+ if response.status_code == 200:
1805
+ try:
1806
+ return ExtractResponse(**response.json())
1807
+ except:
1808
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1809
+ else:
1810
+ self._handle_error(response, "async extract")
1811
+ except Exception as e:
1812
+ raise ValueError(str(e), 500)
1813
+
1814
+ def generate_llms_text(
1815
+ self,
1816
+ url: str,
1817
+ *,
1818
+ max_urls: Optional[int] = None,
1819
+ show_full_text: Optional[bool] = None,
1820
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
1821
+ """
1822
+ Generate LLMs.txt for a given URL and poll until completion.
1823
+
1824
+ Args:
1825
+ url (str): Target URL to generate LLMs.txt from
1826
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
1827
+ show_full_text (Optional[bool]): Include full text in output (default: False)
1828
+ experimental_stream (Optional[bool]): Enable experimental streaming
1829
+
1830
+ Returns:
1831
+ GenerateLLMsTextStatusResponse with:
1832
+ * Generated LLMs.txt content
1833
+ * Full version if requested
1834
+ * Generation status
1835
+ * Success/error information
1836
+
1837
+ Raises:
1838
+ Exception: If generation fails
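+
+ Example (illustrative; assumes a configured FirecrawlApp instance named `app`):
+
+ result = app.generate_llms_text("https://example.com", max_urls=5, show_full_text=True)
+ if result.success:
+     print(result.status)
+     print(result.data)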
1839
+ """
1840
+ params = GenerateLLMsTextParams(
1841
+ maxUrls=max_urls,
1842
+ showFullText=show_full_text,
1843
+ __experimental_stream=experimental_stream
1844
+ )
1845
+
1846
+ response = self.async_generate_llms_text(
1847
+ url,
1848
+ max_urls=max_urls,
1849
+ show_full_text=show_full_text,
1850
+ experimental_stream=experimental_stream
1851
+ )
1852
+
1853
+ if not response.success or not response.id:
1854
+ return GenerateLLMsTextStatusResponse(
1855
+ success=False,
1856
+ error='Failed to start LLMs.txt generation',
1857
+ status='failed',
1858
+ expiresAt=''
1859
+ )
1860
+
1861
+ job_id = response.id
1862
+ while True:
1863
+ status = self.check_generate_llms_text_status(job_id)
1864
+
1865
+ if status.status == 'completed':
1866
+ return status
1867
+ elif status.status == 'failed':
1868
+ return status
1869
+ elif status.status != 'processing':
1870
+ return GenerateLLMsTextStatusResponse(
1871
+ success=False,
1872
+ error='LLMs.txt generation job terminated unexpectedly',
1873
+ status='failed',
1874
+ expiresAt=''
1875
+ )
1876
+
1877
+ time.sleep(2) # Polling interval
1878
+
1879
+ def async_generate_llms_text(
1880
+ self,
1881
+ url: str,
1882
+ *,
1883
+ max_urls: Optional[int] = None,
1884
+ show_full_text: Optional[bool] = None,
1885
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
1886
+ """
1887
+ Initiate an asynchronous LLMs.txt generation operation.
1888
+
1889
+ Args:
1890
+ url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
1891
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
1892
+ show_full_text (Optional[bool]): Include full text in output (default: False)
1893
+ experimental_stream (Optional[bool]): Enable experimental streaming
1894
+
1895
+ Returns:
1896
+ GenerateLLMsTextResponse: A response containing:
1897
+ * success (bool): Whether the generation initiation was successful
1898
+ * id (str): The unique identifier for the generation job
1899
+ * error (str, optional): Error message if initiation failed
1900
+
1901
+ Raises:
1902
+ Exception: If the generation job initiation fails.
1903
+ """
1904
+ params = GenerateLLMsTextParams(
1905
+ maxUrls=max_urls,
1906
+ showFullText=show_full_text,
1907
+ __experimental_stream=experimental_stream
1908
+ )
1909
+
1910
+ headers = self._prepare_headers()
1911
+ json_data = {'url': url, **params.dict(exclude_none=True)}
1912
+ json_data['origin'] = f"python-sdk@{version}"
1913
+
1914
+ try:
1915
+ req = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
1916
+ response = req.json()
1917
+ print("json_data", json_data)
1918
+ print("response", response)
1919
+ if response.get('success'):
1920
+ try:
1921
+ return GenerateLLMsTextResponse(**response)
1922
+ except:
1923
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1924
+ else:
1925
+ self._handle_error(response, 'start LLMs.txt generation')
1926
+ except Exception as e:
1927
+ raise ValueError(str(e))
1928
+
1929
+ return GenerateLLMsTextResponse(
1930
+ success=False,
1931
+ error='Internal server error'
1932
+ )
1933
+
1934
+ def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
1935
+ """
1936
+ Check the status of an LLMs.txt generation operation.
1937
+
1938
+ Args:
1939
+ id (str): The unique identifier of the LLMs.txt generation job to check status for.
1940
+
1941
+ Returns:
1942
+ GenerateLLMsTextStatusResponse: A response containing:
1943
+ * success (bool): Whether the generation was successful
1944
+ * status (str): Status of generation ("processing", "completed", "failed")
1945
+ * data (Dict[str, str], optional): Generated text with fields:
1946
+ * llmstxt (str): Generated LLMs.txt content
1947
+ * llmsfulltxt (str, optional): Full version if requested
1948
+ * error (str, optional): Error message if generation failed
1949
+ * expiresAt (str): When the generated data expires
1950
+
1951
+ Raises:
1952
+ Exception: If the status check fails.
1953
+ """
1954
+ headers = self._prepare_headers()
1955
+ try:
1956
+ response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers)
1957
+ if response.status_code == 200:
1958
+ try:
1959
+ json_data = response.json()
1960
+ return GenerateLLMsTextStatusResponse(**json_data)
1961
+ except Exception as e:
1962
+ raise Exception(f'Failed to parse Firecrawl response as GenerateLLMsTextStatusResponse: {str(e)}')
1963
+ elif response.status_code == 404:
1964
+ raise Exception('LLMs.txt generation job not found')
1965
+ else:
1966
+ self._handle_error(response, 'check LLMs.txt generation status')
1967
+ except Exception as e:
1968
+ raise ValueError(str(e))
1969
+
1970
+ return GenerateLLMsTextStatusResponse(success=False, error='Internal server error', status='failed', expiresAt='')
1971
+
1972
+ def _prepare_headers(
1973
+ self,
1974
+ idempotency_key: Optional[str] = None) -> Dict[str, str]:
1975
+ """
1976
+ Prepare the headers for API requests.
1977
+
1978
+ Args:
1979
+ idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
1980
+
1981
+ Returns:
1982
+ Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
1983
+ """
1984
+ if idempotency_key:
1985
+ return {
1986
+ 'Content-Type': 'application/json',
1987
+ 'Authorization': f'Bearer {self.api_key}',
1988
+ 'x-idempotency-key': idempotency_key
1989
+ }
1990
+
1991
+ return {
1992
+ 'Content-Type': 'application/json',
1993
+ 'Authorization': f'Bearer {self.api_key}',
1994
+ }
1995
+
1996
+ def _post_request(
1997
+ self,
1998
+ url: str,
1999
+ data: Dict[str, Any],
2000
+ headers: Dict[str, str],
2001
+ retries: int = 3,
2002
+ backoff_factor: float = 0.5) -> requests.Response:
2003
+ """
2004
+ Make a POST request with retries.
2005
+
2006
+ Args:
2007
+ url (str): The URL to send the POST request to.
2008
+ data (Dict[str, Any]): The JSON data to include in the POST request.
2009
+ headers (Dict[str, str]): The headers to include in the POST request.
2010
+ retries (int): Number of retries for the request.
2011
+ backoff_factor (float): Backoff factor for retries.
2012
+
2013
+ Returns:
2014
+ requests.Response: The response from the POST request.
2015
+
2016
+ Raises:
2017
+ requests.RequestException: If the request fails after the specified retries.
2018
+ """
2019
+ for attempt in range(retries):
2020
+ response = requests.post(url, headers=headers, json=data, timeout=((data["timeout"] + 5000) if "timeout" in data else None))
2021
+ if response.status_code == 502:
2022
+ time.sleep(backoff_factor * (2 ** attempt))
2023
+ else:
2024
+ return response
2025
+ return response
2026
+
2027
+ def _get_request(
2028
+ self,
2029
+ url: str,
2030
+ headers: Dict[str, str],
2031
+ retries: int = 3,
2032
+ backoff_factor: float = 0.5) -> requests.Response:
2033
+ """
2034
+ Make a GET request with retries.
2035
+
2036
+ Args:
2037
+ url (str): The URL to send the GET request to.
2038
+ headers (Dict[str, str]): The headers to include in the GET request.
2039
+ retries (int): Number of retries for the request.
2040
+ backoff_factor (float): Backoff factor for retries.
2041
+
2042
+ Returns:
2043
+ requests.Response: The response from the GET request.
2044
+
2045
+ Raises:
2046
+ requests.RequestException: If the request fails after the specified retries.
2047
+ """
2048
+ for attempt in range(retries):
2049
+ response = requests.get(url, headers=headers)
2050
+ if response.status_code == 502:
2051
+ time.sleep(backoff_factor * (2 ** attempt))
2052
+ else:
2053
+ return response
2054
+ return response
2055
+
2056
+ def _delete_request(
2057
+ self,
2058
+ url: str,
2059
+ headers: Dict[str, str],
2060
+ retries: int = 3,
2061
+ backoff_factor: float = 0.5) -> requests.Response:
2062
+ """
2063
+ Make a DELETE request with retries.
2064
+
2065
+ Args:
2066
+ url (str): The URL to send the DELETE request to.
2067
+ headers (Dict[str, str]): The headers to include in the DELETE request.
2068
+ retries (int): Number of retries for the request.
2069
+ backoff_factor (float): Backoff factor for retries.
2070
+
2071
+ Returns:
2072
+ requests.Response: The response from the DELETE request.
2073
+
2074
+ Raises:
2075
+ requests.RequestException: If the request fails after the specified retries.
2076
+ """
2077
+ for attempt in range(retries):
2078
+ response = requests.delete(url, headers=headers)
2079
+ if response.status_code == 502:
2080
+ time.sleep(backoff_factor * (2 ** attempt))
2081
+ else:
2082
+ return response
2083
+ return response
2084
+
2085
+ def _monitor_job_status(
2086
+ self,
2087
+ id: str,
2088
+ headers: Dict[str, str],
2089
+ poll_interval: int) -> CrawlStatusResponse:
2090
+ """
2091
+ Monitor the status of a crawl job until completion.
2092
+
2093
+ Args:
2094
+ id (str): The ID of the crawl job.
2095
+ headers (Dict[str, str]): The headers to include in the status check requests.
2096
+ poll_interval (int): Seconds between status checks.
2097
+
2098
+ Returns:
2099
+ CrawlStatusResponse: The crawl results if the job is completed successfully.
2100
+
2101
+ Raises:
2102
+ Exception: If the job fails or an error occurs during status checks.
2103
+ """
2104
+ while True:
2105
+ api_url = f'{self.api_url}/v1/crawl/{id}'
2106
+
2107
+ status_response = self._get_request(api_url, headers)
2108
+ if status_response.status_code == 200:
2109
+ try:
2110
+ status_data = status_response.json()
2111
+ except:
2112
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
2113
+ if status_data['status'] == 'completed':
2114
+ if 'data' in status_data:
2115
+ data = status_data['data']
2116
+ while 'next' in status_data:
2117
+ if len(status_data['data']) == 0:
2118
+ break
2119
+ status_response = self._get_request(status_data['next'], headers)
2120
+ try:
2121
+ status_data = status_response.json()
2122
+ except:
2123
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
2124
+ data.extend(status_data.get('data', []))
2125
+ status_data['data'] = data
2126
+ return CrawlStatusResponse(**status_data)
2127
+ else:
2128
+ raise Exception('Crawl job completed but no data was returned')
2129
+ elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
2130
+ poll_interval = max(poll_interval, 2)
2131
+ time.sleep(poll_interval) # Wait for the specified interval before checking again
2132
+ else:
2133
+ raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
2134
+ else:
2135
+ self._handle_error(status_response, 'check crawl status')
2136
+
2137
+ def _handle_error(
2138
+ self,
2139
+ response: requests.Response,
2140
+ action: str) -> None:
2141
+ """
2142
+ Handle errors from API responses.
2143
+
2144
+ Args:
2145
+ response (requests.Response): The response object from the API request.
2146
+ action (str): Description of the action that was being performed.
2147
+
2148
+ Raises:
2149
+ Exception: An exception with a message containing the status code and error details from the response.
2150
+ """
2151
+ try:
2152
+ error_message = response.json().get('error', 'No error message provided.')
2153
+ error_details = response.json().get('details', 'No additional error details provided.')
2154
+ except:
2155
+ raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
2156
+
2157
+ message = self._get_error_message(response.status_code, action, error_message, error_details)
2158
+
2159
+ # Raise an HTTPError with the custom message and attach the response
2160
+ raise requests.exceptions.HTTPError(message, response=response)
2161
+
2162
+ def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2163
+ """
2164
+ Generate a standardized error message based on HTTP status code.
2165
+
2166
+ Args:
2167
+ status_code (int): The HTTP status code from the response
2168
+ action (str): Description of the action that was being performed
2169
+ error_message (str): The error message from the API response
2170
+ error_details (str): Additional error details from the API response
2171
+
2172
+ Returns:
2173
+ str: A formatted error message
2174
+ """
2175
+ if status_code == 402:
2176
+ return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
2177
+ elif status_code == 403:
2178
+ message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
2179
+ elif status_code == 408:
2180
+ return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
2181
+ elif status_code == 409:
2182
+ return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
2183
+ elif status_code == 500:
2184
+ return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
2185
+ else:
2186
+ return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
2187
+
2188
+ def deep_research(
2189
+ self,
2190
+ query: str,
2191
+ *,
2192
+ max_depth: Optional[int] = None,
2193
+ time_limit: Optional[int] = None,
2194
+ max_urls: Optional[int] = None,
2195
+ analysis_prompt: Optional[str] = None,
2196
+ system_prompt: Optional[str] = None,
2197
+ __experimental_stream_steps: Optional[bool] = None,
2198
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
2199
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
2200
+ """
2201
+ Initiates a deep research operation on a given query and polls until completion.
2202
+
2203
+ Args:
2204
+ query (str): Research query or topic to investigate
2205
+ max_depth (Optional[int]): Maximum depth of research exploration
2206
+ time_limit (Optional[int]): Time limit in seconds for research
2207
+ max_urls (Optional[int]): Maximum number of URLs to process
2208
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2209
+ system_prompt (Optional[str]): Custom system prompt
2210
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2211
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
2212
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
2213
+
2214
+ Returns:
2215
+ DeepResearchStatusResponse containing:
2216
+ * success (bool): Whether research completed successfully
2217
+ * status (str): Current state (processing/completed/failed)
2218
+ * error (Optional[str]): Error message if failed
2219
+ * id (str): Unique identifier for the research job
2220
+ * data (Any): Research findings and analysis
2221
+ * sources (List[Dict]): List of discovered sources
2222
+ * activities (List[Dict]): Research progress log
2223
+ * summaries (List[str]): Generated research summaries
2224
+
2225
+ Raises:
2226
+ Exception: If research fails
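+
+ Example (illustrative sketch; assumes a configured FirecrawlApp instance named
+ `app`; the callbacks simply log progress and sources as they arrive):
+
+ def on_activity(activity):
+     print(activity.get("type"), activity.get("message"))
+
+ def on_source(source):
+     print("source:", source.get("url"))
+
+ research = app.deep_research(
+     "How do large language models handle long context?",
+     max_depth=2,
+     time_limit=120,
+     on_activity=on_activity,
+     on_source=on_source,
+ )
+ print(research["status"])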
2227
+ """
2228
+ research_params = {}
2229
+ if max_depth is not None:
2230
+ research_params['maxDepth'] = max_depth
2231
+ if time_limit is not None:
2232
+ research_params['timeLimit'] = time_limit
2233
+ if max_urls is not None:
2234
+ research_params['maxUrls'] = max_urls
2235
+ if analysis_prompt is not None:
2236
+ research_params['analysisPrompt'] = analysis_prompt
2237
+ if system_prompt is not None:
2238
+ research_params['systemPrompt'] = system_prompt
2239
+ if __experimental_stream_steps is not None:
2240
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2241
+ research_params = DeepResearchParams(**research_params)
2242
+
2243
+ response = self.async_deep_research(
2244
+ query,
2245
+ max_depth=max_depth,
2246
+ time_limit=time_limit,
2247
+ max_urls=max_urls,
2248
+ analysis_prompt=analysis_prompt,
2249
+ system_prompt=system_prompt
2250
+ )
2251
+ if not response.get('success') or 'id' not in response:
2252
+ return response
2253
+
2254
+ job_id = response['id']
2255
+ last_activity_count = 0
2256
+ last_source_count = 0
2257
+
2258
+ while True:
2259
+ status = self.check_deep_research_status(job_id)
2260
+
2261
+ if on_activity and 'activities' in status:
2262
+ new_activities = status['activities'][last_activity_count:]
2263
+ for activity in new_activities:
2264
+ on_activity(activity)
2265
+ last_activity_count = len(status['activities'])
2266
+
2267
+ if on_source and 'sources' in status:
2268
+ new_sources = status['sources'][last_source_count:]
2269
+ for source in new_sources:
2270
+ on_source(source)
2271
+ last_source_count = len(status['sources'])
2272
+
2273
+ if status['status'] == 'completed':
2274
+ return status
2275
+ elif status['status'] == 'failed':
2276
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
2277
+ elif status['status'] != 'processing':
2278
+ break
2279
+
2280
+ time.sleep(2) # Polling interval
2281
+
2282
+ return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
2283
+
2284
+ def async_deep_research(
2285
+ self,
2286
+ query: str,
2287
+ *,
2288
+ max_depth: Optional[int] = None,
2289
+ time_limit: Optional[int] = None,
2290
+ max_urls: Optional[int] = None,
2291
+ analysis_prompt: Optional[str] = None,
2292
+ system_prompt: Optional[str] = None,
2293
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
2294
+ """
2295
+ Initiates an asynchronous deep research operation.
2296
+
2297
+ Args:
2298
+ query (str): Research query or topic to investigate
2299
+ max_depth (Optional[int]): Maximum depth of research exploration
2300
+ time_limit (Optional[int]): Time limit in seconds for research
2301
+ max_urls (Optional[int]): Maximum number of URLs to process
2302
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2303
+ system_prompt (Optional[str]): Custom system prompt
2304
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2305
+
2306
+ Returns:
2307
+ Dict[str, Any]: A response containing:
2308
+ * success (bool): Whether the research initiation was successful
2309
+ * id (str): The unique identifier for the research job
2310
+ * error (str, optional): Error message if initiation failed
2311
+
2312
+ Raises:
2313
+ Exception: If the research initiation fails.
2314
+ """
2315
+ research_params = {}
2316
+ if max_depth is not None:
2317
+ research_params['maxDepth'] = max_depth
2318
+ if time_limit is not None:
2319
+ research_params['timeLimit'] = time_limit
2320
+ if max_urls is not None:
2321
+ research_params['maxUrls'] = max_urls
2322
+ if analysis_prompt is not None:
2323
+ research_params['analysisPrompt'] = analysis_prompt
2324
+ if system_prompt is not None:
2325
+ research_params['systemPrompt'] = system_prompt
2326
+ if __experimental_stream_steps is not None:
2327
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2328
+ research_params = DeepResearchParams(**research_params)
2329
+
2330
+ headers = self._prepare_headers()
2331
+
2332
+ json_data = {'query': query, **research_params.dict(exclude_none=True)}
2333
+ json_data['origin'] = f"python-sdk@{version}"
2334
+
2335
+ # Handle json options schema if present
2336
+ if 'jsonOptions' in json_data:
2337
+ json_opts = json_data['jsonOptions']
2338
+ if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
2339
+ json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
2340
+
2341
+ try:
2342
+ response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
2343
+ if response.status_code == 200:
2344
+ try:
2345
+ return response.json()
2346
+ except:
2347
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2348
+ else:
2349
+ self._handle_error(response, 'start deep research')
2350
+ except Exception as e:
2351
+ raise ValueError(str(e))
2352
+
2353
+ return {'success': False, 'error': 'Internal server error'}
2354
+
2355
+ def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
2356
+ """
2357
+ Check the status of a deep research operation.
2358
+
2359
+ Args:
2360
+ id (str): The ID of the deep research operation.
2361
+
2362
+ Returns:
2363
+ DeepResearchStatusResponse containing:
2364
+
2365
+ Status:
2366
+ * success - Whether research completed successfully
2367
+ * status - Current state (processing/completed/failed)
2368
+ * error - Error message if failed
2369
+
2370
+ Results:
2371
+ * id - Unique identifier for the research job
2372
+ * data - Research findings and analysis
2373
+ * sources - List of discovered sources
2374
+ * activities - Research progress log
2375
+ * summaries - Generated research summaries
2376
+
2377
+ Raises:
2378
+ Exception: If the status check fails.
2379
+ """
2380
+ headers = self._prepare_headers()
2381
+ try:
2382
+ response = self._get_request(f'{self.api_url}/v1/deep-research/{id}', headers)
2383
+ if response.status_code == 200:
2384
+ try:
2385
+ return response.json()
2386
+ except:
2387
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2388
+ elif response.status_code == 404:
2389
+ raise Exception('Deep research job not found')
2390
+ else:
2391
+ self._handle_error(response, 'check deep research status')
2392
+ except Exception as e:
2393
+ raise ValueError(str(e))
2394
+
2395
+ return {'success': False, 'error': 'Internal server error'}
2396
+
2397
+ class CrawlWatcher:
2398
+ """
2399
+ A class to watch and handle crawl job events via WebSocket connection.
2400
+
2401
+ Attributes:
2402
+ id (str): The ID of the crawl job to watch
2403
+ app (FirecrawlApp): The FirecrawlApp instance
2404
+ data (List[Dict[str, Any]]): List of crawled documents/data
2405
+ status (str): Current status of the crawl job
2406
+ ws_url (str): WebSocket URL for the crawl job
2407
+ event_handlers (dict): Dictionary of event type to list of handler functions
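+
+ Example (illustrative; assumes an already-started crawl job ID and a configured
+ FirecrawlApp instance):
+
+ import asyncio
+
+ watcher = CrawlWatcher(crawl_id, app)
+ watcher.add_event_listener("document", lambda detail: print(detail["data"]))
+ watcher.add_event_listener("done", lambda detail: print("finished:", detail["status"]))
+ asyncio.run(watcher.connect())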
2408
+ """
2409
+ def __init__(self, id: str, app: FirecrawlApp):
2410
+ self.id = id
2411
+ self.app = app
2412
+ self.data: List[Dict[str, Any]] = []
2413
+ self.status = "scraping"
2414
+ self.ws_url = f"{app.api_url.replace('http', 'ws')}/v1/crawl/{id}"
2415
+ self.event_handlers = {
2416
+ 'done': [],
2417
+ 'error': [],
2418
+ 'document': []
2419
+ }
2420
+
2421
+ async def connect(self) -> None:
2422
+ """
2423
+ Establishes WebSocket connection and starts listening for messages.
2424
+ """
2425
+ async with websockets.connect(
2426
+ self.ws_url,
2427
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
2428
+ ) as websocket:
2429
+ await self._listen(websocket)
2430
+
2431
+ async def _listen(self, websocket) -> None:
2432
+ """
2433
+ Listens for incoming WebSocket messages and handles them.
2434
+
2435
+ Args:
2436
+ websocket: The WebSocket connection object
2437
+ """
2438
+ async for message in websocket:
2439
+ msg = json.loads(message)
2440
+ await self._handle_message(msg)
2441
+
2442
+ def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
2443
+ """
2444
+ Adds an event handler function for a specific event type.
2445
+
2446
+ Args:
2447
+ event_type (str): Type of event to listen for ('done', 'error', or 'document')
2448
+ handler (Callable): Function to handle the event
2449
+ """
2450
+ if event_type in self.event_handlers:
2451
+ self.event_handlers[event_type].append(handler)
2452
+
2453
+ def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
2454
+ """
2455
+ Dispatches an event to all registered handlers for that event type.
2456
+
2457
+ Args:
2458
+ event_type (str): Type of event to dispatch
2459
+ detail (Dict[str, Any]): Event details/data to pass to handlers
2460
+ """
2461
+ if event_type in self.event_handlers:
2462
+ for handler in self.event_handlers[event_type]:
2463
+ handler(detail)
2464
+
2465
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
2466
+ """
2467
+ Handles incoming WebSocket messages based on their type.
2468
+
2469
+ Args:
2470
+ msg (Dict[str, Any]): The message to handle
2471
+ """
2472
+ if msg['type'] == 'done':
2473
+ self.status = 'completed'
2474
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
2475
+ elif msg['type'] == 'error':
2476
+ self.status = 'failed'
2477
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
2478
+ elif msg['type'] == 'catchup':
2479
+ self.status = msg['data']['status']
2480
+ self.data.extend(msg['data'].get('data', []))
2481
+ for doc in self.data:
2482
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
2483
+ elif msg['type'] == 'document':
2484
+ self.data.append(msg['data'])
2485
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
2486
+
2487
+ class AsyncFirecrawlApp(FirecrawlApp):
2488
+ """
2489
+ Asynchronous version of FirecrawlApp that implements async methods using aiohttp.
2490
+ Provides non-blocking alternatives to all FirecrawlApp operations.
2491
+ """
2492
+
2493
+ async def _async_request(
2494
+ self,
2495
+ method: str,
2496
+ url: str,
2497
+ headers: Dict[str, str],
2498
+ data: Optional[Dict[str, Any]] = None,
2499
+ retries: int = 3,
2500
+ backoff_factor: float = 0.5) -> Dict[str, Any]:
2501
+ """
2502
+ Generic async request method with exponential backoff retry logic.
2503
+
2504
+ Args:
2505
+ method (str): The HTTP method to use (e.g., "GET" or "POST").
2506
+ url (str): The URL to send the request to.
2507
+ headers (Dict[str, str]): Headers to include in the request.
2508
+ data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
2509
+ retries (int): Maximum number of retry attempts (default: 3).
2510
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2511
+ Delay will be backoff_factor * (2 ** retry_count).
2512
+
2513
+ Returns:
2514
+ Dict[str, Any]: The parsed JSON response from the server.
2515
+
2516
+ Raises:
2517
+ aiohttp.ClientError: If the request fails after all retries.
2518
+ Exception: If max retries are exceeded or other errors occur.
2519
+ """
2520
+ async with aiohttp.ClientSession() as session:
2521
+ for attempt in range(retries):
2522
+ try:
2523
+ async with session.request(
2524
+ method=method, url=url, headers=headers, json=data
2525
+ ) as response:
2526
+ if response.status == 502:
2527
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2528
+ continue
2529
+ if response.status >= 300:
2530
+ await self._handle_error(response, f"make {method} request")
2531
+ return await response.json()
2532
+ except aiohttp.ClientError as e:
2533
+ if attempt == retries - 1:
2534
+ raise e
2535
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2536
+ raise Exception("Max retries exceeded")
2537
+
2538
+ async def _async_post_request(
2539
+ self, url: str, data: Dict[str, Any], headers: Dict[str, str],
2540
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2541
+ """
2542
+ Make an async POST request with exponential backoff retry logic.
2543
+
2544
+ Args:
2545
+ url (str): The URL to send the POST request to.
2546
+ data (Dict[str, Any]): The JSON data to include in the request body.
2547
+ headers (Dict[str, str]): Headers to include in the request.
2548
+ retries (int): Maximum number of retry attempts (default: 3).
2549
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2550
+ Delay will be backoff_factor * (2 ** retry_count).
2551
+
2552
+ Returns:
2553
+ Dict[str, Any]: The parsed JSON response from the server.
2554
+
2555
+ Raises:
2556
+ aiohttp.ClientError: If the request fails after all retries.
2557
+ Exception: If max retries are exceeded or other errors occur.
2558
+ """
2559
+ return await self._async_request("POST", url, headers, data, retries, backoff_factor)
2560
+
2561
+ async def _async_get_request(
2562
+ self, url: str, headers: Dict[str, str],
2563
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2564
+ """
2565
+ Make an async GET request with exponential backoff retry logic.
2566
+
2567
+ Args:
2568
+ url (str): The URL to send the GET request to.
2569
+ headers (Dict[str, str]): Headers to include in the request.
2570
+ retries (int): Maximum number of retry attempts (default: 3).
2571
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2572
+ Delay will be backoff_factor * (2 ** retry_count).
2573
+
2574
+ Returns:
2575
+ Dict[str, Any]: The parsed JSON response from the server.
2576
+
2577
+ Raises:
2578
+ aiohttp.ClientError: If the request fails after all retries.
2579
+ Exception: If max retries are exceeded or other errors occur.
2580
+ """
2581
+ return await self._async_request("GET", url, headers, None, retries, backoff_factor)
2582
+
2583
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
2584
+ """
2585
+ Handle errors from async API responses with detailed error messages.
2586
+
2587
+ Args:
2588
+ response (aiohttp.ClientResponse): The response object from the failed request
2589
+ action (str): Description of the action that was being attempted
2590
+
2591
+ Raises:
2592
+ aiohttp.ClientError: With a detailed error message based on the response status:
2593
+ - 402: Payment Required
2594
+ - 408: Request Timeout
2595
+ - 409: Conflict
2596
+ - 500: Internal Server Error
2597
+ - Other: Unexpected error with status code
2598
+ """
2599
+ try:
2600
+ error_data = await response.json()
2601
+ error_message = error_data.get('error', 'No error message provided.')
2602
+ error_details = error_data.get('details', 'No additional error details provided.')
2603
+ except:
2604
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
2605
+
2606
+ message = await self._get_async_error_message(response.status, action, error_message, error_details)
2607
+
2608
+ raise aiohttp.ClientError(message)
2609
+
2610
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2611
+ """
2612
+ Generate a standardized error message based on HTTP status code for async operations.
2613
+
2614
+ Args:
2615
+ status_code (int): The HTTP status code from the response
2616
+ action (str): Description of the action that was being performed
2617
+ error_message (str): The error message from the API response
2618
+ error_details (str): Additional error details from the API response
2619
+
2620
+ Returns:
2621
+ str: A formatted error message
2622
+ """
2623
+ return self._get_error_message(status_code, action, error_message, error_details)
2624
+
2625
+ async def crawl_url_and_watch(
2626
+ self,
2627
+ url: str,
2628
+ params: Optional[CrawlParams] = None,
2629
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2630
+ """
2631
+ Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket.
2632
+
2633
+ Args:
2634
+ url (str): Target URL to start crawling from
2635
+ params (Optional[CrawlParams]): See CrawlParams model for configuration:
2636
+ URL Discovery:
2637
+ * includePaths - Patterns of URLs to include
2638
+ * excludePaths - Patterns of URLs to exclude
2639
+ * maxDepth - Maximum crawl depth
2640
+ * maxDiscoveryDepth - Maximum depth for finding new URLs
2641
+ * limit - Maximum pages to crawl
2642
+
2643
+ Link Following:
2644
+ * allowBackwardLinks - Follow parent directory links
2645
+ * allowExternalLinks - Follow external domain links
2646
+ * ignoreSitemap - Skip sitemap.xml processing
2647
+
2648
+ Advanced:
2649
+ * scrapeOptions - Page scraping configuration
2650
+ * webhook - Notification webhook settings
2651
+ * deduplicateSimilarURLs - Remove similar URLs
2652
+ * ignoreQueryParameters - Ignore URL parameters
2653
+ * regexOnFullURL - Apply regex to full URLs
2654
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2655
+
2656
+ Returns:
2657
+ AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket
2658
+
2659
+ Raises:
2660
+ Exception: If crawl job fails to start
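+
+ Example (illustrative sketch; assumes a valid API key and that AsyncCrawlWatcher
+ exposes the same add_event_listener/connect interface as CrawlWatcher):
+
+ import asyncio
+
+ async def main():
+     app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
+     watcher = await app.crawl_url_and_watch("https://example.com")
+     watcher.add_event_listener("done", lambda detail: print(detail["status"]))
+     await watcher.connect()
+
+ asyncio.run(main())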
2661
+ """
2662
+ crawl_response = await self.async_crawl_url(url, params, idempotency_key)
2663
+ if crawl_response.get('success') and 'id' in crawl_response:
2664
+ return AsyncCrawlWatcher(crawl_response['id'], self)
2665
+ else:
2666
+ raise Exception("Crawl job failed to start")
2667
+
2668
+ async def batch_scrape_urls_and_watch(
2669
+ self,
2670
+ urls: List[str],
2671
+ params: Optional[ScrapeParams] = None,
2672
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2673
+ """
2674
+ Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress.
2675
+
2676
+ Args:
2677
+ urls (List[str]): List of URLs to scrape
2678
+ params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
2679
+
2680
+ Content Options:
2681
+ * formats - Content formats to retrieve
2682
+ * includeTags - HTML tags to include
2683
+ * excludeTags - HTML tags to exclude
2684
+ * onlyMainContent - Extract main content only
2685
+
2686
+ Request Options:
2687
+ * headers - Custom HTTP headers
2688
+ * timeout - Request timeout (ms)
2689
+ * mobile - Use mobile user agent
2690
+ * proxy - Proxy type
2691
+
2692
+ Extraction Options:
2693
+ * extract - Content extraction config
2694
+ * jsonOptions - JSON extraction config
2695
+ * actions - Actions to perform
2696
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2697
+
2698
+ Returns:
2699
+ AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket
2700
+
2701
+ Raises:
2702
+ Exception: If batch scrape job fails to start
2703
+ """
2704
+ batch_response = await self.async_batch_scrape_urls(urls, params, idempotency_key)
2705
+ if batch_response.get('success') and 'id' in batch_response:
2706
+ return AsyncCrawlWatcher(batch_response['id'], self)
2707
+ else:
2708
+ raise Exception("Batch scrape job failed to start")
2709
+
2710
+ async def scrape_url(
2711
+ self,
2712
+ url: str,
2713
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
2714
+ include_tags: Optional[List[str]] = None,
2715
+ exclude_tags: Optional[List[str]] = None,
2716
+ only_main_content: Optional[bool] = None,
2717
+ wait_for: Optional[int] = None,
2718
+ timeout: Optional[int] = None,
2719
+ location: Optional[LocationConfig] = None,
2720
+ mobile: Optional[bool] = None,
2721
+ skip_tls_verification: Optional[bool] = None,
2722
+ remove_base64_images: Optional[bool] = None,
2723
+ block_ads: Optional[bool] = None,
2724
+ proxy: Optional[Literal["basic", "stealth"]] = None,
2725
+ extract: Optional[JsonConfig] = None,
2726
+ json_options: Optional[JsonConfig] = None,
2727
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None) -> ScrapeResponse[Any]:
2728
+ """
2729
+ Scrape and extract content from a URL asynchronously.
2730
+
2731
+ Args:
2732
+ url (str): Target URL to scrape
2733
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
2734
+ include_tags (Optional[List[str]]): HTML tags to include
2735
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
2736
+ only_main_content (Optional[bool]): Extract main content only
2737
+ wait_for (Optional[int]): Wait time in milliseconds before scraping
2738
+ timeout (Optional[int]): Request timeout (ms)
2739
+ location (Optional[LocationConfig]): Location configuration
2740
+ mobile (Optional[bool]): Use mobile user agent
2741
+ skip_tls_verification (Optional[bool]): Skip TLS verification
2742
+ remove_base64_images (Optional[bool]): Remove base64 images
2743
+ block_ads (Optional[bool]): Block ads
2744
+ proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
2745
+ extract (Optional[JsonConfig]): Content extraction settings
2746
+ json_options (Optional[JsonConfig]): JSON extraction settings
2747
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
2748
+
2749
+ Returns:
2750
+ ScrapeResponse with:
2751
+ * Requested content formats
2752
+ * Page metadata
2753
+ * Extraction results
2754
+ * Success/error status
2755
+
2756
+ Raises:
2757
+ Exception: If scraping fails
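+
+ Example (illustrative; assumes a valid API key and that the markdown format is
+ returned for the page):
+
+ import asyncio
+
+ async def main():
+     app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
+     doc = await app.scrape_url("https://example.com", formats=["markdown"])
+     print(doc.markdown)
+
+ asyncio.run(main())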
2758
+ """
2759
+ headers = self._prepare_headers()
2760
+
2761
+ # Build scrape parameters
2762
+ scrape_params = {
2763
+ 'url': url,
2764
+ 'origin': f"python-sdk@{version}"
2765
+ }
2766
+
2767
+ # Add optional parameters if provided and not None
2768
+ if formats:
2769
+ scrape_params['formats'] = formats
2770
+ if include_tags:
2771
+ scrape_params['includeTags'] = include_tags
2772
+ if exclude_tags:
2773
+ scrape_params['excludeTags'] = exclude_tags
2774
+ if only_main_content is not None:
2775
+ scrape_params['onlyMainContent'] = only_main_content
2776
+ if wait_for:
2777
+ scrape_params['waitFor'] = wait_for
2778
+ if timeout:
2779
+ scrape_params['timeout'] = timeout
2780
+ if location:
2781
+ scrape_params['location'] = location.dict(exclude_none=True)
2782
+ if mobile is not None:
2783
+ scrape_params['mobile'] = mobile
2784
+ if skip_tls_verification is not None:
2785
+ scrape_params['skipTlsVerification'] = skip_tls_verification
2786
+ if remove_base64_images is not None:
2787
+ scrape_params['removeBase64Images'] = remove_base64_images
2788
+ if block_ads is not None:
2789
+ scrape_params['blockAds'] = block_ads
2790
+ if proxy:
2791
+ scrape_params['proxy'] = proxy
2792
+ if extract:
2793
+ extract_dict = extract.dict(exclude_none=True)
2794
+ if 'schema' in extract_dict and hasattr(extract.schema, 'schema'):
2795
+ extract_dict['schema'] = extract.schema.schema() # Ensure pydantic model schema is converted
2796
+ scrape_params['extract'] = extract_dict
2797
+ if json_options:
2798
+ json_options_dict = json_options.dict(exclude_none=True)
2799
+ if 'schema' in json_options_dict and hasattr(json_options.schema, 'schema'):
2800
+ json_options_dict['schema'] = json_options.schema.schema() # Ensure pydantic model schema is converted
2801
+ scrape_params['jsonOptions'] = json_options_dict
2802
+ if actions:
2803
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
2804
+
2805
+ # Make async request
2806
+ endpoint = f'/v1/scrape'
2807
+ response = await self._async_post_request(
2808
+ f'{self.api_url}{endpoint}',
2809
+ scrape_params,
2810
+ headers
2811
+ )
2812
+
2813
+ if response.get('success') and 'data' in response:
2814
+ return ScrapeResponse(**response['data'])
2815
+ elif "error" in response:
2816
+ raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
2817
+ else:
2818
+ # Use the response content directly if possible, otherwise a generic message
2819
+ error_content = response.get('error', str(response))
2820
+ raise Exception(f'Failed to scrape URL. Error: {error_content}')
2821
+
2822
+ async def batch_scrape_urls(
2823
+ self,
2824
+ urls: List[str],
2825
+ *,
2826
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
2827
+ headers: Optional[Dict[str, str]] = None,
2828
+ include_tags: Optional[List[str]] = None,
2829
+ exclude_tags: Optional[List[str]] = None,
2830
+ only_main_content: Optional[bool] = None,
2831
+ wait_for: Optional[int] = None,
2832
+ timeout: Optional[int] = None,
2833
+ location: Optional[LocationConfig] = None,
2834
+ mobile: Optional[bool] = None,
2835
+ skip_tls_verification: Optional[bool] = None,
2836
+ remove_base64_images: Optional[bool] = None,
2837
+ block_ads: Optional[bool] = None,
2838
+ proxy: Optional[Literal["basic", "stealth"]] = None,
2839
+ extract: Optional[JsonConfig] = None,
2840
+ json_options: Optional[JsonConfig] = None,
2841
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2842
+ agent: Optional[AgentOptions] = None,
2843
+ poll_interval: Optional[int] = 2,
2844
+ idempotency_key: Optional[str] = None,
2845
+ **kwargs
2846
+ ) -> BatchScrapeStatusResponse:
2847
+ """
2848
+ Asynchronously scrape multiple URLs and monitor until completion.
2849
+
2850
+ Args:
2851
+ urls (List[str]): URLs to scrape
2852
+ formats (Optional[List[Literal]]): Content formats to retrieve
2853
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
2854
+ include_tags (Optional[List[str]]): HTML tags to include
2855
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
2856
+ only_main_content (Optional[bool]): Extract main content only
2857
+ wait_for (Optional[int]): Wait time in milliseconds
2858
+ timeout (Optional[int]): Request timeout in milliseconds
2859
+ location (Optional[LocationConfig]): Location configuration
2860
+ mobile (Optional[bool]): Use mobile user agent
2861
+ skip_tls_verification (Optional[bool]): Skip TLS verification
2862
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
2863
+ block_ads (Optional[bool]): Block advertisements
2864
+ proxy (Optional[Literal]): Proxy type to use
2865
+ extract (Optional[JsonConfig]): Content extraction config
2866
+ json_options (Optional[JsonConfig]): JSON extraction config
2867
+ actions (Optional[List[Union]]): Actions to perform
2868
+ agent (Optional[AgentOptions]): Agent configuration
2869
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
2870
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2871
+ **kwargs: Additional parameters to pass to the API
2872
+
2873
+ Returns:
2874
+ BatchScrapeStatusResponse with:
2875
+ * Scraping status and progress
2876
+ * Scraped content for each URL
2877
+ * Success/error information
2878
+
2879
+ Raises:
2880
+ Exception: If batch scrape fails
2881
+ """
2882
+ scrape_params = {}
2883
+
2884
+ # Add individual parameters
2885
+ if formats is not None:
2886
+ scrape_params['formats'] = formats
2887
+ if headers is not None:
2888
+ scrape_params['headers'] = headers
2889
+ if include_tags is not None:
2890
+ scrape_params['includeTags'] = include_tags
2891
+ if exclude_tags is not None:
2892
+ scrape_params['excludeTags'] = exclude_tags
2893
+ if only_main_content is not None:
2894
+ scrape_params['onlyMainContent'] = only_main_content
2895
+ if wait_for is not None:
2896
+ scrape_params['waitFor'] = wait_for
2897
+ if timeout is not None:
2898
+ scrape_params['timeout'] = timeout
2899
+ if location is not None:
2900
+ scrape_params['location'] = location.dict(exclude_none=True)
2901
+ if mobile is not None:
2902
+ scrape_params['mobile'] = mobile
2903
+ if skip_tls_verification is not None:
2904
+ scrape_params['skipTlsVerification'] = skip_tls_verification
2905
+ if remove_base64_images is not None:
2906
+ scrape_params['removeBase64Images'] = remove_base64_images
2907
+ if block_ads is not None:
2908
+ scrape_params['blockAds'] = block_ads
2909
+ if proxy is not None:
2910
+ scrape_params['proxy'] = proxy
2911
+ if extract is not None:
2912
+ if hasattr(extract.schema, 'schema'):
2913
+ extract.schema = extract.schema.schema()
2914
+ scrape_params['extract'] = extract.dict(exclude_none=True)
2915
+ if json_options is not None:
2916
+ if hasattr(json_options.schema, 'schema'):
2917
+ json_options.schema = json_options.schema.schema()
2918
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
2919
+ if actions is not None:
2920
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
2921
+ if agent is not None:
2922
+ scrape_params['agent'] = agent.dict(exclude_none=True)
2923
+
2924
+ # Add any additional kwargs
2925
+ scrape_params.update(kwargs)
2926
+
2927
+ # Create final params object
2928
+ final_params = ScrapeParams(**scrape_params)
2929
+ params_dict = final_params.dict(exclude_none=True)
2930
+ params_dict['urls'] = urls
2931
+ params_dict['origin'] = f"python-sdk@{version}"
2932
+
2933
+ # Make request
2934
+ headers = self._prepare_headers(idempotency_key)
2935
+ response = await self._async_post_request(
2936
+ f'{self.api_url}/v1/batch/scrape',
2937
+ params_dict,
2938
+ headers
2939
+ )
2940
+
2941
+ if response.get('success'):
2942
+ id = response.get('id')
2943
+ if not id:
2944
+ raise Exception('Batch scrape job ID missing from Firecrawl response.')
2946
+ return await self._async_monitor_job_status(id, headers, poll_interval)
2947
+ else:
2948
+ self._handle_error(response, 'start batch scrape job')
2949
+
2950
+
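# --- Editor's illustrative sketch (not part of the released package diff) ---
# batch_scrape_urls above starts a batch job and blocks until the monitor loop
# finishes. Note that _async_monitor_job_status (below) returns the raw status
# payload as a parsed JSON dict, so this sketch reads dict keys rather than
# model attributes. The import path and API key are assumptions.
import asyncio
from firecrawl import AsyncFirecrawlApp

async def _batch_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    status = await app.batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
        poll_interval=2,
    )
    print(status.get("status"), status.get("completed"), "of", status.get("total"))
    for doc in status.get("data", []):
        print(doc.get("metadata"))

asyncio.run(_batch_example())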
2951
+ async def async_batch_scrape_urls(
2952
+ self,
2953
+ urls: List[str],
2954
+ *,
2955
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
2956
+ headers: Optional[Dict[str, str]] = None,
2957
+ include_tags: Optional[List[str]] = None,
2958
+ exclude_tags: Optional[List[str]] = None,
2959
+ only_main_content: Optional[bool] = None,
2960
+ wait_for: Optional[int] = None,
2961
+ timeout: Optional[int] = None,
2962
+ location: Optional[LocationConfig] = None,
2963
+ mobile: Optional[bool] = None,
2964
+ skip_tls_verification: Optional[bool] = None,
2965
+ remove_base64_images: Optional[bool] = None,
2966
+ block_ads: Optional[bool] = None,
2967
+ proxy: Optional[Literal["basic", "stealth"]] = None,
2968
+ extract: Optional[JsonConfig] = None,
2969
+ json_options: Optional[JsonConfig] = None,
2970
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2971
+ agent: Optional[AgentOptions] = None,
2972
+ idempotency_key: Optional[str] = None,
2973
+ **kwargs
2974
+ ) -> BatchScrapeResponse:
2975
+ """
2976
+ Initiate a batch scrape job asynchronously.
2977
+
2978
+ Args:
2979
+ urls (List[str]): URLs to scrape
2980
+ formats (Optional[List[Literal]]): Content formats to retrieve
2981
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
2982
+ include_tags (Optional[List[str]]): HTML tags to include
2983
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
2984
+ only_main_content (Optional[bool]): Extract main content only
2985
+ wait_for (Optional[int]): Wait time in milliseconds
2986
+ timeout (Optional[int]): Request timeout in milliseconds
2987
+ location (Optional[LocationConfig]): Location configuration
2988
+ mobile (Optional[bool]): Use mobile user agent
2989
+ skip_tls_verification (Optional[bool]): Skip TLS verification
2990
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
2991
+ block_ads (Optional[bool]): Block advertisements
2992
+ proxy (Optional[Literal]): Proxy type to use
2993
+ extract (Optional[JsonConfig]): Content extraction config
2994
+ json_options (Optional[JsonConfig]): JSON extraction config
2995
+ actions (Optional[List[Union]]): Actions to perform
2996
+ agent (Optional[AgentOptions]): Agent configuration
2997
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2998
+ **kwargs: Additional parameters to pass to the API
2999
+
3000
+ Returns:
3001
+ BatchScrapeResponse with:
3002
+ * success - Whether job started successfully
3003
+ * id - Unique identifier for the job
3004
+ * url - Status check URL
3005
+ * error - Error message if start failed
3006
+
3007
+ Raises:
3008
+ Exception: If job initiation fails
3009
+ """
3010
+ scrape_params = {}
3011
+
3012
+ # Add individual parameters
3013
+ if formats is not None:
3014
+ scrape_params['formats'] = formats
3015
+ if headers is not None:
3016
+ scrape_params['headers'] = headers
3017
+ if include_tags is not None:
3018
+ scrape_params['includeTags'] = include_tags
3019
+ if exclude_tags is not None:
3020
+ scrape_params['excludeTags'] = exclude_tags
3021
+ if only_main_content is not None:
3022
+ scrape_params['onlyMainContent'] = only_main_content
3023
+ if wait_for is not None:
3024
+ scrape_params['waitFor'] = wait_for
3025
+ if timeout is not None:
3026
+ scrape_params['timeout'] = timeout
3027
+ if location is not None:
3028
+ scrape_params['location'] = location.dict(exclude_none=True)
3029
+ if mobile is not None:
3030
+ scrape_params['mobile'] = mobile
3031
+ if skip_tls_verification is not None:
3032
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3033
+ if remove_base64_images is not None:
3034
+ scrape_params['removeBase64Images'] = remove_base64_images
3035
+ if block_ads is not None:
3036
+ scrape_params['blockAds'] = block_ads
3037
+ if proxy is not None:
3038
+ scrape_params['proxy'] = proxy
3039
+ if extract is not None:
3040
+ if hasattr(extract.schema, 'schema'):
3041
+ extract.schema = extract.schema.schema()
3042
+ scrape_params['extract'] = extract.dict(exclude_none=True)
3043
+ if json_options is not None:
3044
+ if hasattr(json_options.schema, 'schema'):
3045
+ json_options.schema = json_options.schema.schema()
3046
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
3047
+ if actions is not None:
3048
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3049
+ if agent is not None:
3050
+ scrape_params['agent'] = agent.dict(exclude_none=True)
3051
+
3052
+ # Add any additional kwargs
3053
+ scrape_params.update(kwargs)
3054
+
3055
+ # Create final params object
3056
+ final_params = ScrapeParams(**scrape_params)
3057
+ params_dict = final_params.dict(exclude_none=True)
3058
+ params_dict['urls'] = urls
3059
+ params_dict['origin'] = f"python-sdk@{version}"
3060
+
3061
+ # Make request
3062
+ headers = self._prepare_headers(idempotency_key)
3063
+ response = await self._async_post_request(
3064
+ f'{self.api_url}/v1/batch/scrape',
3065
+ params_dict,
3066
+ headers
3067
+ )
3068
+
3069
+ if response.get('success'):
3070
+ try:
3071
+ return BatchScrapeResponse(**response)
3072
+ except:
3073
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3074
+ else:
3075
+ self._handle_error(response, 'start batch scrape job')
3076
+
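# --- Editor's illustrative sketch (not part of the released package diff) ---
# Fire-and-poll pattern: async_batch_scrape_urls above only starts the job and
# returns a BatchScrapeResponse with the job id; progress is read back with
# check_batch_scrape_status (defined further below). Import path and API key
# are assumptions.
import asyncio
from firecrawl import AsyncFirecrawlApp

async def _async_batch_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    job = await app.async_batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
    )
    while True:
        status = await app.check_batch_scrape_status(job.id)
        if status.status in ("completed", "failed", "cancelled"):
            break
        await asyncio.sleep(2)
    print(f"{status.completed}/{status.total} pages scraped")

asyncio.run(_async_batch_example())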
3077
+ async def crawl_url(
3078
+ self,
3079
+ url: str,
3080
+ *,
3081
+ include_paths: Optional[List[str]] = None,
3082
+ exclude_paths: Optional[List[str]] = None,
3083
+ max_depth: Optional[int] = None,
3084
+ max_discovery_depth: Optional[int] = None,
3085
+ limit: Optional[int] = None,
3086
+ allow_backward_links: Optional[bool] = None,
3087
+ allow_external_links: Optional[bool] = None,
3088
+ ignore_sitemap: Optional[bool] = None,
3089
+ scrape_options: Optional[ScrapeOptions] = None,
3090
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3091
+ deduplicate_similar_urls: Optional[bool] = None,
3092
+ ignore_query_parameters: Optional[bool] = None,
3093
+ regex_on_full_url: Optional[bool] = None,
3094
+ poll_interval: Optional[int] = 2,
3095
+ idempotency_key: Optional[str] = None,
3096
+ **kwargs
3097
+ ) -> CrawlStatusResponse:
3098
+ """
3099
+ Crawl a website starting from a URL and monitor the crawl until completion.
3100
+
3101
+ Args:
3102
+ url (str): Target URL to start crawling from
3103
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3104
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3105
+ max_depth (Optional[int]): Maximum crawl depth
3106
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3107
+ limit (Optional[int]): Maximum pages to crawl
3108
+ allow_backward_links (Optional[bool]): Follow parent directory links
3109
+ allow_external_links (Optional[bool]): Follow external domain links
3110
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3111
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3112
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3113
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3114
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3115
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3116
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
3117
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3118
+ **kwargs: Additional parameters to pass to the API
3119
+
3120
+ Returns:
3121
+ CrawlStatusResponse with:
3122
+ * Crawling status and progress
3123
+ * Crawled page contents
3124
+ * Success/error information
3125
+
3126
+ Raises:
3127
+ Exception: If crawl fails
3128
+ """
3129
+ crawl_params = {}
3130
+
3131
+ # Add individual parameters
3132
+ if include_paths is not None:
3133
+ crawl_params['includePaths'] = include_paths
3134
+ if exclude_paths is not None:
3135
+ crawl_params['excludePaths'] = exclude_paths
3136
+ if max_depth is not None:
3137
+ crawl_params['maxDepth'] = max_depth
3138
+ if max_discovery_depth is not None:
3139
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3140
+ if limit is not None:
3141
+ crawl_params['limit'] = limit
3142
+ if allow_backward_links is not None:
3143
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3144
+ if allow_external_links is not None:
3145
+ crawl_params['allowExternalLinks'] = allow_external_links
3146
+ if ignore_sitemap is not None:
3147
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3148
+ if scrape_options is not None:
3149
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3150
+ if webhook is not None:
3151
+ crawl_params['webhook'] = webhook
3152
+ if deduplicate_similar_urls is not None:
3153
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3154
+ if ignore_query_parameters is not None:
3155
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3156
+ if regex_on_full_url is not None:
3157
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3158
+
3159
+ # Add any additional kwargs
3160
+ crawl_params.update(kwargs)
3161
+
3162
+ # Create final params object
3163
+ final_params = CrawlParams(**crawl_params)
3164
+ params_dict = final_params.dict(exclude_none=True)
3165
+ params_dict['url'] = url
3166
+ params_dict['origin'] = f"python-sdk@{version}"
3167
+ # Make request
3168
+ headers = self._prepare_headers(idempotency_key)
3169
+ response = await self._async_post_request(
3170
+ f'{self.api_url}/v1/crawl', params_dict, headers)
3171
+
3172
+ if response.get('success'):
3173
+ id = response.get('id')
3174
+ if not id:
3175
+ raise Exception('Crawl job ID missing from Firecrawl response.')
3177
+ return await self._async_monitor_job_status(id, headers, poll_interval)
3178
+ else:
3179
+ self._handle_error(response, 'start crawl job')
3180
+
3181
+
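# --- Editor's illustrative sketch (not part of the released package diff) ---
# crawl_url above starts a crawl and waits for the monitor loop, which returns
# the raw status payload (a parsed JSON dict). ScrapeOptions is assumed to be
# exported alongside AsyncFirecrawlApp; the API key is a placeholder.
import asyncio
from firecrawl import AsyncFirecrawlApp, ScrapeOptions

async def _crawl_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    result = await app.crawl_url(
        "https://example.com",
        limit=10,
        include_paths=["blog/.*"],
        scrape_options=ScrapeOptions(formats=["markdown"]),
    )
    print(result.get("status"), len(result.get("data", [])), "pages")

asyncio.run(_crawl_example())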
3182
+ async def async_crawl_url(
3183
+ self,
3184
+ url: str,
3185
+ *,
3186
+ include_paths: Optional[List[str]] = None,
3187
+ exclude_paths: Optional[List[str]] = None,
3188
+ max_depth: Optional[int] = None,
3189
+ max_discovery_depth: Optional[int] = None,
3190
+ limit: Optional[int] = None,
3191
+ allow_backward_links: Optional[bool] = None,
3192
+ allow_external_links: Optional[bool] = None,
3193
+ ignore_sitemap: Optional[bool] = None,
3194
+ scrape_options: Optional[ScrapeOptions] = None,
3195
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3196
+ deduplicate_similar_urls: Optional[bool] = None,
3197
+ ignore_query_parameters: Optional[bool] = None,
3198
+ regex_on_full_url: Optional[bool] = None,
3199
+ poll_interval: Optional[int] = 2,
3200
+ idempotency_key: Optional[str] = None,
3201
+ **kwargs
3202
+ ) -> CrawlResponse:
3203
+ """
3204
+ Start an asynchronous crawl job.
3205
+
3206
+ Args:
3207
+ url (str): Target URL to start crawling from
3208
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3209
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3210
+ max_depth (Optional[int]): Maximum crawl depth
3211
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3212
+ limit (Optional[int]): Maximum pages to crawl
3213
+ allow_backward_links (Optional[bool]): Follow parent directory links
3214
+ allow_external_links (Optional[bool]): Follow external domain links
3215
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3216
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3217
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3218
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3219
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3220
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3221
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3222
+ **kwargs: Additional parameters to pass to the API
3223
+
3224
+ Returns:
3225
+ CrawlResponse with:
3226
+ * success - Whether crawl started successfully
3227
+ * id - Unique identifier for the crawl job
3228
+ * url - Status check URL for the crawl
3229
+ * error - Error message if start failed
3230
+
3231
+ Raises:
3232
+ Exception: If crawl initiation fails
3233
+ """
3234
+ crawl_params = {}
3235
+
3236
+ # Add individual parameters
3237
+ if include_paths is not None:
3238
+ crawl_params['includePaths'] = include_paths
3239
+ if exclude_paths is not None:
3240
+ crawl_params['excludePaths'] = exclude_paths
3241
+ if max_depth is not None:
3242
+ crawl_params['maxDepth'] = max_depth
3243
+ if max_discovery_depth is not None:
3244
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3245
+ if limit is not None:
3246
+ crawl_params['limit'] = limit
3247
+ if allow_backward_links is not None:
3248
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3249
+ if allow_external_links is not None:
3250
+ crawl_params['allowExternalLinks'] = allow_external_links
3251
+ if ignore_sitemap is not None:
3252
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3253
+ if scrape_options is not None:
3254
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3255
+ if webhook is not None:
3256
+ crawl_params['webhook'] = webhook
3257
+ if deduplicate_similar_urls is not None:
3258
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3259
+ if ignore_query_parameters is not None:
3260
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3261
+ if regex_on_full_url is not None:
3262
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3263
+
3264
+ # Add any additional kwargs
3265
+ crawl_params.update(kwargs)
3266
+
3267
+ # Create final params object
3268
+ final_params = CrawlParams(**crawl_params)
3269
+ params_dict = final_params.dict(exclude_none=True)
3270
+ params_dict['url'] = url
3271
+ params_dict['origin'] = f"python-sdk@{version}"
3272
+
3273
+ # Make request
3274
+ headers = self._prepare_headers(idempotency_key)
3275
+ response = await self._async_post_request(
3276
+ f'{self.api_url}/v1/crawl',
3277
+ params_dict,
3278
+ headers
3279
+ )
3280
+
3281
+ if response.get('success'):
3282
+ try:
3283
+ return CrawlResponse(**response)
3284
+ except:
3285
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3286
+ else:
3287
+ self._handle_error(response, 'start crawl job')
3288
+
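# --- Editor's illustrative sketch (not part of the released package diff) ---
# Starting a crawl without waiting: async_crawl_url returns a CrawlResponse with
# the job id, and check_crawl_status / cancel_crawl (below) operate on that id.
# Import path and API key are assumptions.
import asyncio
from firecrawl import AsyncFirecrawlApp

async def _async_crawl_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    job = await app.async_crawl_url("https://example.com", limit=25)
    while True:
        status = await app.check_crawl_status(job.id)
        print(f"{status.status}: {status.completed}/{status.total}")
        if status.status in ("completed", "failed", "cancelled"):
            break
        await asyncio.sleep(5)
    # A running crawl can also be stopped early:
    # await app.cancel_crawl(job.id)

asyncio.run(_async_crawl_example())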
3289
+ async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
3290
+ """
3291
+ Check the status and results of an asynchronous crawl job.
3292
+
3293
+ Args:
3294
+ id (str): Unique identifier for the crawl job
3295
+
3296
+ Returns:
3297
+ CrawlStatusResponse containing:
3298
+ Status Information:
3299
+ * status - Current state (scraping/completed/failed/cancelled)
3300
+ * completed - Number of pages crawled
3301
+ * total - Total pages to crawl
3302
+ * creditsUsed - API credits consumed
3303
+ * expiresAt - Data expiration timestamp
3304
+
3305
+ Results:
3306
+ * data - List of crawled documents
3307
+ * next - URL for next page of results (if paginated)
3308
+ * success - Whether status check succeeded
3309
+ * error - Error message if failed
3310
+
3311
+ Raises:
3312
+ Exception: If status check fails
3313
+ """
3314
+ headers = self._prepare_headers()
3315
+ endpoint = f'/v1/crawl/{id}'
3316
+
3317
+ status_data = await self._async_get_request(
3318
+ f'{self.api_url}{endpoint}',
3319
+ headers
3320
+ )
3321
+
3322
+ if status_data.get('status') == 'completed':
3323
+ if 'data' in status_data:
3324
+ data = status_data['data']
3325
+ while 'next' in status_data:
3326
+ if len(status_data['data']) == 0:
3327
+ break
3328
+ next_url = status_data.get('next')
3329
+ if not next_url:
3330
+ logger.warning("Expected 'next' URL is missing.")
3331
+ break
3332
+ next_data = await self._async_get_request(next_url, headers)
3333
+ data.extend(next_data.get('data', []))
3334
+ status_data = next_data
3335
+ status_data['data'] = data
3336
+ # Create CrawlStatusResponse object from status data
3337
+ response = CrawlStatusResponse(
3338
+ status=status_data.get('status'),
3339
+ total=status_data.get('total'),
3340
+ completed=status_data.get('completed'),
3341
+ creditsUsed=status_data.get('creditsUsed'),
3342
+ expiresAt=status_data.get('expiresAt'),
3343
+ data=status_data.get('data'),
3344
+ success=False if 'error' in status_data else True
3345
+ )
3346
+
3347
+ if 'error' in status_data:
3348
+ response.error = status_data.get('error')
3349
+
3350
+ if 'next' in status_data:
3351
+ response.next = status_data.get('next')
3352
+
3353
+ return response
3354
+
3355
+ async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
3356
+ """
3357
+ Monitor the status of an asynchronous job until completion.
3358
+
3359
+ Args:
3360
+ id (str): The ID of the job to monitor
3361
+ headers (Dict[str, str]): Headers to include in status check requests
3362
+ poll_interval (int): Seconds between status checks (default: 2)
3363
+
3364
+ Returns:
3365
+ CrawlStatusResponse: The job results if completed successfully
3366
+
3367
+ Raises:
3368
+ Exception: If the job fails or an error occurs during status checks
3369
+ """
3370
+ while True:
3371
+ status_data = await self._async_get_request(
3372
+ f'{self.api_url}/v1/crawl/{id}',
3373
+ headers
3374
+ )
3375
+
3376
+ if status_data.get('status') == 'completed':
3377
+ if 'data' in status_data:
3378
+ data = status_data['data']
3379
+ while 'next' in status_data:
3380
+ if len(status_data['data']) == 0:
3381
+ break
3382
+ next_url = status_data.get('next')
3383
+ if not next_url:
3384
+ logger.warning("Expected 'next' URL is missing.")
3385
+ break
3386
+ next_data = await self._async_get_request(next_url, headers)
3387
+ data.extend(next_data.get('data', []))
3388
+ status_data = next_data
3389
+ status_data['data'] = data
3390
+ return status_data
3391
+ else:
3392
+ raise Exception('Job completed but no data was returned')
3393
+ elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
3394
+ await asyncio.sleep(max(poll_interval, 2))
3395
+ else:
3396
+ raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
3397
+
3398
+ async def map_url(
3399
+ self,
3400
+ url: str,
3401
+ *,
3402
+ search: Optional[str] = None,
3403
+ ignore_sitemap: Optional[bool] = None,
3404
+ include_subdomains: Optional[bool] = None,
3405
+ sitemap_only: Optional[bool] = None,
3406
+ limit: Optional[int] = None,
3407
+ timeout: Optional[int] = None,
3408
+ params: Optional[MapParams] = None) -> MapResponse:
3409
+ """
3410
+ Asynchronously map and discover links from a URL.
3411
+
3412
+ Args:
3413
+ url (str): Target URL to map
3414
+ params (Optional[MapParams]): See MapParams model:
3415
+ Discovery Options:
3416
+ * search - Filter pattern for URLs
3417
+ * ignoreSitemap - Skip sitemap.xml
3418
+ * includeSubdomains - Include subdomain links
3419
+ * sitemapOnly - Only use sitemap.xml
3420
+
3421
+ Limits:
3422
+ * limit - Max URLs to return
3423
+ * timeout - Request timeout (ms)
3424
+
3425
+ Returns:
3426
+ MapResponse with:
3427
+ * Discovered URLs
3428
+ * Success/error status
3429
+
3430
+ Raises:
3431
+ Exception: If mapping fails
3432
+ """
3433
+ map_params = {}
3434
+ if params:
3435
+ map_params.update(params.dict(exclude_none=True))
3436
+
3437
+ # Add individual parameters
3438
+ if search is not None:
3439
+ map_params['search'] = search
3440
+ if ignore_sitemap is not None:
3441
+ map_params['ignoreSitemap'] = ignore_sitemap
3442
+ if include_subdomains is not None:
3443
+ map_params['includeSubdomains'] = include_subdomains
3444
+ if sitemap_only is not None:
3445
+ map_params['sitemapOnly'] = sitemap_only
3446
+ if limit is not None:
3447
+ map_params['limit'] = limit
3448
+ if timeout is not None:
3449
+ map_params['timeout'] = timeout
3450
+
3451
+ # Create final params object
3452
+ final_params = MapParams(**map_params)
3453
+ params_dict = final_params.dict(exclude_none=True)
3454
+ params_dict['url'] = url
3455
+ params_dict['origin'] = f"python-sdk@{version}"
3456
+
3457
+ # Make request
3458
+ endpoint = f'/v1/map'
3459
+ response = await self._async_post_request(
3460
+ f'{self.api_url}{endpoint}',
3461
+ params_dict,
3462
+ headers={"Authorization": f"Bearer {self.api_key}"}
3463
+ )
3464
+
3465
+ if response.get('success') and 'links' in response:
3466
+ return MapResponse(**response)
3467
+ elif 'error' in response:
3468
+ raise Exception(f'Failed to map URL. Error: {response["error"]}')
3469
+ else:
3470
+ raise Exception(f'Failed to map URL. Error: {response}')
3471
+
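# --- Editor's illustrative sketch (not part of the released package diff) ---
# map_url above returns a MapResponse whose links field holds the discovered
# URLs (the code checks for 'links' in the payload). Import path and API key
# are assumptions.
import asyncio
from firecrawl import AsyncFirecrawlApp

async def _map_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    result = await app.map_url("https://example.com", search="docs", limit=100)
    for link in result.links or []:
        print(link)

asyncio.run(_map_example())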
3472
+ async def extract(
3473
+ self,
3474
+ urls: Optional[List[str]] = None,
3475
+ *,
3476
+ prompt: Optional[str] = None,
3477
+ schema: Optional[Any] = None,
3478
+ system_prompt: Optional[str] = None,
3479
+ allow_external_links: Optional[bool] = False,
3480
+ enable_web_search: Optional[bool] = False,
3481
+ show_sources: Optional[bool] = False,
3482
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3483
+
3484
+ """
3485
+ Asynchronously extract structured information from URLs.
3486
+
3487
+ Args:
3488
+ urls (Optional[List[str]]): URLs to extract from
3489
+ prompt (Optional[str]): Custom extraction prompt
3490
+ schema (Optional[Any]): JSON schema/Pydantic model
3491
+ system_prompt (Optional[str]): System context
3492
+ allow_external_links (Optional[bool]): Follow external links
3493
+ enable_web_search (Optional[bool]): Enable web search
3494
+ show_sources (Optional[bool]): Include source URLs
3495
+ agent (Optional[Dict[str, Any]]): Agent configuration
3496
+
3497
+ Returns:
3498
+ ExtractResponse with:
3499
+ * Structured data matching schema
3500
+ * Source information if requested
3501
+ * Success/error status
3502
+
3503
+ Raises:
3504
+ ValueError: If prompt/schema missing or extraction fails
3505
+ """
3506
+ headers = self._prepare_headers()
3507
+
3508
+ if not prompt and not schema:
3509
+ raise ValueError("Either prompt or schema is required")
3510
+
3511
+ if not urls and not prompt:
3512
+ raise ValueError("Either urls or prompt is required")
3513
+
3514
+ if schema:
3515
+ if hasattr(schema, 'model_json_schema'):
3516
+ # Convert Pydantic model to JSON schema
3517
+ schema = schema.model_json_schema()
3518
+ # Otherwise assume it's already a JSON schema dict
3519
+
3520
+ request_data = {
3521
+ 'urls': urls or [],
3522
+ 'allowExternalLinks': allow_external_links,
3523
+ 'enableWebSearch': enable_web_search,
3524
+ 'showSources': show_sources,
3525
+ 'schema': schema,
3526
+ 'origin': f'python-sdk@{get_version()}'
3527
+ }
3528
+
3529
+ # Only add prompt and systemPrompt if they exist
3530
+ if prompt:
3531
+ request_data['prompt'] = prompt
3532
+ if system_prompt:
3533
+ request_data['systemPrompt'] = system_prompt
3534
+
3535
+ if agent:
3536
+ request_data['agent'] = agent
3537
+
3538
+ response = await self._async_post_request(
3539
+ f'{self.api_url}/v1/extract',
3540
+ request_data,
3541
+ headers
3542
+ )
3543
+
3544
+ if response.get('success'):
3545
+ job_id = response.get('id')
3546
+ if not job_id:
3547
+ raise Exception('Job ID not returned from extract request.')
3548
+
3549
+ while True:
3550
+ status_data = await self._async_get_request(
3551
+ f'{self.api_url}/v1/extract/{job_id}',
3552
+ headers
3553
+ )
3554
+
3555
+ if status_data['status'] == 'completed':
3556
+ return ExtractResponse(**status_data)
3557
+ elif status_data['status'] in ['failed', 'cancelled']:
3558
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data.get("error")}')
3559
+
3560
+ await asyncio.sleep(2)
3561
+ else:
3562
+ raise Exception(f'Failed to extract. Error: {response.get("error")}')
3563
+
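# --- Editor's illustrative sketch (not part of the released package diff) ---
# extract() above accepts either a plain JSON schema dict or a Pydantic model
# class (converted via model_json_schema) and polls the extract job until it
# completes. Import path and API key are assumptions.
import asyncio
from pydantic import BaseModel
from firecrawl import AsyncFirecrawlApp

class ArticleInfo(BaseModel):
    title: str
    summary: str

async def _extract_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    result = await app.extract(
        ["https://example.com/blog/some-post"],
        prompt="Extract the article title and a one-sentence summary.",
        schema=ArticleInfo,
    )
    print(result.data)

asyncio.run(_extract_example())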
3564
+ async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
3565
+ """
3566
+ Check the status of an asynchronous batch scrape job.
3567
+
3568
+ Args:
3569
+ id (str): The ID of the batch scrape job
3570
+
3571
+ Returns:
3572
+ BatchScrapeStatusResponse containing:
3573
+ Status Information:
3574
+ * status - Current state (scraping/completed/failed/cancelled)
3575
+ * completed - Number of URLs scraped
3576
+ * total - Total URLs to scrape
3577
+ * creditsUsed - API credits consumed
3578
+ * expiresAt - Data expiration timestamp
3579
+
3580
+ Results:
3581
+ * data - List of scraped documents
3582
+ * next - URL for next page of results (if paginated)
3583
+ * success - Whether status check succeeded
3584
+ * error - Error message if failed
3585
+
3586
+ Raises:
3587
+ Exception: If status check fails
3588
+ """
3589
+ headers = self._prepare_headers()
3590
+ endpoint = f'/v1/batch/scrape/{id}'
3591
+
3592
+ status_data = await self._async_get_request(
3593
+ f'{self.api_url}{endpoint}',
3594
+ headers
3595
+ )
3596
+
3597
+ if status_data['status'] == 'completed':
3598
+ if 'data' in status_data:
3599
+ data = status_data['data']
3600
+ while 'next' in status_data:
3601
+ if len(status_data['data']) == 0:
3602
+ break
3603
+ next_url = status_data.get('next')
3604
+ if not next_url:
3605
+ logger.warning("Expected 'next' URL is missing.")
3606
+ break
3607
+ next_data = await self._async_get_request(next_url, headers)
3608
+ data.extend(next_data.get('data', []))
3609
+ status_data = next_data
3610
+ status_data['data'] = data
3611
+
3612
+ response = BatchScrapeStatusResponse(
3613
+ status=status_data.get('status'),
3614
+ total=status_data.get('total'),
3615
+ completed=status_data.get('completed'),
3616
+ creditsUsed=status_data.get('creditsUsed'),
3617
+ expiresAt=status_data.get('expiresAt'),
3618
+ data=status_data.get('data')
3619
+ )
3620
+
3621
+ if 'error' in status_data:
3622
+ response.error = status_data['error']
3623
+
3624
+ if 'next' in status_data:
3625
+ response.next = status_data['next']
3626
+
3627
+ response.success = 'error' not in status_data
3628
+ return response
3631
+
3632
+ async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
3633
+ """
3634
+ Get information about errors from an asynchronous batch scrape job.
3635
+
3636
+ Args:
3637
+ id (str): The ID of the batch scrape job
3638
+
3639
+ Returns:
3640
+ CrawlErrorsResponse containing:
3641
+ errors (List[Dict[str, str]]): List of errors with fields:
3642
+ * id (str): Error ID
3643
+ * timestamp (str): When the error occurred
3644
+ * url (str): URL that caused the error
3645
+ * error (str): Error message
3646
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3647
+
3648
+ Raises:
3649
+ Exception: If error check fails
3650
+ """
3651
+ headers = self._prepare_headers()
3652
+ return await self._async_get_request(
3653
+ f'{self.api_url}/v1/batch/scrape/{id}/errors',
3654
+ headers
3655
+ )
3656
+
3657
+ async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
3658
+ """
3659
+ Get information about errors from an asynchronous crawl job.
3660
+
3661
+ Args:
3662
+ id (str): The ID of the crawl job
3663
+
3664
+ Returns:
3665
+ CrawlErrorsResponse containing:
3666
+ * errors (List[Dict[str, str]]): List of errors with fields:
3667
+ - id (str): Error ID
3668
+ - timestamp (str): When the error occurred
3669
+ - url (str): URL that caused the error
3670
+ - error (str): Error message
3671
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3672
+
3673
+ Raises:
3674
+ Exception: If error check fails
3675
+ """
3676
+ headers = self._prepare_headers()
3677
+ return await self._async_get_request(
3678
+ f'{self.api_url}/v1/crawl/{id}/errors',
3679
+ headers
3680
+ )
3681
+
3682
+ async def cancel_crawl(self, id: str) -> Dict[str, Any]:
3683
+ """
3684
+ Cancel an asynchronous crawl job.
3685
+
3686
+ Args:
3687
+ id (str): The ID of the crawl job to cancel
3688
+
3689
+ Returns:
3690
+ Dict[str, Any] containing:
3691
+ * success (bool): Whether cancellation was successful
3692
+ * error (str, optional): Error message if cancellation failed
3693
+
3694
+ Raises:
3695
+ Exception: If cancellation fails
3696
+ """
3697
+ headers = self._prepare_headers()
3698
+ async with aiohttp.ClientSession() as session:
3699
+ async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
3700
+ return await response.json()
3701
+
3702
+ async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
3703
+ """
3704
+ Check the status of an asynchronous extraction job.
3705
+
3706
+ Args:
3707
+ job_id (str): The ID of the extraction job
3708
+
3709
+ Returns:
3710
+ ExtractResponse[Any] with:
3711
+ * success (bool): Whether request succeeded
3712
+ * data (Optional[Any]): Extracted data matching schema
3713
+ * error (Optional[str]): Error message if any
3714
+ * warning (Optional[str]): Warning message if any
3715
+ * sources (Optional[List[str]]): Source URLs if requested
3716
+
3717
+ Raises:
3718
+ ValueError: If status check fails
3719
+ """
3720
+ headers = self._prepare_headers()
3721
+ try:
3722
+ return await self._async_get_request(
3723
+ f'{self.api_url}/v1/extract/{job_id}',
3724
+ headers
3725
+ )
3726
+ except Exception as e:
3727
+ raise ValueError(str(e))
3728
+
3729
+ async def async_extract(
3730
+ self,
3731
+ urls: Optional[List[str]] = None,
3732
+ *,
3733
+ prompt: Optional[str] = None,
3734
+ schema: Optional[Any] = None,
3735
+ system_prompt: Optional[str] = None,
3736
+ allow_external_links: Optional[bool] = False,
3737
+ enable_web_search: Optional[bool] = False,
3738
+ show_sources: Optional[bool] = False,
3739
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3740
+ """
3741
+ Initiate an asynchronous extraction job without waiting for completion.
3742
+
3743
+ Args:
3744
+ urls (Optional[List[str]]): URLs to extract from
3745
+ prompt (Optional[str]): Custom extraction prompt
3746
+ schema (Optional[Any]): JSON schema/Pydantic model
3747
+ system_prompt (Optional[str]): System context
3748
+ allow_external_links (Optional[bool]): Follow external links
3749
+ enable_web_search (Optional[bool]): Enable web search
3750
+ show_sources (Optional[bool]): Include source URLs
3751
+ agent (Optional[Dict[str, Any]]): Agent configuration
3752
+
3754
+ Returns:
3755
+ ExtractResponse[Any] with:
3756
+ * success (bool): Whether request succeeded
3757
+ * data (Optional[Any]): Extracted data matching schema
3758
+ * error (Optional[str]): Error message if any
3759
+
3760
+ Raises:
3761
+ ValueError: If job initiation fails
3762
+ """
3763
+ headers = self._prepare_headers()
3764
+
3765
+ if not prompt and not schema:
3766
+ raise ValueError("Either prompt or schema is required")
3767
+
3768
+ if not urls and not prompt:
3769
+ raise ValueError("Either urls or prompt is required")
3770
+
3771
+ if schema:
3772
+ if hasattr(schema, 'model_json_schema'):
3773
+ schema = schema.model_json_schema()
3774
+
3775
+ request_data = {
3776
+ 'urls': urls or [],
3777
+ 'allowExternalLinks': allow_external_links,
3778
+ 'enableWebSearch': enable_web_search,
3779
+ 'showSources': show_sources,
3780
+ 'schema': schema,
3781
+ 'origin': f'python-sdk@{version}'
3782
+ }
3783
+
3784
+ if prompt:
3785
+ request_data['prompt'] = prompt
3786
+ if system_prompt:
3787
+ request_data['systemPrompt'] = system_prompt
3788
+ if agent:
3789
+ request_data['agent'] = agent
3790
+
3791
+ try:
3792
+ return await self._async_post_request(
3793
+ f'{self.api_url}/v1/extract',
3794
+ request_data,
3795
+ headers
3796
+ )
3797
+ except Exception as e:
3798
+ raise ValueError(str(e))
3799
+
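# --- Editor's illustrative sketch (not part of the released package diff) ---
# Non-blocking extraction: async_extract above submits the job and returns the
# raw response payload, and get_extract_status reads back the job state as a
# dict. Import path and API key are assumptions.
import asyncio
from firecrawl import AsyncFirecrawlApp

async def _async_extract_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    job = await app.async_extract(
        ["https://example.com"],
        prompt="List the main product names mentioned on the page.",
    )
    while True:
        status = await app.get_extract_status(job.get("id"))
        if status.get("status") in ("completed", "failed", "cancelled"):
            break
        await asyncio.sleep(2)
    print(status.get("data"))

asyncio.run(_async_extract_example())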
3800
+ async def generate_llms_text(
3801
+ self,
3802
+ url: str,
3803
+ *,
3804
+ max_urls: Optional[int] = None,
3805
+ show_full_text: Optional[bool] = None,
3806
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
3807
+ """
3808
+ Generate LLMs.txt for a given URL and monitor until completion.
3809
+
3810
+ Args:
3811
+ url (str): Target URL to generate LLMs.txt from
3812
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
3813
+ show_full_text (Optional[bool]): Include full text in output (default: False)
3814
+ experimental_stream (Optional[bool]): Enable experimental streaming
3815
+
3816
+ Returns:
3817
+ GenerateLLMsTextStatusResponse containing:
3818
+ * success (bool): Whether generation completed successfully
3819
+ * status (str): Status of generation (processing/completed/failed)
3820
+ * data (Dict[str, str], optional): Generated text with fields:
3821
+ - llmstxt (str): Generated LLMs.txt content
3822
+ - llmsfulltxt (str, optional): Full version if requested
3823
+ * error (str, optional): Error message if generation failed
3824
+ * expiresAt (str): When the generated data expires
3825
+
3826
+ Raises:
3827
+ Exception: If generation fails
3828
+ """
3829
+
3837
+ response = await self.async_generate_llms_text(
3838
+ url,
3839
+ max_urls=max_urls,
3840
+ show_full_text=show_full_text,
3841
+ experimental_stream=experimental_stream
3842
+ )
3843
+ if not response.get('success') or 'id' not in response:
3844
+ return response
3845
+
3846
+ job_id = response['id']
3847
+ while True:
3848
+ status = await self.check_generate_llms_text_status(job_id)
3849
+
3850
+ if status['status'] == 'completed':
3851
+ return status
3852
+ elif status['status'] == 'failed':
3853
+ raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
3854
+ elif status['status'] != 'processing':
3855
+ break
3856
+
3857
+ await asyncio.sleep(2)
3858
+
3859
+ return GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly')
3860
+
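# --- Editor's illustrative sketch (not part of the released package diff) ---
# generate_llms_text above submits a generation job and polls it; on success the
# polling loop returns the raw status payload, so the generated text is read
# from the 'data' dict described in the docstring. Import path and API key are
# assumptions.
import asyncio
from firecrawl import AsyncFirecrawlApp

async def _llmstxt_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    result = await app.generate_llms_text("https://example.com", max_urls=10)
    if isinstance(result, dict) and result.get("status") == "completed":
        print(result.get("data", {}).get("llmstxt"))

asyncio.run(_llmstxt_example())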
3861
+ async def async_generate_llms_text(
3862
+ self,
3863
+ url: str,
3864
+ *,
3865
+ max_urls: Optional[int] = None,
3866
+ show_full_text: Optional[bool] = None,
3867
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
3868
+ """
3869
+ Initiate an asynchronous LLMs.txt generation job without waiting for completion.
3870
+
3871
+ Args:
3872
+ url (str): Target URL to generate LLMs.txt from
3873
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
3874
+ show_full_text (Optional[bool]): Include full text in output (default: False)
3875
+ experimental_stream (Optional[bool]): Enable experimental streaming
3876
+
3877
+ Returns:
3878
+ GenerateLLMsTextResponse containing:
3879
+ * success (bool): Whether job started successfully
3880
+ * id (str): Unique identifier for the job
3881
+ * error (str, optional): Error message if start failed
3882
+
3883
+ Raises:
3884
+ ValueError: If job initiation fails
3885
+ """
3886
+ params = GenerateLLMsTextParams(
3895
+ maxUrls=max_urls,
3896
+ showFullText=show_full_text,
3897
+ __experimental_stream=experimental_stream
3898
+ )
3899
+
3900
+ headers = self._prepare_headers()
3901
+ json_data = {'url': url, **params.dict(exclude_none=True)}
3902
+ json_data['origin'] = f"python-sdk@{version}"
3903
+
3904
+ try:
3905
+ return await self._async_post_request(
3906
+ f'{self.api_url}/v1/llmstxt',
3907
+ json_data,
3908
+ headers
3909
+ )
3910
+ except Exception as e:
3911
+ raise ValueError(str(e))
3912
+
3913
+ async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
3914
+ """
3915
+ Check the status of an asynchronous LLMs.txt generation job.
3916
+
3917
+ Args:
3918
+ id (str): The ID of the generation job
3919
+
3920
+ Returns:
3921
+ GenerateLLMsTextStatusResponse containing:
3922
+ * success (bool): Whether generation completed successfully
3923
+ * status (str): Status of generation (processing/completed/failed)
3924
+ * data (Dict[str, str], optional): Generated text with fields:
3925
+ - llmstxt (str): Generated LLMs.txt content
3926
+ - llmsfulltxt (str, optional): Full version if requested
3927
+ * error (str, optional): Error message if generation failed
3928
+ * expiresAt (str): When the generated data expires
3929
+
3930
+ Raises:
3931
+ ValueError: If status check fails
3932
+ """
3933
+ headers = self._prepare_headers()
3934
+ try:
3935
+ return await self._async_get_request(
3936
+ f'{self.api_url}/v1/llmstxt/{id}',
3937
+ headers
3938
+ )
3939
+ except Exception as e:
3940
+ raise ValueError(str(e))
3941
+
3942
+ async def deep_research(
3943
+ self,
3944
+ query: str,
3945
+ *,
3946
+ max_depth: Optional[int] = None,
3947
+ time_limit: Optional[int] = None,
3948
+ max_urls: Optional[int] = None,
3949
+ analysis_prompt: Optional[str] = None,
3950
+ system_prompt: Optional[str] = None,
3951
+ __experimental_stream_steps: Optional[bool] = None,
3952
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
3953
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
3954
+ """
3955
+ Initiates a deep research operation on a given query and polls until completion.
3956
+
3957
+ Args:
3958
+ query (str): Research query or topic to investigate
3959
+ max_depth (Optional[int]): Maximum depth of research exploration
3960
+ time_limit (Optional[int]): Time limit in seconds for research
3961
+ max_urls (Optional[int]): Maximum number of URLs to process
3962
+ analysis_prompt (Optional[str]): Custom prompt for analysis
3963
+ system_prompt (Optional[str]): Custom system prompt
3964
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
3965
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
3966
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
3967
+
3968
+ Returns:
3969
+ DeepResearchStatusResponse containing:
3970
+ * success (bool): Whether research completed successfully
3971
+ * status (str): Current state (processing/completed/failed)
3972
+ * error (Optional[str]): Error message if failed
3973
+ * id (str): Unique identifier for the research job
3974
+ * data (Any): Research findings and analysis
3975
+ * sources (List[Dict]): List of discovered sources
3976
+ * activities (List[Dict]): Research progress log
3977
+ * summaries (List[str]): Generated research summaries
3978
+
3979
+ Raises:
3980
+ Exception: If research fails
3981
+ """
3982
+ research_params = {}
3983
+ if max_depth is not None:
3984
+ research_params['maxDepth'] = max_depth
3985
+ if time_limit is not None:
3986
+ research_params['timeLimit'] = time_limit
3987
+ if max_urls is not None:
3988
+ research_params['maxUrls'] = max_urls
3989
+ if analysis_prompt is not None:
3990
+ research_params['analysisPrompt'] = analysis_prompt
3991
+ if system_prompt is not None:
3992
+ research_params['systemPrompt'] = system_prompt
3993
+ if __experimental_stream_steps is not None:
3994
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
3995
+ research_params = DeepResearchParams(**research_params)
3996
+
3997
+ response = await self.async_deep_research(
3998
+ query,
3999
+ max_depth=max_depth,
4000
+ time_limit=time_limit,
4001
+ max_urls=max_urls,
4002
+ analysis_prompt=analysis_prompt,
4003
+ system_prompt=system_prompt
4004
+ )
4005
+ if not response.get('success') or 'id' not in response:
4006
+ return response
4007
+
4008
+ job_id = response['id']
4009
+ last_activity_count = 0
4010
+ last_source_count = 0
4011
+
4012
+ while True:
4013
+ status = await self.check_deep_research_status(job_id)
4014
+
4015
+ if on_activity and 'activities' in status:
4016
+ new_activities = status['activities'][last_activity_count:]
4017
+ for activity in new_activities:
4018
+ on_activity(activity)
4019
+ last_activity_count = len(status['activities'])
4020
+
4021
+ if on_source and 'sources' in status:
4022
+ new_sources = status['sources'][last_source_count:]
4023
+ for source in new_sources:
4024
+ on_source(source)
4025
+ last_source_count = len(status['sources'])
4026
+
4027
+ if status['status'] == 'completed':
4028
+ return status
4029
+ elif status['status'] == 'failed':
4030
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
4031
+ elif status['status'] != 'processing':
4032
+ break
4033
+
4034
+ await asyncio.sleep(2)
4035
+
4036
+ return DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')
4037
+
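# --- Editor's illustrative sketch (not part of the released package diff) ---
# deep_research above polls until the job finishes and streams progress through
# the optional callbacks; the final value is the raw status payload. Import path
# and API key are assumptions.
import asyncio
from firecrawl import AsyncFirecrawlApp

def _print_activity(activity):
    # Activities arrive as dicts with type/status/message/timestamp/depth fields.
    print(f"[{activity.get('type')}] {activity.get('message')}")

async def _research_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    result = await app.deep_research(
        "How do large-scale web crawlers handle politeness and rate limiting?",
        max_depth=3,
        time_limit=120,
        on_activity=_print_activity,
    )
    if isinstance(result, dict):
        print(result.get("data"))
        for source in result.get("sources", []):
            print(source.get("url"))

asyncio.run(_research_example())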
4038
+ async def async_deep_research(
4039
+ self,
4040
+ query: str,
4041
+ *,
4042
+ max_depth: Optional[int] = None,
4043
+ time_limit: Optional[int] = None,
4044
+ max_urls: Optional[int] = None,
4045
+ analysis_prompt: Optional[str] = None,
4046
+ system_prompt: Optional[str] = None,
4047
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
4048
+ """
4049
+ Initiates an asynchronous deep research operation.
4050
+
4051
+ Args:
4052
+ query (str): Research query or topic to investigate
4053
+ max_depth (Optional[int]): Maximum depth of research exploration
4054
+ time_limit (Optional[int]): Time limit in seconds for research
4055
+ max_urls (Optional[int]): Maximum number of URLs to process
4056
+ analysis_prompt (Optional[str]): Custom prompt for analysis
4057
+ system_prompt (Optional[str]): Custom system prompt
4058
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4059
+
4060
+ Returns:
4061
+ Dict[str, Any]: A response containing:
4062
+ * success (bool): Whether the research initiation was successful
4063
+ * id (str): The unique identifier for the research job
4064
+ * error (str, optional): Error message if initiation failed
4065
+
4066
+ Raises:
4067
+ Exception: If the research initiation fails.
4068
+ """
4069
+ research_params = {}
4070
+ if max_depth is not None:
4071
+ research_params['maxDepth'] = max_depth
4072
+ if time_limit is not None:
4073
+ research_params['timeLimit'] = time_limit
4074
+ if max_urls is not None:
4075
+ research_params['maxUrls'] = max_urls
4076
+ if analysis_prompt is not None:
4077
+ research_params['analysisPrompt'] = analysis_prompt
4078
+ if system_prompt is not None:
4079
+ research_params['systemPrompt'] = system_prompt
4080
+ if __experimental_stream_steps is not None:
4081
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
4082
+ research_params = DeepResearchParams(**research_params)
4083
+
4084
+ headers = self._prepare_headers()
4085
+
4086
+ json_data = {'query': query, **research_params.dict(exclude_none=True)}
4087
+ json_data['origin'] = f"python-sdk@{version}"
4088
+
4089
+ try:
4090
+ return await self._async_post_request(
4091
+ f'{self.api_url}/v1/deep-research',
4092
+ json_data,
4093
+ headers
4094
+ )
4095
+ except Exception as e:
4096
+ raise ValueError(str(e))
4097
+
4098
+ async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
4099
+ """
4100
+ Check the status of a deep research operation.
4101
+
4102
+ Args:
4103
+ id (str): The ID of the deep research operation.
4104
+
4105
+ Returns:
4106
+ DeepResearchStatusResponse containing:
4107
+
4108
+ Status:
4109
+ * success - Whether research completed successfully
4110
+ * status - Current state (processing/completed/failed)
4111
+ * error - Error message if failed
4112
+
4113
+ Results:
4114
+ * id - Unique identifier for the research job
4115
+ * data - Research findings and analysis
4116
+ * sources - List of discovered sources
4117
+ * activities - Research progress log
4118
+ * summaries - Generated research summaries
4119
+
4120
+ Raises:
4121
+ Exception: If the status check fails.
4122
+ """
4123
+ headers = self._prepare_headers()
4124
+ try:
4125
+ return await self._async_get_request(
4126
+ f'{self.api_url}/v1/deep-research/{id}',
4127
+ headers
4128
+ )
4129
+ except Exception as e:
4130
+ raise ValueError(str(e))
4131
+
4132
+ async def search(
4133
+ self,
4134
+ query: str,
4135
+ *,
4136
+ limit: Optional[int] = None,
4137
+ tbs: Optional[str] = None,
4138
+ filter: Optional[str] = None,
4139
+ lang: Optional[str] = None,
4140
+ country: Optional[str] = None,
4141
+ location: Optional[str] = None,
4142
+ timeout: Optional[int] = None,
4143
+ scrape_options: Optional[ScrapeOptions] = None,
4144
+ params: Optional[Union[Dict[str, Any], SearchParams]] = None,
4145
+ **kwargs) -> SearchResponse:
4146
+ """
4147
+ Asynchronously search for content using Firecrawl.
4148
+
4149
+ Args:
4150
+ query (str): Search query string
4151
+ limit (Optional[int]): Max results (default: 5)
4152
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
4153
+ filter (Optional[str]): Custom result filter
4154
+ lang (Optional[str]): Language code (default: "en")
4155
+ country (Optional[str]): Country code (default: "us")
4156
+ location (Optional[str]): Geo-targeting
4157
+ timeout (Optional[int]): Request timeout in milliseconds
4158
+ scrape_options (Optional[ScrapeOptions]): Result scraping configuration
4159
+ params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
4160
+ **kwargs: Additional keyword arguments for future compatibility
4161
+
4162
+ Returns:
4163
+ SearchResponse: Response containing:
4164
+ * success (bool): Whether request succeeded
4165
+ * data (List[FirecrawlDocument]): Search results
4166
+ * warning (Optional[str]): Warning message if any
4167
+ * error (Optional[str]): Error message if any
4168
+
4169
+ Raises:
4170
+ Exception: If search fails or response cannot be parsed
4171
+ """
4172
+ # Build search parameters
4173
+ search_params = {}
4174
+ if params:
4175
+ if isinstance(params, dict):
4176
+ search_params.update(params)
4177
+ else:
4178
+ search_params.update(params.dict(exclude_none=True))
4179
+
4180
+ # Add individual parameters
4181
+ if limit is not None:
4182
+ search_params['limit'] = limit
4183
+ if tbs is not None:
4184
+ search_params['tbs'] = tbs
4185
+ if filter is not None:
4186
+ search_params['filter'] = filter
4187
+ if lang is not None:
4188
+ search_params['lang'] = lang
4189
+ if country is not None:
4190
+ search_params['country'] = country
4191
+ if location is not None:
4192
+ search_params['location'] = location
4193
+ if timeout is not None:
4194
+ search_params['timeout'] = timeout
4195
+ if scrape_options is not None:
4196
+ search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
4197
+
4198
+ # Add any additional kwargs
4199
+ search_params.update(kwargs)
4200
+
4201
+ # Create final params object
4202
+ final_params = SearchParams(query=query, **search_params)
4203
+ params_dict = final_params.dict(exclude_none=True)
4204
+ params_dict['origin'] = f"python-sdk@{version}"
4205
+
4206
+ return await self._async_post_request(
4207
+ f"{self.api_url}/v1/search",
4208
+ params_dict,
4209
+ {"Authorization": f"Bearer {self.api_key}"}
4210
+ )
4211
+
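# --- Editor's illustrative sketch (not part of the released package diff) ---
# search above posts the query and returns the parsed JSON payload directly, so
# results are read as dicts. ScrapeOptions is assumed to be exported alongside
# AsyncFirecrawlApp; the API key is a placeholder.
import asyncio
from firecrawl import AsyncFirecrawlApp, ScrapeOptions

async def _search_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    result = await app.search(
        "firecrawl python sdk",
        limit=3,
        scrape_options=ScrapeOptions(formats=["markdown"]),
    )
    for item in result.get("data", []):
        print(item.get("url"), "-", item.get("title"))

asyncio.run(_search_example())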
4212
+ class AsyncCrawlWatcher(CrawlWatcher):
4213
+ """
4214
+ Async version of CrawlWatcher that properly handles async operations.
4215
+ """
4216
+ def __init__(self, id: str, app: AsyncFirecrawlApp):
4217
+ super().__init__(id, app)
4218
+
4219
+ async def connect(self) -> None:
4220
+ """
4221
+ Establishes async WebSocket connection and starts listening for messages.
4222
+ """
4223
+ async with websockets.connect(
4224
+ self.ws_url,
4225
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
4226
+ ) as websocket:
4227
+ await self._listen(websocket)
4228
+
4229
+ async def _listen(self, websocket) -> None:
4230
+ """
4231
+ Listens for incoming WebSocket messages and handles them asynchronously.
4232
+
4233
+ Args:
4234
+ websocket: The WebSocket connection object
4235
+ """
4236
+ async for message in websocket:
4237
+ msg = json.loads(message)
4238
+ await self._handle_message(msg)
4239
+
4240
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
4241
+ """
4242
+ Handles incoming WebSocket messages based on their type asynchronously.
4243
+
4244
+ Args:
4245
+ msg (Dict[str, Any]): The message to handle
4246
+ """
4247
+ if msg['type'] == 'done':
4248
+ self.status = 'completed'
4249
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
4250
+ elif msg['type'] == 'error':
4251
+ self.status = 'failed'
4252
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
4253
+ elif msg['type'] == 'catchup':
4254
+ self.status = msg['data']['status']
4255
+ self.data.extend(msg['data'].get('data', []))
4256
+ for doc in self.data:
4257
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
4258
+ elif msg['type'] == 'document':
4259
+ self.data.append(msg['data'])
4260
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
4261
+
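# --- Editor's illustrative sketch (not part of the released package diff) ---
# AsyncCrawlWatcher above opens a WebSocket for a crawl id and keeps its status
# and data attributes updated until the stream closes. Assumptions: the watcher
# class is reachable as an attribute of AsyncFirecrawlApp (it is defined inside
# it here), and the parent CrawlWatcher's __init__ derives ws_url from id/app.
import asyncio
from firecrawl import AsyncFirecrawlApp

async def _watch_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    job = await app.async_crawl_url("https://example.com", limit=5)
    watcher = AsyncFirecrawlApp.AsyncCrawlWatcher(job.id, app)
    await watcher.connect()  # returns when the WebSocket stream closes
    print(watcher.status, len(watcher.data), "documents")

asyncio.run(_watch_example())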
4262
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
4263
+ """
4264
+ Handle errors from async API responses.
4265
+ """
4266
+ try:
4267
+ error_data = await response.json()
4268
+ error_message = error_data.get('error', 'No error message provided.')
4269
+ error_details = error_data.get('details', 'No additional error details provided.')
4270
+ except:
4271
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
4272
+
4273
+ # Use the app's method to get the error message
4274
+ message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
4275
+
4276
+ raise aiohttp.ClientError(message)
4277
+
4278
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
4279
+ """
4280
+ Generate a standardized error message based on HTTP status code for async operations.
4281
+
4282
+ Args:
4283
+ status_code (int): The HTTP status code from the response
4284
+ action (str): Description of the action that was being performed
4285
+ error_message (str): The error message from the API response
4286
+ error_details (str): Additional error details from the API response
4287
+
4288
+ Returns:
4289
+ str: A formatted error message
4290
+ """
4291
+ return self._get_error_message(status_code, action, error_message, error_details)