firecrawl-py 2.12.0__py3-none-any.whl → 2.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.

Potentially problematic release: this version of firecrawl-py might be problematic.

@@ -0,0 +1,4613 @@
1
+ """
2
+ FirecrawlApp Module
3
+
4
+ This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
5
+ It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
6
+ and check the status of these jobs. The module uses requests for HTTP communication
7
+ and handles retries for certain HTTP status codes.
8
+
9
+ Classes:
10
+ - FirecrawlApp: Main class for interacting with the Firecrawl API.
11
+ """
12
+ import logging
13
+ import os
14
+ import time
15
+ from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
16
+ import json
17
+ from datetime import datetime
18
+ import re
19
+ import warnings
20
+ import requests
21
+ import pydantic
22
+ import websockets
23
+ import aiohttp
24
+ import asyncio
25
+ from pydantic import Field
26
+ import ssl
27
+ import certifi
28
+
29
+ # Suppress Pydantic warnings about attribute shadowing
30
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
31
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
32
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
33
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
34
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingOptions\" shadows an attribute in parent \"BaseModel\"")
35
+
36
+ def get_version():
37
+ try:
38
+ from pathlib import Path
39
+ package_path = os.path.dirname(__file__)
40
+ version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
41
+ version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
42
+ if version_match:
43
+ return version_match.group(1).strip()
44
+ except Exception:
45
+ print("Failed to get version from __init__.py")
46
+ return None
47
+
48
+ version = get_version()
49
+
50
+ logger : logging.Logger = logging.getLogger("firecrawl")
51
+
52
+ T = TypeVar('T')
53
+
54
+ # class FirecrawlDocumentMetadata(pydantic.BaseModel):
55
+ # """Metadata for a Firecrawl document."""
56
+ # title: Optional[str] = None
57
+ # description: Optional[str] = None
58
+ # language: Optional[str] = None
59
+ # keywords: Optional[str] = None
60
+ # robots: Optional[str] = None
61
+ # ogTitle: Optional[str] = None
62
+ # ogDescription: Optional[str] = None
63
+ # ogUrl: Optional[str] = None
64
+ # ogImage: Optional[str] = None
65
+ # ogAudio: Optional[str] = None
66
+ # ogDeterminer: Optional[str] = None
67
+ # ogLocale: Optional[str] = None
68
+ # ogLocaleAlternate: Optional[List[str]] = None
69
+ # ogSiteName: Optional[str] = None
70
+ # ogVideo: Optional[str] = None
71
+ # dctermsCreated: Optional[str] = None
72
+ # dcDateCreated: Optional[str] = None
73
+ # dcDate: Optional[str] = None
74
+ # dctermsType: Optional[str] = None
75
+ # dcType: Optional[str] = None
76
+ # dctermsAudience: Optional[str] = None
77
+ # dctermsSubject: Optional[str] = None
78
+ # dcSubject: Optional[str] = None
79
+ # dcDescription: Optional[str] = None
80
+ # dctermsKeywords: Optional[str] = None
81
+ # modifiedTime: Optional[str] = None
82
+ # publishedTime: Optional[str] = None
83
+ # articleTag: Optional[str] = None
84
+ # articleSection: Optional[str] = None
85
+ # sourceURL: Optional[str] = None
86
+ # statusCode: Optional[int] = None
87
+ # error: Optional[str] = None
88
+
89
+ class AgentOptions(pydantic.BaseModel):
90
+ """Configuration for the agent."""
91
+ model: Literal["FIRE-1"] = "FIRE-1"
92
+ prompt: Optional[str] = None
93
+
94
+ class AgentOptionsExtract(pydantic.BaseModel):
95
+ """Configuration for the agent in extract operations."""
96
+ model: Literal["FIRE-1"] = "FIRE-1"
97
+
98
+ class ActionsResult(pydantic.BaseModel):
99
+ """Result of actions performed during scraping."""
100
+ screenshots: List[str]
101
+ pdfs: List[str]
102
+
103
+ class ChangeTrackingData(pydantic.BaseModel):
104
+ """
105
+ Data for the change tracking format.
106
+ """
107
+ previousScrapeAt: Optional[str] = None
108
+ changeStatus: str # "new" | "same" | "changed" | "removed"
109
+ visibility: str # "visible" | "hidden"
110
+ diff: Optional[Dict[str, Any]] = None
111
+ json: Optional[Any] = None
112
+
113
+ class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
114
+ """Document retrieved or processed by Firecrawl."""
115
+ url: Optional[str] = None
116
+ markdown: Optional[str] = None
117
+ html: Optional[str] = None
118
+ rawHtml: Optional[str] = None
119
+ links: Optional[List[str]] = None
120
+ extract: Optional[T] = None
121
+ json: Optional[T] = None
122
+ screenshot: Optional[str] = None
123
+ metadata: Optional[Any] = None
124
+ actions: Optional[ActionsResult] = None
125
+ title: Optional[str] = None # v1 search only
126
+ description: Optional[str] = None # v1 search only
127
+ changeTracking: Optional[ChangeTrackingData] = None
128
+
129
+ class LocationConfig(pydantic.BaseModel):
130
+ """Location configuration for scraping."""
131
+ country: Optional[str] = None
132
+ languages: Optional[List[str]] = None
133
+
134
+ class WebhookConfig(pydantic.BaseModel):
135
+ """Configuration for webhooks."""
136
+ url: str
137
+ headers: Optional[Dict[str, str]] = None
138
+ metadata: Optional[Dict[str, str]] = None
139
+ events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
140
+
141
+ class ChangeTrackingOptions(pydantic.BaseModel):
142
+ """Configuration for change tracking."""
143
+ modes: Optional[List[Literal["git-diff", "json"]]] = None
144
+ schema: Optional[Any] = None
145
+ prompt: Optional[str] = None
146
+ tag: Optional[str] = None
147
+
148
+ class ScrapeOptions(pydantic.BaseModel):
149
+ """Parameters for scraping operations."""
150
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
151
+ headers: Optional[Dict[str, str]] = None
152
+ includeTags: Optional[List[str]] = None
153
+ excludeTags: Optional[List[str]] = None
154
+ onlyMainContent: Optional[bool] = None
155
+ waitFor: Optional[int] = None
156
+ timeout: Optional[int] = None
157
+ location: Optional[LocationConfig] = None
158
+ mobile: Optional[bool] = None
159
+ skipTlsVerification: Optional[bool] = None
160
+ removeBase64Images: Optional[bool] = None
161
+ blockAds: Optional[bool] = None
162
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None
163
+ changeTrackingOptions: Optional[ChangeTrackingOptions] = None
164
+ maxAge: Optional[int] = None
165
+ storeInCache: Optional[bool] = None
166
+ parsePDF: Optional[bool] = None
167
+
168
+ class WaitAction(pydantic.BaseModel):
169
+ """Wait action to perform during scraping."""
170
+ type: Literal["wait"]
171
+ milliseconds: Optional[int] = None
172
+ selector: Optional[str] = None
173
+
174
+ class ScreenshotAction(pydantic.BaseModel):
175
+ """Screenshot action to perform during scraping."""
176
+ type: Literal["screenshot"]
177
+ fullPage: Optional[bool] = None
178
+ quality: Optional[int] = None
179
+
180
+ class ClickAction(pydantic.BaseModel):
181
+ """Click action to perform during scraping."""
182
+ type: Literal["click"]
183
+ selector: str
184
+
185
+ class WriteAction(pydantic.BaseModel):
186
+ """Write action to perform during scraping."""
187
+ type: Literal["write"]
188
+ text: str
189
+
190
+ class PressAction(pydantic.BaseModel):
191
+ """Press action to perform during scraping."""
192
+ type: Literal["press"]
193
+ key: str
194
+
195
+ class ScrollAction(pydantic.BaseModel):
196
+ """Scroll action to perform during scraping."""
197
+ type: Literal["scroll"]
198
+ direction: Literal["up", "down"]
199
+ selector: Optional[str] = None
200
+
201
+ class ScrapeAction(pydantic.BaseModel):
202
+ """Scrape action to perform during scraping."""
203
+ type: Literal["scrape"]
204
+
205
+ class ExecuteJavascriptAction(pydantic.BaseModel):
206
+ """Execute javascript action to perform during scraping."""
207
+ type: Literal["executeJavascript"]
208
+ script: str
209
+
210
+ class PDFAction(pydantic.BaseModel):
211
+ """PDF action to perform during scraping."""
212
+ type: Literal["pdf"]
213
+ format: Optional[Literal["A0", "A1", "A2", "A3", "A4", "A5", "A6", "Letter", "Legal", "Tabloid", "Ledger"]] = None
214
+ landscape: Optional[bool] = None
215
+ scale: Optional[float] = None
216
+
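# Illustrative usage sketch (not part of the package source): the action models
# above compose into the `actions` list accepted by scrape_url further down in
# this module. Selector and script values are hypothetical placeholders.
example_actions = [
    WaitAction(type="wait", milliseconds=2000),
    ClickAction(type="click", selector="#load-more"),
    ScrollAction(type="scroll", direction="down"),
    ExecuteJavascriptAction(type="executeJavascript", script="window.scrollTo(0, document.body.scrollHeight);"),
    ScreenshotAction(type="screenshot", fullPage=True),
]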
217
+ class ExtractAgent(pydantic.BaseModel):
218
+ """Configuration for the agent in extract operations."""
219
+ model: Literal["FIRE-1"] = "FIRE-1"
220
+
221
+ class JsonConfig(pydantic.BaseModel):
222
+ """Configuration for extraction."""
223
+ prompt: Optional[str] = None
224
+ schema: Optional[Any] = None
225
+ systemPrompt: Optional[str] = None
226
+ agent: Optional[ExtractAgent] = None
227
+
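# Illustrative usage sketch (not part of the package source): a JsonConfig for
# structured extraction. The schema is a plain JSON Schema dict; the field names
# are hypothetical.
price_extraction = JsonConfig(
    prompt="Extract the product name and price from the page.",
    schema={
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "price": {"type": "number"},
        },
        "required": ["name", "price"],
    },
)
# A config like this is passed via the `json_options` (or `extract`) argument of
# scrape_url, defined later in this module.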
228
+ class ScrapeParams(ScrapeOptions):
229
+ """Parameters for scraping operations."""
230
+ extract: Optional[JsonConfig] = None
231
+ jsonOptions: Optional[JsonConfig] = None
232
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None
233
+ agent: Optional[AgentOptions] = None
234
+ webhook: Optional[WebhookConfig] = None
235
+
236
+ class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
237
+ """Response from scraping operations."""
238
+ success: bool = True
239
+ warning: Optional[str] = None
240
+ error: Optional[str] = None
241
+
242
+ class BatchScrapeResponse(pydantic.BaseModel):
243
+ """Response from batch scrape operations."""
244
+ id: Optional[str] = None
245
+ url: Optional[str] = None
246
+ success: bool = True
247
+ error: Optional[str] = None
248
+ invalidURLs: Optional[List[str]] = None
249
+
250
+ class BatchScrapeStatusResponse(pydantic.BaseModel):
251
+ """Response from batch scrape status checks."""
252
+ success: bool = True
253
+ status: Literal["scraping", "completed", "failed", "cancelled"]
254
+ completed: int
255
+ total: int
256
+ creditsUsed: int
257
+ expiresAt: datetime
258
+ next: Optional[str] = None
259
+ data: List[FirecrawlDocument]
260
+
261
+ class CrawlParams(pydantic.BaseModel):
262
+ """Parameters for crawling operations."""
263
+ includePaths: Optional[List[str]] = None
264
+ excludePaths: Optional[List[str]] = None
265
+ maxDepth: Optional[int] = None
266
+ maxDiscoveryDepth: Optional[int] = None
267
+ limit: Optional[int] = None
268
+ allowBackwardLinks: Optional[bool] = None
269
+ allowExternalLinks: Optional[bool] = None
270
+ ignoreSitemap: Optional[bool] = None
271
+ scrapeOptions: Optional[ScrapeOptions] = None
272
+ webhook: Optional[Union[str, WebhookConfig]] = None
273
+ deduplicateSimilarURLs: Optional[bool] = None
274
+ ignoreQueryParameters: Optional[bool] = None
275
+ regexOnFullURL: Optional[bool] = None
276
+ delay: Optional[int] = None # Delay in seconds between scrapes
277
+ maxConcurrency: Optional[int] = None
278
+ allowSubdomains: Optional[bool] = None
279
+
280
+ class CrawlResponse(pydantic.BaseModel):
281
+ """Response from crawling operations."""
282
+ id: Optional[str] = None
283
+ url: Optional[str] = None
284
+ success: bool = True
285
+ error: Optional[str] = None
286
+
287
+ class CrawlStatusResponse(pydantic.BaseModel):
288
+ """Response from crawl status checks."""
289
+ success: bool = True
290
+ status: Literal["scraping", "completed", "failed", "cancelled"]
291
+ completed: int
292
+ total: int
293
+ creditsUsed: int
294
+ expiresAt: datetime
295
+ next: Optional[str] = None
296
+ data: List[FirecrawlDocument]
297
+
298
+ class CrawlErrorsResponse(pydantic.BaseModel):
299
+ """Response from crawl/batch scrape error monitoring."""
300
+ errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str}
301
+ robotsBlocked: List[str]
302
+
303
+ class MapParams(pydantic.BaseModel):
304
+ """Parameters for mapping operations."""
305
+ search: Optional[str] = None
306
+ ignoreSitemap: Optional[bool] = None
307
+ includeSubdomains: Optional[bool] = None
308
+ sitemapOnly: Optional[bool] = None
309
+ limit: Optional[int] = None
310
+ timeout: Optional[int] = None
311
+ useIndex: Optional[bool] = None
312
+
313
+ class MapResponse(pydantic.BaseModel):
314
+ """Response from mapping operations."""
315
+ success: bool = True
316
+ links: Optional[List[str]] = None
317
+ error: Optional[str] = None
318
+
319
+ class ExtractParams(pydantic.BaseModel):
320
+ """Parameters for extracting information from URLs."""
321
+ prompt: Optional[str] = None
322
+ schema: Optional[Any] = None
323
+ systemPrompt: Optional[str] = None
324
+ allowExternalLinks: Optional[bool] = None
325
+ enableWebSearch: Optional[bool] = None
326
+ includeSubdomains: Optional[bool] = None
327
+ origin: Optional[str] = None
328
+ showSources: Optional[bool] = None
329
+ scrapeOptions: Optional[ScrapeOptions] = None
330
+
331
+ class ExtractResponse(pydantic.BaseModel, Generic[T]):
332
+ """Response from extract operations."""
333
+ id: Optional[str] = None
334
+ status: Optional[Literal["processing", "completed", "failed"]] = None
335
+ expiresAt: Optional[datetime] = None
336
+ success: bool = True
337
+ data: Optional[T] = None
338
+ error: Optional[str] = None
339
+ warning: Optional[str] = None
340
+ sources: Optional[Dict[Any, Any]] = None
341
+
342
+ class SearchParams(pydantic.BaseModel):
343
+ query: str
344
+ limit: Optional[int] = 5
345
+ tbs: Optional[str] = None
346
+ filter: Optional[str] = None
347
+ lang: Optional[str] = "en"
348
+ country: Optional[str] = "us"
349
+ location: Optional[str] = None
350
+ origin: Optional[str] = "api"
351
+ timeout: Optional[int] = 60000
352
+ scrapeOptions: Optional[ScrapeOptions] = None
353
+
354
+ class SearchResponse(pydantic.BaseModel):
355
+ """Response from search operations."""
356
+ success: bool = True
357
+ data: List[FirecrawlDocument]
358
+ warning: Optional[str] = None
359
+ error: Optional[str] = None
360
+
361
+ class GenerateLLMsTextParams(pydantic.BaseModel):
362
+ """
363
+ Parameters for the LLMs.txt generation operation.
364
+ """
365
+ maxUrls: Optional[int] = 10
366
+ showFullText: Optional[bool] = False
367
+ cache: Optional[bool] = True
368
+ __experimental_stream: Optional[bool] = None
369
+
370
+ class DeepResearchParams(pydantic.BaseModel):
371
+ """
372
+ Parameters for the deep research operation.
373
+ """
374
+ maxDepth: Optional[int] = 7
375
+ timeLimit: Optional[int] = 270
376
+ maxUrls: Optional[int] = 20
377
+ analysisPrompt: Optional[str] = None
378
+ systemPrompt: Optional[str] = None
379
+ __experimental_streamSteps: Optional[bool] = None
380
+
381
+ class DeepResearchResponse(pydantic.BaseModel):
382
+ """
383
+ Response from the deep research operation.
384
+ """
385
+ success: bool
386
+ id: str
387
+ error: Optional[str] = None
388
+
389
+ class DeepResearchStatusResponse(pydantic.BaseModel):
390
+ """
391
+ Status response from the deep research operation.
392
+ """
393
+ success: bool
394
+ data: Optional[Dict[str, Any]] = None
395
+ status: str
396
+ error: Optional[str] = None
397
+ expiresAt: str
398
+ currentDepth: int
399
+ maxDepth: int
400
+ activities: List[Dict[str, Any]]
401
+ sources: List[Dict[str, Any]]
402
+ summaries: List[str]
403
+
404
+ class GenerateLLMsTextResponse(pydantic.BaseModel):
405
+ """Response from LLMs.txt generation operations."""
406
+ success: bool = True
407
+ id: str
408
+ error: Optional[str] = None
409
+
410
+ class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
411
+ llmstxt: str
412
+ llmsfulltxt: Optional[str] = None
413
+
414
+ class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
415
+ """Status response from LLMs.txt generation operations."""
416
+ success: bool = True
417
+ data: Optional[GenerateLLMsTextStatusResponseData] = None
418
+ status: Literal["processing", "completed", "failed"]
419
+ error: Optional[str] = None
420
+ expiresAt: str
421
+
422
+ class SearchResponse(pydantic.BaseModel):
423
+ """
424
+ Response from the search operation.
425
+ """
426
+ success: bool
427
+ data: List[Dict[str, Any]]
428
+ warning: Optional[str] = None
429
+ error: Optional[str] = None
430
+
431
+ class ExtractParams(pydantic.BaseModel):
432
+ """
433
+ Parameters for the extract operation.
434
+ """
435
+ prompt: Optional[str] = None
436
+ schema: Optional[Any] = pydantic.Field(None, alias='schema')
437
+ system_prompt: Optional[str] = None
438
+ allow_external_links: Optional[bool] = False
439
+ enable_web_search: Optional[bool] = False
440
+ # Just for backwards compatibility
441
+ enableWebSearch: Optional[bool] = False
442
+ show_sources: Optional[bool] = False
443
+ agent: Optional[Dict[str, Any]] = None
444
+
445
+ class FirecrawlApp:
446
+ def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
447
+ """
448
+ Initialize the FirecrawlApp instance with an API key and API URL.
449
+
450
+ Args:
451
+ api_key (Optional[str]): API key for authenticating with the Firecrawl API.
452
+ api_url (Optional[str]): Base URL for the Firecrawl API.
453
+ """
454
+ self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
455
+ self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
456
+
457
+ # Only require API key when using cloud service
458
+ if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
459
+ logger.warning("No API key provided for cloud service")
460
+ raise ValueError('No API key provided')
461
+
462
+ logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
463
+
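# Illustrative usage sketch (not part of the package source), assuming the
# conventional top-level import for this package:
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")              # hosted API; key required
local_app = FirecrawlApp(api_url="http://localhost:3002")  # self-hosted; no key needed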
464
+ def scrape_url(
465
+ self,
466
+ url: str,
467
+ *,
468
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
469
+ include_tags: Optional[List[str]] = None,
470
+ exclude_tags: Optional[List[str]] = None,
471
+ only_main_content: Optional[bool] = None,
472
+ wait_for: Optional[int] = None,
473
+ timeout: Optional[int] = None,
474
+ location: Optional[LocationConfig] = None,
475
+ mobile: Optional[bool] = None,
476
+ skip_tls_verification: Optional[bool] = None,
477
+ remove_base64_images: Optional[bool] = None,
478
+ block_ads: Optional[bool] = None,
479
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
480
+ parse_pdf: Optional[bool] = None,
481
+ extract: Optional[JsonConfig] = None,
482
+ json_options: Optional[JsonConfig] = None,
483
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
484
+ change_tracking_options: Optional[ChangeTrackingOptions] = None,
485
+ max_age: Optional[int] = None,
486
+ store_in_cache: Optional[bool] = None,
487
+ zero_data_retention: Optional[bool] = None,
488
+ **kwargs) -> ScrapeResponse[Any]:
489
+ """
490
+ Scrape and extract content from a URL.
491
+
492
+ Args:
493
+ url (str): Target URL to scrape
494
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
495
+ include_tags (Optional[List[str]]): HTML tags to include
496
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
497
+ only_main_content (Optional[bool]): Extract main content only
498
+ wait_for (Optional[int]): Wait time in milliseconds before capturing the page content
499
+ timeout (Optional[int]): Request timeout (ms)
500
+ location (Optional[LocationConfig]): Location configuration
501
+ mobile (Optional[bool]): Use mobile user agent
502
+ skip_tls_verification (Optional[bool]): Skip TLS verification
503
+ remove_base64_images (Optional[bool]): Remove base64 images
504
+ block_ads (Optional[bool]): Block ads
505
+ proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
506
+ extract (Optional[JsonConfig]): Content extraction settings
507
+ json_options (Optional[JsonConfig]): JSON extraction settings
508
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
509
+ change_tracking_options (Optional[ChangeTrackingOptions]): Change tracking settings
510
+ zero_data_retention (Optional[bool]): Whether to delete data after scrape is done
511
+
512
+
513
+ Returns:
514
+ ScrapeResponse with:
515
+ * Requested content formats
516
+ * Page metadata
517
+ * Extraction results
518
+ * Success/error status
519
+
520
+ Raises:
521
+ Exception: If scraping fails
522
+ """
523
+ headers = self._prepare_headers()
524
+
525
+ # Build scrape parameters
526
+ scrape_params = {
527
+ 'url': url,
528
+ 'origin': f"python-sdk@{version}"
529
+ }
530
+
531
+ # Add optional parameters if provided
532
+ if formats:
533
+ scrape_params['formats'] = formats
534
+ if include_tags:
535
+ scrape_params['includeTags'] = include_tags
536
+ if exclude_tags:
537
+ scrape_params['excludeTags'] = exclude_tags
538
+ if only_main_content is not None:
539
+ scrape_params['onlyMainContent'] = only_main_content
540
+ if wait_for:
541
+ scrape_params['waitFor'] = wait_for
542
+ if timeout:
543
+ scrape_params['timeout'] = timeout
544
+ if location:
545
+ scrape_params['location'] = location.dict(exclude_none=True)
546
+ if mobile is not None:
547
+ scrape_params['mobile'] = mobile
548
+ if skip_tls_verification is not None:
549
+ scrape_params['skipTlsVerification'] = skip_tls_verification
550
+ if remove_base64_images is not None:
551
+ scrape_params['removeBase64Images'] = remove_base64_images
552
+ if block_ads is not None:
553
+ scrape_params['blockAds'] = block_ads
554
+ if proxy:
555
+ scrape_params['proxy'] = proxy
556
+ if parse_pdf is not None:
557
+ scrape_params['parsePDF'] = parse_pdf
558
+ if extract is not None:
559
+ extract = self._ensure_schema_dict(extract)
560
+ if isinstance(extract, dict) and "schema" in extract:
561
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
562
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
563
+ if json_options is not None:
564
+ json_options = self._ensure_schema_dict(json_options)
565
+ if isinstance(json_options, dict) and "schema" in json_options:
566
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
567
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
568
+ if actions:
569
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
570
+ if change_tracking_options:
571
+ scrape_params['changeTrackingOptions'] = change_tracking_options if isinstance(change_tracking_options, dict) else change_tracking_options.dict(exclude_none=True)
572
+ if max_age is not None:
573
+ scrape_params['maxAge'] = max_age
574
+ if store_in_cache is not None:
575
+ scrape_params['storeInCache'] = store_in_cache
576
+ if zero_data_retention is not None:
577
+ scrape_params['zeroDataRetention'] = zero_data_retention
578
+
579
+ scrape_params.update(kwargs)
580
+
581
+ if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
582
+ scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
583
+ if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
584
+ scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
585
+
586
+ # Make request
587
+ response = requests.post(
588
+ f'{self.api_url}/v1/scrape',
589
+ headers=headers,
590
+ json=scrape_params,
591
+ timeout=(timeout + 5000 if timeout else None)
592
+ )
593
+
594
+ if response.status_code == 200:
595
+ try:
596
+ response_json = response.json()
597
+ if response_json.get('success') and 'data' in response_json:
598
+ return ScrapeResponse(**response_json['data'])
599
+ elif "error" in response_json:
600
+ raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
601
+ else:
602
+ raise Exception(f'Failed to scrape URL. Error: {response_json}')
603
+ except ValueError:
604
+ raise Exception('Failed to parse Firecrawl response as JSON.')
605
+ else:
606
+ self._handle_error(response, 'scrape URL')
607
+
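# Illustrative usage sketch for scrape_url (not part of the package source); it
# mirrors the keyword-only signature above and reads the API key from the
# FIRECRAWL_API_KEY environment variable.
from firecrawl import FirecrawlApp

app = FirecrawlApp()
doc = app.scrape_url(
    "https://example.com",
    formats=["markdown", "links"],
    only_main_content=True,
    timeout=30000,  # milliseconds, per the docstring above
)
print(doc.markdown)
print(doc.links)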
608
+ def search(
609
+ self,
610
+ query: str,
611
+ *,
612
+ limit: Optional[int] = None,
613
+ tbs: Optional[str] = None,
614
+ filter: Optional[str] = None,
615
+ lang: Optional[str] = None,
616
+ country: Optional[str] = None,
617
+ location: Optional[str] = None,
618
+ timeout: Optional[int] = None,
619
+ scrape_options: Optional[ScrapeOptions] = None,
620
+ **kwargs) -> SearchResponse:
621
+ """
622
+ Search for content using Firecrawl.
623
+
624
+ Args:
625
+ query (str): Search query string
626
+ limit (Optional[int]): Max results (default: 5)
627
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
628
+ filter (Optional[str]): Custom result filter
629
+ lang (Optional[str]): Language code (default: "en")
630
+ country (Optional[str]): Country code (default: "us")
631
+ location (Optional[str]): Geo-targeting
632
+ timeout (Optional[int]): Request timeout in milliseconds
633
+ scrape_options (Optional[ScrapeOptions]): Result scraping configuration
634
+ **kwargs: Additional keyword arguments for future compatibility
635
+
636
+ Returns:
637
+ SearchResponse: Response containing:
638
+ * success (bool): Whether request succeeded
639
+ * data (List[FirecrawlDocument]): Search results
640
+ * warning (Optional[str]): Warning message if any
641
+ * error (Optional[str]): Error message if any
642
+
643
+ Raises:
644
+ Exception: If search fails or response cannot be parsed
645
+ """
646
+ # Validate any additional kwargs
647
+ self._validate_kwargs(kwargs, "search")
648
+
649
+ # Build search parameters
650
+ search_params = {}
651
+
652
+ # Add individual parameters
653
+ if limit is not None:
654
+ search_params['limit'] = limit
655
+ if tbs is not None:
656
+ search_params['tbs'] = tbs
657
+ if filter is not None:
658
+ search_params['filter'] = filter
659
+ if lang is not None:
660
+ search_params['lang'] = lang
661
+ if country is not None:
662
+ search_params['country'] = country
663
+ if location is not None:
664
+ search_params['location'] = location
665
+ if timeout is not None:
666
+ search_params['timeout'] = timeout
667
+ if scrape_options is not None:
668
+ search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
669
+
670
+ # Add any additional kwargs
671
+ search_params.update(kwargs)
672
+ _integration = search_params.get('integration')
673
+
674
+ # Create final params object
675
+ final_params = SearchParams(query=query, **search_params)
676
+ params_dict = final_params.dict(exclude_none=True)
677
+ params_dict['origin'] = f"python-sdk@{version}"
678
+
679
+ if _integration:
680
+ params_dict['integration'] = _integration
681
+
682
+ # Make request
683
+ response = requests.post(
684
+ f"{self.api_url}/v1/search",
685
+ headers={"Authorization": f"Bearer {self.api_key}"},
686
+ json=params_dict
687
+ )
688
+
689
+ if response.status_code == 200:
690
+ try:
691
+ response_json = response.json()
692
+ if response_json.get('success') and 'data' in response_json:
693
+ return SearchResponse(**response_json)
694
+ elif "error" in response_json:
695
+ raise Exception(f'Search failed. Error: {response_json["error"]}')
696
+ else:
697
+ raise Exception(f'Search failed. Error: {response_json}')
698
+ except ValueError:
699
+ raise Exception('Failed to parse Firecrawl response as JSON.')
700
+ else:
701
+ self._handle_error(response, 'search')
702
+
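# Illustrative usage sketch for search (not part of the package source). In this
# version of the SDK the returned data entries are plain dicts.
from firecrawl import FirecrawlApp

app = FirecrawlApp()
results = app.search("firecrawl python sdk", limit=3, tbs="qdr:w")
for hit in results.data:
    print(hit.get("url"), "-", hit.get("title"))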
703
+ def crawl_url(
704
+ self,
705
+ url: str,
706
+ *,
707
+ include_paths: Optional[List[str]] = None,
708
+ exclude_paths: Optional[List[str]] = None,
709
+ max_depth: Optional[int] = None,
710
+ max_discovery_depth: Optional[int] = None,
711
+ limit: Optional[int] = None,
712
+ allow_backward_links: Optional[bool] = None,
713
+ crawl_entire_domain: Optional[bool] = None,
714
+ allow_external_links: Optional[bool] = None,
715
+ ignore_sitemap: Optional[bool] = None,
716
+ scrape_options: Optional[ScrapeOptions] = None,
717
+ webhook: Optional[Union[str, WebhookConfig]] = None,
718
+ deduplicate_similar_urls: Optional[bool] = None,
719
+ ignore_query_parameters: Optional[bool] = None,
720
+ regex_on_full_url: Optional[bool] = None,
721
+ delay: Optional[int] = None,
722
+ allow_subdomains: Optional[bool] = None,
723
+ max_concurrency: Optional[int] = None,
724
+ zero_data_retention: Optional[bool] = None,
725
+ poll_interval: Optional[int] = 2,
726
+ idempotency_key: Optional[str] = None,
727
+ **kwargs
728
+ ) -> CrawlStatusResponse:
729
+ """
730
+ Crawl a website starting from a URL.
731
+
732
+ Args:
733
+ url (str): Target URL to start crawling from
734
+ include_paths (Optional[List[str]]): Patterns of URLs to include
735
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
736
+ max_depth (Optional[int]): Maximum crawl depth
737
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
738
+ limit (Optional[int]): Maximum pages to crawl
739
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
740
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
741
+ allow_external_links (Optional[bool]): Follow external domain links
742
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
743
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
744
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
745
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
746
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
747
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
748
+ delay (Optional[int]): Delay in seconds between scrapes
749
+ allow_subdomains (Optional[bool]): Follow subdomains
750
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
751
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
752
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
753
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
754
+ **kwargs: Additional parameters to pass to the API
755
+
756
+ Returns:
757
+ CrawlStatusResponse with:
758
+ * Crawling status and progress
759
+ * Crawled page contents
760
+ * Success/error information
761
+
762
+ Raises:
763
+ Exception: If crawl fails
764
+ """
765
+ # Validate any additional kwargs
766
+ self._validate_kwargs(kwargs, "crawl_url")
767
+
768
+ crawl_params = {}
769
+
770
+ # Add individual parameters
771
+ if include_paths is not None:
772
+ crawl_params['includePaths'] = include_paths
773
+ if exclude_paths is not None:
774
+ crawl_params['excludePaths'] = exclude_paths
775
+ if max_depth is not None:
776
+ crawl_params['maxDepth'] = max_depth
777
+ if max_discovery_depth is not None:
778
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
779
+ if limit is not None:
780
+ crawl_params['limit'] = limit
781
+ if crawl_entire_domain is not None:
782
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
783
+ elif allow_backward_links is not None:
784
+ crawl_params['allowBackwardLinks'] = allow_backward_links
785
+ if allow_external_links is not None:
786
+ crawl_params['allowExternalLinks'] = allow_external_links
787
+ if ignore_sitemap is not None:
788
+ crawl_params['ignoreSitemap'] = ignore_sitemap
789
+ if scrape_options is not None:
790
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
791
+ if webhook is not None:
792
+ crawl_params['webhook'] = webhook
793
+ if deduplicate_similar_urls is not None:
794
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
795
+ if ignore_query_parameters is not None:
796
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
797
+ if regex_on_full_url is not None:
798
+ crawl_params['regexOnFullURL'] = regex_on_full_url
799
+ if delay is not None:
800
+ crawl_params['delay'] = delay
801
+ if allow_subdomains is not None:
802
+ crawl_params['allowSubdomains'] = allow_subdomains
803
+ if max_concurrency is not None:
804
+ crawl_params['maxConcurrency'] = max_concurrency
805
+ if zero_data_retention is not None:
806
+ crawl_params['zeroDataRetention'] = zero_data_retention
807
+ # Add any additional kwargs
808
+ crawl_params.update(kwargs)
809
+ _integration = crawl_params.get('integration')
810
+
811
+ # Create final params object
812
+ final_params = CrawlParams(**crawl_params)
813
+ params_dict = final_params.dict(exclude_none=True)
814
+ params_dict['url'] = url
815
+ params_dict['origin'] = f"python-sdk@{version}"
816
+
817
+ if _integration:
818
+ params_dict['integration'] = _integration
819
+
820
+ # Make request
821
+ headers = self._prepare_headers(idempotency_key)
822
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
823
+
824
+ if response.status_code == 200:
825
+ try:
826
+ id = response.json().get('id')
827
+ except:
828
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
829
+ return self._monitor_job_status(id, headers, poll_interval)
830
+ else:
831
+ self._handle_error(response, 'start crawl job')
832
+
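# Illustrative usage sketch for crawl_url (not part of the package source). The
# call blocks and polls until the crawl finishes. ScrapeOptions is assumed to be
# re-exported at the package root; otherwise import it from this module.
from firecrawl import FirecrawlApp, ScrapeOptions

app = FirecrawlApp()
status = app.crawl_url(
    "https://docs.firecrawl.dev",
    limit=10,
    max_depth=2,
    scrape_options=ScrapeOptions(formats=["markdown"], onlyMainContent=True),
    poll_interval=5,
)
print(status.status, f"{status.completed}/{status.total} pages")
for page in status.data:
    print((page.markdown or "")[:80])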
833
+ def async_crawl_url(
834
+ self,
835
+ url: str,
836
+ *,
837
+ include_paths: Optional[List[str]] = None,
838
+ exclude_paths: Optional[List[str]] = None,
839
+ max_depth: Optional[int] = None,
840
+ max_discovery_depth: Optional[int] = None,
841
+ limit: Optional[int] = None,
842
+ allow_backward_links: Optional[bool] = None,
843
+ crawl_entire_domain: Optional[bool] = None,
844
+ allow_external_links: Optional[bool] = None,
845
+ ignore_sitemap: Optional[bool] = None,
846
+ scrape_options: Optional[ScrapeOptions] = None,
847
+ webhook: Optional[Union[str, WebhookConfig]] = None,
848
+ deduplicate_similar_urls: Optional[bool] = None,
849
+ ignore_query_parameters: Optional[bool] = None,
850
+ regex_on_full_url: Optional[bool] = None,
851
+ delay: Optional[int] = None,
852
+ allow_subdomains: Optional[bool] = None,
853
+ max_concurrency: Optional[int] = None,
854
+ zero_data_retention: Optional[bool] = None,
855
+ idempotency_key: Optional[str] = None,
856
+ **kwargs
857
+ ) -> CrawlResponse:
858
+ """
859
+ Start an asynchronous crawl job.
860
+
861
+ Args:
862
+ url (str): Target URL to start crawling from
863
+ include_paths (Optional[List[str]]): Patterns of URLs to include
864
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
865
+ max_depth (Optional[int]): Maximum crawl depth
866
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
867
+ limit (Optional[int]): Maximum pages to crawl
868
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
869
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
870
+ allow_external_links (Optional[bool]): Follow external domain links
871
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
872
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
873
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
874
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
875
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
876
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
877
+ delay (Optional[int]): Delay in seconds between scrapes
878
+ allow_subdomains (Optional[bool]): Follow subdomains
879
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
880
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
881
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
882
+ **kwargs: Additional parameters to pass to the API
883
+
884
+ Returns:
885
+ CrawlResponse with:
886
+ * success - Whether crawl started successfully
887
+ * id - Unique identifier for the crawl job
888
+ * url - Status check URL for the crawl
889
+ * error - Error message if start failed
890
+
891
+ Raises:
892
+ Exception: If crawl initiation fails
893
+ """
894
+ # Validate any additional kwargs
895
+ self._validate_kwargs(kwargs, "async_crawl_url")
896
+
897
+ crawl_params = {}
898
+
899
+ # Add individual parameters
900
+ if include_paths is not None:
901
+ crawl_params['includePaths'] = include_paths
902
+ if exclude_paths is not None:
903
+ crawl_params['excludePaths'] = exclude_paths
904
+ if max_depth is not None:
905
+ crawl_params['maxDepth'] = max_depth
906
+ if max_discovery_depth is not None:
907
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
908
+ if limit is not None:
909
+ crawl_params['limit'] = limit
910
+ if crawl_entire_domain is not None:
911
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
912
+ elif allow_backward_links is not None:
913
+ crawl_params['allowBackwardLinks'] = allow_backward_links
914
+ if allow_external_links is not None:
915
+ crawl_params['allowExternalLinks'] = allow_external_links
916
+ if ignore_sitemap is not None:
917
+ crawl_params['ignoreSitemap'] = ignore_sitemap
918
+ if scrape_options is not None:
919
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
920
+ if webhook is not None:
921
+ crawl_params['webhook'] = webhook
922
+ if deduplicate_similar_urls is not None:
923
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
924
+ if ignore_query_parameters is not None:
925
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
926
+ if regex_on_full_url is not None:
927
+ crawl_params['regexOnFullURL'] = regex_on_full_url
928
+ if delay is not None:
929
+ crawl_params['delay'] = delay
930
+ if allow_subdomains is not None:
931
+ crawl_params['allowSubdomains'] = allow_subdomains
932
+ if max_concurrency is not None:
933
+ crawl_params['maxConcurrency'] = max_concurrency
934
+ if zero_data_retention is not None:
935
+ crawl_params['zeroDataRetention'] = zero_data_retention
936
+ # Add any additional kwargs
937
+ crawl_params.update(kwargs)
938
+
939
+ # Create final params object
940
+ final_params = CrawlParams(**crawl_params)
941
+ params_dict = final_params.dict(exclude_none=True)
942
+ params_dict['url'] = url
943
+ params_dict['origin'] = f"python-sdk@{version}"
944
+
945
+ # Make request
946
+ headers = self._prepare_headers(idempotency_key)
947
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
948
+
949
+ if response.status_code == 200:
950
+ try:
951
+ return CrawlResponse(**response.json())
952
+ except:
953
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
954
+ else:
955
+ self._handle_error(response, 'start crawl job')
956
+
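# Illustrative usage sketch for the non-blocking flow (not part of the package
# source): start a job with async_crawl_url, then poll it with check_crawl_status
# defined just below.
import time
from firecrawl import FirecrawlApp

app = FirecrawlApp()
job = app.async_crawl_url("https://example.com", limit=5)
print("job id:", job.id, "status url:", job.url)

while True:
    status = app.check_crawl_status(job.id)
    if status.status in ("completed", "failed", "cancelled"):
        break
    time.sleep(2)
print(status.status, len(status.data), "documents")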
957
+ def check_crawl_status(self, id: str) -> CrawlStatusResponse:
958
+ """
959
+ Check the status and results of a crawl job.
960
+
961
+ Args:
962
+ id: Unique identifier for the crawl job
963
+
964
+ Returns:
965
+ CrawlStatusResponse containing:
966
+
967
+ Status Information:
968
+ * status - Current state (scraping/completed/failed/cancelled)
969
+ * completed - Number of pages crawled
970
+ * total - Total pages to crawl
971
+ * creditsUsed - API credits consumed
972
+ * expiresAt - Data expiration timestamp
973
+
974
+ Results:
975
+ * data - List of crawled documents
976
+ * next - URL for next page of results (if paginated)
977
+ * success - Whether status check succeeded
978
+ * error - Error message if failed
979
+
980
+ Raises:
981
+ Exception: If status check fails
982
+ """
983
+ endpoint = f'/v1/crawl/{id}'
984
+
985
+ headers = self._prepare_headers()
986
+ response = self._get_request(f'{self.api_url}{endpoint}', headers)
987
+ if response.status_code == 200:
988
+ try:
989
+ status_data = response.json()
990
+ except:
991
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
992
+ if status_data['status'] == 'completed':
993
+ if 'data' in status_data:
994
+ data = status_data['data']
995
+ while 'next' in status_data:
996
+ if len(status_data['data']) == 0:
997
+ break
998
+ next_url = status_data.get('next')
999
+ if not next_url:
1000
+ logger.warning("Expected 'next' URL is missing.")
1001
+ break
1002
+ try:
1003
+ status_response = self._get_request(next_url, headers)
1004
+ if status_response.status_code != 200:
1005
+ logger.error(f"Failed to fetch next page: {status_response.status_code}")
1006
+ break
1007
+ try:
1008
+ next_data = status_response.json()
1009
+ except:
1010
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1011
+ data.extend(next_data.get('data', []))
1012
+ status_data = next_data
1013
+ except Exception as e:
1014
+ logger.error(f"Error during pagination request: {e}")
1015
+ break
1016
+ status_data['data'] = data
1017
+
1018
+ response = {
1019
+ 'status': status_data.get('status'),
1020
+ 'total': status_data.get('total'),
1021
+ 'completed': status_data.get('completed'),
1022
+ 'creditsUsed': status_data.get('creditsUsed'),
1023
+ 'expiresAt': status_data.get('expiresAt'),
1024
+ 'data': status_data.get('data')
1025
+ }
1026
+
1027
+ if 'error' in status_data:
1028
+ response['error'] = status_data['error']
1029
+
1030
+ if 'next' in status_data:
1031
+ response['next'] = status_data['next']
1032
+
1033
+ return CrawlStatusResponse(
1034
+ success=False if 'error' in status_data else True,
1035
+ **response
1036
+ )
1037
+ else:
1038
+ self._handle_error(response, 'check crawl status')
1039
+
1040
+ def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
1041
+ """
1042
+ Returns information about crawl errors.
1043
+
1044
+ Args:
1045
+ id (str): The ID of the crawl job
1046
+
1047
+ Returns:
1048
+ CrawlErrorsResponse containing:
1049
+ * errors (List[Dict[str, str]]): List of errors with fields:
1050
+ - id (str): Error ID
1051
+ - timestamp (str): When the error occurred
1052
+ - url (str): URL that caused the error
1053
+ - error (str): Error message
1054
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1055
+
1056
+ Raises:
1057
+ Exception: If error check fails
1058
+ """
1059
+ headers = self._prepare_headers()
1060
+ response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
1061
+ if response.status_code == 200:
1062
+ try:
1063
+ return CrawlErrorsResponse(**response.json())
1064
+ except:
1065
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1066
+ else:
1067
+ self._handle_error(response, "check crawl errors")
1068
+
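# Illustrative usage sketch (not part of the package source): inspect per-URL
# failures and robots.txt blocks. The crawl id is a hypothetical placeholder
# obtained from async_crawl_url.
from firecrawl import FirecrawlApp

app = FirecrawlApp()
report = app.check_crawl_errors("your-crawl-id")
for err in report.errors:
    print(err["url"], "->", err["error"])
print("Blocked by robots.txt:", report.robotsBlocked)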
1069
+ def cancel_crawl(self, id: str) -> Dict[str, Any]:
1070
+ """
1071
+ Cancel an asynchronous crawl job.
1072
+
1073
+ Args:
1074
+ id (str): The ID of the crawl job to cancel
1075
+
1076
+ Returns:
1077
+ Dict[str, Any] containing:
1078
+ * success (bool): Whether cancellation was successful
1079
+ * error (str, optional): Error message if cancellation failed
1080
+
1081
+ Raises:
1082
+ Exception: If cancellation fails
1083
+ """
1084
+ headers = self._prepare_headers()
1085
+ response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
1086
+ if response.status_code == 200:
1087
+ try:
1088
+ return response.json()
1089
+ except:
1090
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1091
+ else:
1092
+ self._handle_error(response, "cancel crawl job")
1093
+
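# Illustrative usage sketch (not part of the package source): cancel a running
# crawl. The id is a hypothetical placeholder from async_crawl_url.
from firecrawl import FirecrawlApp

app = FirecrawlApp()
result = app.cancel_crawl("your-crawl-id")
print(result.get("success"))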
1094
+ def crawl_url_and_watch(
1095
+ self,
1096
+ url: str,
1097
+ *,
1098
+ include_paths: Optional[List[str]] = None,
1099
+ exclude_paths: Optional[List[str]] = None,
1100
+ max_depth: Optional[int] = None,
1101
+ max_discovery_depth: Optional[int] = None,
1102
+ limit: Optional[int] = None,
1103
+ allow_backward_links: Optional[bool] = None,
1104
+ crawl_entire_domain: Optional[bool] = None,
1105
+ allow_external_links: Optional[bool] = None,
1106
+ ignore_sitemap: Optional[bool] = None,
1107
+ scrape_options: Optional[ScrapeOptions] = None,
1108
+ webhook: Optional[Union[str, WebhookConfig]] = None,
1109
+ deduplicate_similar_urls: Optional[bool] = None,
1110
+ ignore_query_parameters: Optional[bool] = None,
1111
+ regex_on_full_url: Optional[bool] = None,
1112
+ delay: Optional[int] = None,
1113
+ allow_subdomains: Optional[bool] = None,
1114
+ max_concurrency: Optional[int] = None,
1115
+ zero_data_retention: Optional[bool] = None,
1116
+ idempotency_key: Optional[str] = None,
1117
+ **kwargs
1118
+ ) -> 'CrawlWatcher':
1119
+ """
1120
+ Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
1121
+
1122
+ Args:
1123
+ url (str): Target URL to start crawling from
1124
+ include_paths (Optional[List[str]]): Patterns of URLs to include
1125
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
1126
+ max_depth (Optional[int]): Maximum crawl depth
1127
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
1128
+ limit (Optional[int]): Maximum pages to crawl
1129
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
1130
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
1131
+ allow_external_links (Optional[bool]): Follow external domain links
1132
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1133
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
1134
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
1135
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
1136
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
1137
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
1138
+ delay (Optional[int]): Delay in seconds between scrapes
1139
+ allow_subdomains (Optional[bool]): Follow subdomains
1140
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1141
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
1142
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1143
+ **kwargs: Additional parameters to pass to the API
1144
+
1145
+ Returns:
1146
+ CrawlWatcher: An instance to monitor the crawl job via WebSocket
1147
+
1148
+ Raises:
1149
+ Exception: If crawl job fails to start
1150
+ """
1151
+ crawl_response = self.async_crawl_url(
1152
+ url,
1153
+ include_paths=include_paths,
1154
+ exclude_paths=exclude_paths,
1155
+ max_depth=max_depth,
1156
+ max_discovery_depth=max_discovery_depth,
1157
+ limit=limit,
1158
+ allow_backward_links=allow_backward_links,
1159
+ allow_external_links=allow_external_links,
1160
+ ignore_sitemap=ignore_sitemap,
1161
+ scrape_options=scrape_options,
1162
+ webhook=webhook,
1163
+ deduplicate_similar_urls=deduplicate_similar_urls,
1164
+ ignore_query_parameters=ignore_query_parameters,
1165
+ regex_on_full_url=regex_on_full_url,
1166
+ delay=delay,
1167
+ allow_subdomains=allow_subdomains,
1168
+ max_concurrency=max_concurrency,
1169
+ zero_data_retention=zero_data_retention,
1170
+ idempotency_key=idempotency_key,
1171
+ **kwargs
1172
+ )
1173
+ if crawl_response.success and crawl_response.id:
1174
+ return CrawlWatcher(crawl_response.id, self)
1175
+ else:
1176
+ raise Exception("Crawl job failed to start")
1177
+
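# Illustrative usage sketch (not part of the package source): obtain a watcher for
# a crawl. CrawlWatcher itself is defined elsewhere in this module and its event
# interface is not shown in this portion of the diff.
from firecrawl import FirecrawlApp

app = FirecrawlApp()
watcher = app.crawl_url_and_watch("https://example.com", limit=5)
print("watching crawl:", watcher)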
1178
+ def map_url(
1179
+ self,
1180
+ url: str,
1181
+ *,
1182
+ search: Optional[str] = None,
1183
+ ignore_sitemap: Optional[bool] = None,
1184
+ include_subdomains: Optional[bool] = None,
1185
+ sitemap_only: Optional[bool] = None,
1186
+ limit: Optional[int] = None,
1187
+ timeout: Optional[int] = None,
1188
+ use_index: Optional[bool] = None,
1189
+ **kwargs) -> MapResponse:
1190
+ """
1191
+ Map and discover links from a URL.
1192
+
1193
+ Args:
1194
+ url (str): Target URL to map
1195
+ search (Optional[str]): Filter pattern for URLs
1196
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1197
+ include_subdomains (Optional[bool]): Include subdomain links
1198
+ sitemap_only (Optional[bool]): Only use sitemap.xml
1199
+ limit (Optional[int]): Maximum URLs to return
1200
+ timeout (Optional[int]): Request timeout in milliseconds
1201
+ **kwargs: Additional parameters to pass to the API
1202
+
1203
+ Returns:
1204
+ MapResponse: Response containing:
1205
+ * success (bool): Whether request succeeded
1206
+ * links (List[str]): Discovered URLs
1207
+ * error (Optional[str]): Error message if any
1208
+
1209
+ Raises:
1210
+ Exception: If mapping fails or response cannot be parsed
1211
+ """
1212
+ # Validate any additional kwargs
1213
+ self._validate_kwargs(kwargs, "map_url")
1214
+
1215
+ # Build map parameters
1216
+ map_params = {}
1217
+
1218
+ # Add individual parameters
1219
+ if search is not None:
1220
+ map_params['search'] = search
1221
+ if ignore_sitemap is not None:
1222
+ map_params['ignoreSitemap'] = ignore_sitemap
1223
+ if include_subdomains is not None:
1224
+ map_params['includeSubdomains'] = include_subdomains
1225
+ if sitemap_only is not None:
1226
+ map_params['sitemapOnly'] = sitemap_only
1227
+ if limit is not None:
1228
+ map_params['limit'] = limit
1229
+ if timeout is not None:
1230
+ map_params['timeout'] = timeout
1231
+ if use_index is not None:
1232
+ map_params['useIndex'] = use_index
1233
+
1234
+ # Add any additional kwargs
1235
+ map_params.update(kwargs)
1236
+ _integration = map_params.get('integration')
1237
+
1238
+ # Create final params object
1239
+ final_params = MapParams(**map_params)
1240
+ params_dict = final_params.dict(exclude_none=True)
1241
+ params_dict['url'] = url
1242
+ params_dict['origin'] = f"python-sdk@{version}"
1243
+
1244
+ if _integration:
1245
+ params_dict['integration'] = _integration
1246
+
1247
+ # Make request
1248
+ response = requests.post(
1249
+ f"{self.api_url}/v1/map",
1250
+ headers={"Authorization": f"Bearer {self.api_key}"},
1251
+ json=params_dict
1252
+ )
1253
+
1254
+ if response.status_code == 200:
1255
+ try:
1256
+ response_json = response.json()
1257
+ if response_json.get('success') and 'links' in response_json:
1258
+ return MapResponse(**response_json)
1259
+ elif "error" in response_json:
1260
+ raise Exception(f'Map failed. Error: {response_json["error"]}')
1261
+ else:
1262
+ raise Exception(f'Map failed. Error: {response_json}')
1263
+ except ValueError:
1264
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1265
+ else:
1266
+ self._handle_error(response, 'map')
1267
+
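# Illustrative usage sketch for map_url (not part of the package source): discover
# links that match a search filter without scraping their content.
from firecrawl import FirecrawlApp

app = FirecrawlApp()
mapped = app.map_url("https://docs.firecrawl.dev", search="sdk", limit=50)
for link in mapped.links or []:
    print(link)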
1268
+ def batch_scrape_urls(
1269
+ self,
1270
+ urls: List[str],
1271
+ *,
1272
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1273
+ headers: Optional[Dict[str, str]] = None,
1274
+ include_tags: Optional[List[str]] = None,
1275
+ exclude_tags: Optional[List[str]] = None,
1276
+ only_main_content: Optional[bool] = None,
1277
+ wait_for: Optional[int] = None,
1278
+ timeout: Optional[int] = None,
1279
+ location: Optional[LocationConfig] = None,
1280
+ mobile: Optional[bool] = None,
1281
+ skip_tls_verification: Optional[bool] = None,
1282
+ remove_base64_images: Optional[bool] = None,
1283
+ block_ads: Optional[bool] = None,
1284
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1285
+ extract: Optional[JsonConfig] = None,
1286
+ json_options: Optional[JsonConfig] = None,
1287
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1288
+ agent: Optional[AgentOptions] = None,
1289
+ poll_interval: Optional[int] = 2,
1290
+ max_concurrency: Optional[int] = None,
1291
+ zero_data_retention: Optional[bool] = None,
1292
+ idempotency_key: Optional[str] = None,
1293
+ **kwargs
1294
+ ) -> BatchScrapeStatusResponse:
1295
+ """
1296
+ Batch scrape multiple URLs and monitor until completion.
1297
+
1298
+ Args:
1299
+ urls (List[str]): URLs to scrape
1300
+ formats (Optional[List[Literal]]): Content formats to retrieve
1301
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1302
+ include_tags (Optional[List[str]]): HTML tags to include
1303
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1304
+ only_main_content (Optional[bool]): Extract main content only
1305
+ wait_for (Optional[int]): Wait time in milliseconds
1306
+ timeout (Optional[int]): Request timeout in milliseconds
1307
+ location (Optional[LocationConfig]): Location configuration
1308
+ mobile (Optional[bool]): Use mobile user agent
1309
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1310
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1311
+ block_ads (Optional[bool]): Block advertisements
1312
+ proxy (Optional[Literal]): Proxy type to use
1313
+ extract (Optional[JsonConfig]): Content extraction config
1314
+ json_options (Optional[JsonConfig]): JSON extraction config
1315
+ actions (Optional[List[Union]]): Actions to perform
1316
+ agent (Optional[AgentOptions]): Agent configuration
1317
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1318
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
1319
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1320
+ **kwargs: Additional parameters to pass to the API
1321
+
1322
+ Returns:
1323
+ BatchScrapeStatusResponse with:
1324
+ * Scraping status and progress
1325
+ * Scraped content for each URL
1326
+ * Success/error information
1327
+
1328
+ Raises:
1329
+ Exception: If batch scrape fails
1330
+ """
1331
+ # Validate any additional kwargs
1332
+ self._validate_kwargs(kwargs, "batch_scrape_urls")
1333
+
1334
+ scrape_params = {}
1335
+
1336
+ # Add individual parameters
1337
+ if formats is not None:
1338
+ scrape_params['formats'] = formats
1339
+ if headers is not None:
1340
+ scrape_params['headers'] = headers
1341
+ if include_tags is not None:
1342
+ scrape_params['includeTags'] = include_tags
1343
+ if exclude_tags is not None:
1344
+ scrape_params['excludeTags'] = exclude_tags
1345
+ if only_main_content is not None:
1346
+ scrape_params['onlyMainContent'] = only_main_content
1347
+ if wait_for is not None:
1348
+ scrape_params['waitFor'] = wait_for
1349
+ if timeout is not None:
1350
+ scrape_params['timeout'] = timeout
1351
+ if location is not None:
1352
+ scrape_params['location'] = location.dict(exclude_none=True)
1353
+ if mobile is not None:
1354
+ scrape_params['mobile'] = mobile
1355
+ if skip_tls_verification is not None:
1356
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1357
+ if remove_base64_images is not None:
1358
+ scrape_params['removeBase64Images'] = remove_base64_images
1359
+ if block_ads is not None:
1360
+ scrape_params['blockAds'] = block_ads
1361
+ if proxy is not None:
1362
+ scrape_params['proxy'] = proxy
1363
+ if extract is not None:
1364
+ extract = self._ensure_schema_dict(extract)
1365
+ if isinstance(extract, dict) and "schema" in extract:
1366
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1367
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1368
+ if json_options is not None:
1369
+ json_options = self._ensure_schema_dict(json_options)
1370
+ if isinstance(json_options, dict) and "schema" in json_options:
1371
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1372
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1373
+ if actions is not None:
1374
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1375
+ if agent is not None:
1376
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1377
+ if max_concurrency is not None:
1378
+ scrape_params['maxConcurrency'] = max_concurrency
1379
+ if zero_data_retention is not None:
1380
+ scrape_params['zeroDataRetention'] = zero_data_retention
1381
+
1382
+ # Add any additional kwargs
1383
+ scrape_params.update(kwargs)
1384
+
1385
+ # Create final params object
1386
+ final_params = ScrapeParams(**scrape_params)
1387
+ params_dict = final_params.dict(exclude_none=True)
1388
+ params_dict['urls'] = urls
1389
+ params_dict['origin'] = f"python-sdk@{version}"
1390
+
1391
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1392
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1393
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1394
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1395
+
1396
+ # Make request
1397
+ headers = self._prepare_headers(idempotency_key)
1398
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1399
+
1400
+ if response.status_code == 200:
1401
+ try:
1402
+ id = response.json().get('id')
1403
+ except:
1404
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1405
+ return self._monitor_job_status(id, headers, poll_interval)
1406
+ else:
1407
+ self._handle_error(response, 'start batch scrape job')
1408
+
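+ # Illustrative usage of batch_scrape_urls (a sketch, assuming a client built
+ # with a valid API key; the call blocks and polls until the batch job finishes):
+ #
+ #     app = FirecrawlApp(api_key="fc-YOUR-KEY")
+ #     result = app.batch_scrape_urls(
+ #         ["https://example.com", "https://firecrawl.dev"],
+ #         formats=["markdown"],
+ #         only_main_content=True,
+ #         poll_interval=5,
+ #     )
+ #     print(result.status, result.completed, "of", result.total)
+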
1409
+ def async_batch_scrape_urls(
1410
+ self,
1411
+ urls: List[str],
1412
+ *,
1413
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1414
+ headers: Optional[Dict[str, str]] = None,
1415
+ include_tags: Optional[List[str]] = None,
1416
+ exclude_tags: Optional[List[str]] = None,
1417
+ only_main_content: Optional[bool] = None,
1418
+ wait_for: Optional[int] = None,
1419
+ timeout: Optional[int] = None,
1420
+ location: Optional[LocationConfig] = None,
1421
+ mobile: Optional[bool] = None,
1422
+ skip_tls_verification: Optional[bool] = None,
1423
+ remove_base64_images: Optional[bool] = None,
1424
+ block_ads: Optional[bool] = None,
1425
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1426
+ extract: Optional[JsonConfig] = None,
1427
+ json_options: Optional[JsonConfig] = None,
1428
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1429
+ agent: Optional[AgentOptions] = None,
1430
+ max_concurrency: Optional[int] = None,
1431
+ idempotency_key: Optional[str] = None,
1432
+ zero_data_retention: Optional[bool] = None,
1433
+ **kwargs
1434
+ ) -> BatchScrapeResponse:
1435
+ """
1436
+ Initiate a batch scrape job asynchronously.
1437
+
1438
+ Args:
1439
+ urls (List[str]): URLs to scrape
1440
+ formats (Optional[List[Literal]]): Content formats to retrieve
1441
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1442
+ include_tags (Optional[List[str]]): HTML tags to include
1443
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1444
+ only_main_content (Optional[bool]): Extract main content only
1445
+ wait_for (Optional[int]): Wait time in milliseconds
1446
+ timeout (Optional[int]): Request timeout in milliseconds
1447
+ location (Optional[LocationConfig]): Location configuration
1448
+ mobile (Optional[bool]): Use mobile user agent
1449
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1450
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1451
+ block_ads (Optional[bool]): Block advertisements
1452
+ proxy (Optional[Literal]): Proxy type to use
1453
+ extract (Optional[JsonConfig]): Content extraction config
1454
+ json_options (Optional[JsonConfig]): JSON extraction config
1455
+ actions (Optional[List[Union]]): Actions to perform
1456
+ agent (Optional[AgentOptions]): Agent configuration
1457
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1458
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
1459
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1460
+ **kwargs: Additional parameters to pass to the API
1461
+
1462
+ Returns:
1463
+ BatchScrapeResponse with:
1464
+ * success - Whether job started successfully
1465
+ * id - Unique identifier for the job
1466
+ * url - Status check URL
1467
+ * error - Error message if start failed
1468
+
1469
+ Raises:
1470
+ Exception: If job initiation fails
1471
+ """
1472
+ # Validate any additional kwargs
1473
+ self._validate_kwargs(kwargs, "async_batch_scrape_urls")
1474
+
1475
+ scrape_params = {}
1476
+
1477
+ # Add individual parameters
1478
+ if formats is not None:
1479
+ scrape_params['formats'] = formats
1480
+ if headers is not None:
1481
+ scrape_params['headers'] = headers
1482
+ if include_tags is not None:
1483
+ scrape_params['includeTags'] = include_tags
1484
+ if exclude_tags is not None:
1485
+ scrape_params['excludeTags'] = exclude_tags
1486
+ if only_main_content is not None:
1487
+ scrape_params['onlyMainContent'] = only_main_content
1488
+ if wait_for is not None:
1489
+ scrape_params['waitFor'] = wait_for
1490
+ if timeout is not None:
1491
+ scrape_params['timeout'] = timeout
1492
+ if location is not None:
1493
+ scrape_params['location'] = location.dict(exclude_none=True)
1494
+ if mobile is not None:
1495
+ scrape_params['mobile'] = mobile
1496
+ if skip_tls_verification is not None:
1497
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1498
+ if remove_base64_images is not None:
1499
+ scrape_params['removeBase64Images'] = remove_base64_images
1500
+ if block_ads is not None:
1501
+ scrape_params['blockAds'] = block_ads
1502
+ if proxy is not None:
1503
+ scrape_params['proxy'] = proxy
1504
+ if extract is not None:
1505
+ extract = self._ensure_schema_dict(extract)
1506
+ if isinstance(extract, dict) and "schema" in extract:
1507
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1508
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1509
+ if json_options is not None:
1510
+ json_options = self._ensure_schema_dict(json_options)
1511
+ if isinstance(json_options, dict) and "schema" in json_options:
1512
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1513
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1514
+ if actions is not None:
1515
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1516
+ if agent is not None:
1517
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1518
+ if max_concurrency is not None:
1519
+ scrape_params['maxConcurrency'] = max_concurrency
1520
+ if zero_data_retention is not None:
1521
+ scrape_params['zeroDataRetention'] = zero_data_retention
1522
+
1523
+ # Add any additional kwargs
1524
+ scrape_params.update(kwargs)
1525
+
1526
+ # Create final params object
1527
+ final_params = ScrapeParams(**scrape_params)
1528
+ params_dict = final_params.dict(exclude_none=True)
1529
+ params_dict['urls'] = urls
1530
+ params_dict['origin'] = f"python-sdk@{version}"
1531
+
1532
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1533
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1534
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1535
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1536
+
1537
+ # Make request
1538
+ headers = self._prepare_headers(idempotency_key)
1539
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1540
+
1541
+ if response.status_code == 200:
1542
+ try:
1543
+ return BatchScrapeResponse(**response.json())
1544
+ except:
1545
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1546
+ else:
1547
+ self._handle_error(response, 'start batch scrape job')
1548
+
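+ # Illustrative usage of async_batch_scrape_urls (sketch; this only starts the
+ # job and returns immediately, it does not wait for the scrape results):
+ #
+ #     started = app.async_batch_scrape_urls(
+ #         ["https://example.com"],
+ #         formats=["markdown", "links"],
+ #     )
+ #     if started.success:
+ #         print("job id:", started.id, "status url:", started.url)
+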
1549
+ def batch_scrape_urls_and_watch(
1550
+ self,
1551
+ urls: List[str],
1552
+ *,
1553
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1554
+ headers: Optional[Dict[str, str]] = None,
1555
+ include_tags: Optional[List[str]] = None,
1556
+ exclude_tags: Optional[List[str]] = None,
1557
+ only_main_content: Optional[bool] = None,
1558
+ wait_for: Optional[int] = None,
1559
+ timeout: Optional[int] = None,
1560
+ location: Optional[LocationConfig] = None,
1561
+ mobile: Optional[bool] = None,
1562
+ skip_tls_verification: Optional[bool] = None,
1563
+ remove_base64_images: Optional[bool] = None,
1564
+ block_ads: Optional[bool] = None,
1565
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1566
+ extract: Optional[JsonConfig] = None,
1567
+ json_options: Optional[JsonConfig] = None,
1568
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1569
+ agent: Optional[AgentOptions] = None,
1570
+ max_concurrency: Optional[int] = None,
1571
+ zero_data_retention: Optional[bool] = None,
1572
+ idempotency_key: Optional[str] = None,
1573
+ **kwargs
1574
+ ) -> 'CrawlWatcher':
1575
+ """
1576
+ Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
1577
+
1578
+ Args:
1579
+ urls (List[str]): URLs to scrape
1580
+ formats (Optional[List[Literal]]): Content formats to retrieve
1581
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1582
+ include_tags (Optional[List[str]]): HTML tags to include
1583
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1584
+ only_main_content (Optional[bool]): Extract main content only
1585
+ wait_for (Optional[int]): Wait time in milliseconds
1586
+ timeout (Optional[int]): Request timeout in milliseconds
1587
+ location (Optional[LocationConfig]): Location configuration
1588
+ mobile (Optional[bool]): Use mobile user agent
1589
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1590
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1591
+ block_ads (Optional[bool]): Block advertisements
1592
+ proxy (Optional[Literal]): Proxy type to use
1593
+ extract (Optional[JsonConfig]): Content extraction config
1594
+ json_options (Optional[JsonConfig]): JSON extraction config
1595
+ actions (Optional[List[Union]]): Actions to perform
1596
+ agent (Optional[AgentOptions]): Agent configuration
1597
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1598
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
1599
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1600
+ **kwargs: Additional parameters to pass to the API
1601
+
1602
+ Returns:
1603
+ CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
1604
+
1605
+ Raises:
1606
+ Exception: If batch scrape job fails to start
1607
+ """
1608
+ # Validate any additional kwargs
1609
+ self._validate_kwargs(kwargs, "batch_scrape_urls_and_watch")
1610
+
1611
+ scrape_params = {}
1612
+
1613
+ # Add individual parameters
1614
+ if formats is not None:
1615
+ scrape_params['formats'] = formats
1616
+ if headers is not None:
1617
+ scrape_params['headers'] = headers
1618
+ if include_tags is not None:
1619
+ scrape_params['includeTags'] = include_tags
1620
+ if exclude_tags is not None:
1621
+ scrape_params['excludeTags'] = exclude_tags
1622
+ if only_main_content is not None:
1623
+ scrape_params['onlyMainContent'] = only_main_content
1624
+ if wait_for is not None:
1625
+ scrape_params['waitFor'] = wait_for
1626
+ if timeout is not None:
1627
+ scrape_params['timeout'] = timeout
1628
+ if location is not None:
1629
+ scrape_params['location'] = location.dict(exclude_none=True)
1630
+ if mobile is not None:
1631
+ scrape_params['mobile'] = mobile
1632
+ if skip_tls_verification is not None:
1633
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1634
+ if remove_base64_images is not None:
1635
+ scrape_params['removeBase64Images'] = remove_base64_images
1636
+ if block_ads is not None:
1637
+ scrape_params['blockAds'] = block_ads
1638
+ if proxy is not None:
1639
+ scrape_params['proxy'] = proxy
1640
+ if extract is not None:
1641
+ extract = self._ensure_schema_dict(extract)
1642
+ if isinstance(extract, dict) and "schema" in extract:
1643
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1644
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1645
+ if json_options is not None:
1646
+ json_options = self._ensure_schema_dict(json_options)
1647
+ if isinstance(json_options, dict) and "schema" in json_options:
1648
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1649
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1650
+ if actions is not None:
1651
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1652
+ if agent is not None:
1653
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1654
+ if max_concurrency is not None:
1655
+ scrape_params['maxConcurrency'] = max_concurrency
1656
+ if zero_data_retention is not None:
1657
+ scrape_params['zeroDataRetention'] = zero_data_retention
1658
+
1659
+ # Add any additional kwargs
1660
+ scrape_params.update(kwargs)
1661
+
1662
+ # Create final params object
1663
+ final_params = ScrapeParams(**scrape_params)
1664
+ params_dict = final_params.dict(exclude_none=True)
1665
+ params_dict['urls'] = urls
1666
+ params_dict['origin'] = f"python-sdk@{version}"
1667
+
1668
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1669
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1670
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1671
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1672
+
1673
+ # Make request
1674
+ headers = self._prepare_headers(idempotency_key)
1675
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1676
+
1677
+ if response.status_code == 200:
1678
+ try:
1679
+ crawl_response = BatchScrapeResponse(**response.json())
1680
+ except Exception:
1681
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1682
+ if crawl_response.success and crawl_response.id:
1683
+ return CrawlWatcher(crawl_response.id, self)
1684
+ else:
1685
+ raise Exception('Batch scrape job failed to start')
1686
+ else:
1687
+ self._handle_error(response, 'start batch scrape job')
1688
+
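+ # Illustrative usage of batch_scrape_urls_and_watch (sketch; CrawlWatcher.connect()
+ # is a coroutine, so it is driven with asyncio here):
+ #
+ #     import asyncio
+ #
+ #     watcher = app.batch_scrape_urls_and_watch(
+ #         ["https://example.com"], formats=["markdown"]
+ #     )
+ #     asyncio.run(watcher.connect())
+ #     print(watcher.status, len(watcher.data))
+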
1689
+ def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
1690
+ """
1691
+ Check the status of a batch scrape job using the Firecrawl API.
1692
+
1693
+ Args:
1694
+ id (str): The ID of the batch scrape job.
1695
+
1696
+ Returns:
1697
+ BatchScrapeStatusResponse: The status of the batch scrape job.
1698
+
1699
+ Raises:
1700
+ Exception: If the status check request fails.
1701
+ """
1702
+ endpoint = f'/v1/batch/scrape/{id}'
1703
+
1704
+ headers = self._prepare_headers()
1705
+ response = self._get_request(f'{self.api_url}{endpoint}', headers)
1706
+ if response.status_code == 200:
1707
+ try:
1708
+ status_data = response.json()
1709
+ except:
1710
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1711
+ if status_data['status'] == 'completed':
1712
+ if 'data' in status_data:
1713
+ data = status_data['data']
1714
+ while 'next' in status_data:
1715
+ if len(status_data['data']) == 0:
1716
+ break
1717
+ next_url = status_data.get('next')
1718
+ if not next_url:
1719
+ logger.warning("Expected 'next' URL is missing.")
1720
+ break
1721
+ try:
1722
+ status_response = self._get_request(next_url, headers)
1723
+ if status_response.status_code != 200:
1724
+ logger.error(f"Failed to fetch next page: {status_response.status_code}")
1725
+ break
1726
+ try:
1727
+ next_data = status_response.json()
1728
+ except:
1729
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1730
+ data.extend(next_data.get('data', []))
1731
+ status_data = next_data
1732
+ except Exception as e:
1733
+ logger.error(f"Error during pagination request: {e}")
1734
+ break
1735
+ status_data['data'] = data
1736
+
1737
+ return BatchScrapeStatusResponse(**{
1738
+ 'success': False if 'error' in status_data else True,
1739
+ 'status': status_data.get('status'),
1740
+ 'total': status_data.get('total'),
1741
+ 'completed': status_data.get('completed'),
1742
+ 'creditsUsed': status_data.get('creditsUsed'),
1743
+ 'expiresAt': status_data.get('expiresAt'),
1744
+ 'data': status_data.get('data'),
1745
+ 'next': status_data.get('next'),
1746
+ 'error': status_data.get('error')
1747
+ })
1748
+ else:
1749
+ self._handle_error(response, 'check batch scrape status')
1750
+
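+ # Illustrative status check (sketch; job_id is the id returned when the batch
+ # job was started, e.g. by async_batch_scrape_urls):
+ #
+ #     status = app.check_batch_scrape_status(job_id)
+ #     if status.status == "completed":
+ #         print("scraped", status.completed, "of", status.total, "URLs")
+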
1751
+ def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
1752
+ """
1753
+ Returns information about batch scrape errors.
1754
+
1755
+ Args:
1756
+ id (str): The ID of the crawl job.
1757
+
1758
+ Returns:
1759
+ CrawlErrorsResponse containing:
1760
+ * errors (List[Dict[str, str]]): List of errors with fields:
1761
+ * id (str): Error ID
1762
+ * timestamp (str): When the error occurred
1763
+ * url (str): URL that caused the error
1764
+ * error (str): Error message
1765
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1766
+
1767
+ Raises:
1768
+ Exception: If the error check request fails
1769
+ """
1770
+ headers = self._prepare_headers()
1771
+ response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
1772
+ if response.status_code == 200:
1773
+ try:
1774
+ return CrawlErrorsResponse(**response.json())
1775
+ except:
1776
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1777
+ else:
1778
+ self._handle_error(response, "check batch scrape errors")
1779
+
1780
+ def extract(
1781
+ self,
1782
+ urls: Optional[List[str]] = None,
1783
+ *,
1784
+ prompt: Optional[str] = None,
1785
+ schema: Optional[Any] = None,
1786
+ system_prompt: Optional[str] = None,
1787
+ allow_external_links: Optional[bool] = False,
1788
+ enable_web_search: Optional[bool] = False,
1789
+ show_sources: Optional[bool] = False,
1790
+ agent: Optional[Dict[str, Any]] = None,
1791
+ **kwargs) -> ExtractResponse[Any]:
1792
+ """
1793
+ Extract structured information from URLs.
1794
+
1795
+ Args:
1796
+ urls (Optional[List[str]]): URLs to extract from
1797
+ prompt (Optional[str]): Custom extraction prompt
1798
+ schema (Optional[Any]): JSON schema/Pydantic model
1799
+ system_prompt (Optional[str]): System context
1800
+ allow_external_links (Optional[bool]): Follow external links
1801
+ enable_web_search (Optional[bool]): Enable web search
1802
+ show_sources (Optional[bool]): Include source URLs
1803
+ agent (Optional[Dict[str, Any]]): Agent configuration
1804
+ **kwargs: Additional parameters to pass to the API
1805
+
1806
+ Returns:
1807
+ ExtractResponse[Any] with:
1808
+ * success (bool): Whether request succeeded
1809
+ * data (Optional[Any]): Extracted data matching schema
1810
+ * error (Optional[str]): Error message if any
1811
+
1812
+ Raises:
1813
+ ValueError: If prompt/schema missing or extraction fails
1814
+ """
1815
+ # Validate any additional kwargs
1816
+ self._validate_kwargs(kwargs, "extract")
1817
+
1818
+ headers = self._prepare_headers()
1819
+
1820
+ if not prompt and not schema:
1821
+ raise ValueError("Either prompt or schema is required")
1822
+
1823
+ if not urls and not prompt:
1824
+ raise ValueError("Either urls or prompt is required")
1825
+
1826
+ if schema:
1827
+ schema = self._ensure_schema_dict(schema)
1828
+
1829
+ request_data = {
1830
+ 'urls': urls or [],
1831
+ 'allowExternalLinks': allow_external_links,
1832
+ 'enableWebSearch': enable_web_search,
1833
+ 'showSources': show_sources,
1834
+ 'schema': schema,
1835
+ 'origin': f'python-sdk@{version}'
1836
+ }
1837
+
1838
+ # Only add prompt and systemPrompt if they exist
1839
+ if prompt:
1840
+ request_data['prompt'] = prompt
1841
+ if system_prompt:
1842
+ request_data['systemPrompt'] = system_prompt
1843
+
1844
+ if agent:
1845
+ request_data['agent'] = agent
1846
+
1847
+ # Add any additional kwargs
1848
+ request_data.update(kwargs)
1849
+
1850
+ try:
1851
+ # Send the initial extract request
1852
+ response = self._post_request(
1853
+ f'{self.api_url}/v1/extract',
1854
+ request_data,
1855
+ headers
1856
+ )
1857
+ if response.status_code == 200:
1858
+ try:
1859
+ data = response.json()
1860
+ except:
1861
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1862
+ if data['success']:
1863
+ job_id = data.get('id')
1864
+ if not job_id:
1865
+ raise Exception('Job ID not returned from extract request.')
1866
+
1867
+ # Poll for the extract status
1868
+ while True:
1869
+ status_response = self._get_request(
1870
+ f'{self.api_url}/v1/extract/{job_id}',
1871
+ headers
1872
+ )
1873
+ if status_response.status_code == 200:
1874
+ try:
1875
+ status_data = status_response.json()
1876
+ except:
1877
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1878
+ if status_data['status'] == 'completed':
1879
+ return ExtractResponse(**status_data)
1880
+ elif status_data['status'] in ['failed', 'cancelled']:
1881
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
1882
+ else:
1883
+ self._handle_error(status_response, "extract-status")
1884
+
1885
+ time.sleep(2) # Polling interval
1886
+ else:
1887
+ raise Exception(f'Failed to extract. Error: {data["error"]}')
1888
+ else:
1889
+ self._handle_error(response, "extract")
1890
+ except Exception as e:
1891
+ raise ValueError(str(e), 500)
1892
+
1893
+ return ExtractResponse(success=False, error="Internal server error.")
1894
+
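+ # Illustrative extract call (sketch; either a plain JSON-schema dict or a
+ # Pydantic model class may be passed as schema, since it is normalized by
+ # _ensure_schema_dict before the request is sent):
+ #
+ #     from pydantic import BaseModel
+ #
+ #     class Product(BaseModel):
+ #         name: str
+ #         price: str
+ #
+ #     result = app.extract(
+ #         ["https://example.com/product"],
+ #         prompt="Extract the product name and price",
+ #         schema=Product,
+ #     )
+ #     if result.success:
+ #         print(result.data)
+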
1895
+ def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
1896
+ """
1897
+ Retrieve the status of an extract job.
1898
+
1899
+ Args:
1900
+ job_id (str): The ID of the extract job.
1901
+
1902
+ Returns:
1903
+ ExtractResponse[Any]: The status of the extract job.
1904
+
1905
+ Raises:
1906
+ ValueError: If there is an error retrieving the status.
1907
+ """
1908
+ headers = self._prepare_headers()
1909
+ try:
1910
+ response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
1911
+ if response.status_code == 200:
1912
+ try:
1913
+ return ExtractResponse(**response.json())
1914
+ except:
1915
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1916
+ else:
1917
+ self._handle_error(response, "get extract status")
1918
+ except Exception as e:
1919
+ raise ValueError(str(e), 500)
1920
+
1921
+ def async_extract(
1922
+ self,
1923
+ urls: Optional[List[str]] = None,
1924
+ *,
1925
+ prompt: Optional[str] = None,
1926
+ schema: Optional[Any] = None,
1927
+ system_prompt: Optional[str] = None,
1928
+ allow_external_links: Optional[bool] = False,
1929
+ enable_web_search: Optional[bool] = False,
1930
+ show_sources: Optional[bool] = False,
1931
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1932
+ """
1933
+ Initiate an asynchronous extract job.
1934
+
1935
+ Args:
1936
+ urls (List[str]): URLs to extract information from
1937
+ prompt (Optional[str]): Custom extraction prompt
1938
+ schema (Optional[Any]): JSON schema/Pydantic model
1939
+ system_prompt (Optional[str]): System context
1940
+ allow_external_links (Optional[bool]): Follow external links
1941
+ enable_web_search (Optional[bool]): Enable web search
1942
+ show_sources (Optional[bool]): Include source URLs
1943
+ agent (Optional[Dict[str, Any]]): Agent configuration
1945
+
1946
+ Returns:
1947
+ ExtractResponse[Any] with:
1948
+ * success (bool): Whether request succeeded
1949
+ * data (Optional[Any]): Extracted data matching schema
1950
+ * error (Optional[str]): Error message if any
1951
+
1952
+ Raises:
1953
+ ValueError: If job initiation fails
1954
+ """
1955
+ headers = self._prepare_headers()
1956
+
1957
+ schema = schema
1958
+ if schema:
1959
+ schema = self._ensure_schema_dict(schema)
1960
+
1961
+ request_data = {
1962
+ 'urls': urls,
1963
+ 'allowExternalLinks': allow_external_links,
1964
+ 'enableWebSearch': enable_web_search,
1965
+ 'showSources': show_sources,
1966
+ 'schema': schema,
1967
+ 'origin': f'python-sdk@{version}'
1968
+ }
1969
+
1970
+ if prompt:
1971
+ request_data['prompt'] = prompt
1972
+ if system_prompt:
1973
+ request_data['systemPrompt'] = system_prompt
1974
+ if agent:
1975
+ request_data['agent'] = agent
1976
+
1977
+ try:
1978
+ response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
1979
+ if response.status_code == 200:
1980
+ try:
1981
+ return ExtractResponse(**response.json())
1982
+ except:
1983
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1984
+ else:
1985
+ self._handle_error(response, "async extract")
1986
+ except Exception as e:
1987
+ raise ValueError(str(e), 500)
1988
+
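+ # Illustrative pairing of async_extract with get_extract_status (sketch; it
+ # assumes the returned ExtractResponse exposes the job id as `.id`, and the
+ # caller polls until the job leaves the 'processing' state):
+ #
+ #     job = app.async_extract(
+ #         ["https://example.com"],
+ #         prompt="Summarize the page in one sentence",
+ #     )
+ #     status = app.get_extract_status(job.id)
+ #     print(status.status)
+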
1989
+ def generate_llms_text(
1990
+ self,
1991
+ url: str,
1992
+ *,
1993
+ max_urls: Optional[int] = None,
1994
+ show_full_text: Optional[bool] = None,
1995
+ cache: Optional[bool] = None,
1996
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
1997
+ """
1998
+ Generate LLMs.txt for a given URL and poll until completion.
1999
+
2000
+ Args:
2001
+ url (str): Target URL to generate LLMs.txt from
2002
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
2003
+ show_full_text (Optional[bool]): Include full text in output (default: False)
2004
+ cache (Optional[bool]): Whether to use cached content if available (default: True)
2005
+ experimental_stream (Optional[bool]): Enable experimental streaming
2006
+
2007
+ Returns:
2008
+ GenerateLLMsTextStatusResponse with:
2009
+ * Generated LLMs.txt content
2010
+ * Full version if requested
2011
+ * Generation status
2012
+ * Success/error information
2013
+
2014
+ Raises:
2015
+ Exception: If generation fails
2016
+ """
2017
+ params = GenerateLLMsTextParams(
2018
+ maxUrls=max_urls,
2019
+ showFullText=show_full_text,
2020
+ cache=cache,
2021
+ __experimental_stream=experimental_stream
2022
+ )
2023
+
2024
+ response = self.async_generate_llms_text(
2025
+ url,
2026
+ max_urls=max_urls,
2027
+ show_full_text=show_full_text,
2028
+ cache=cache,
2029
+ experimental_stream=experimental_stream
2030
+ )
2031
+
2032
+ if not response.success or not response.id:
2033
+ return GenerateLLMsTextStatusResponse(
2034
+ success=False,
2035
+ error='Failed to start LLMs.txt generation',
2036
+ status='failed',
2037
+ expiresAt=''
2038
+ )
2039
+
2040
+ job_id = response.id
2041
+ while True:
2042
+ status = self.check_generate_llms_text_status(job_id)
2043
+
2044
+ if status.status == 'completed':
2045
+ return status
2046
+ elif status.status == 'failed':
2047
+ return status
2048
+ elif status.status != 'processing':
2049
+ return GenerateLLMsTextStatusResponse(
2050
+ success=False,
2051
+ error='LLMs.txt generation job terminated unexpectedly',
2052
+ status='failed',
2053
+ expiresAt=''
2054
+ )
2055
+
2056
+ time.sleep(2) # Polling interval
2057
+
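+ # Illustrative LLMs.txt generation (sketch; generate_llms_text blocks, polling
+ # every 2 seconds until the job completes or fails; `data` is assumed to be the
+ # dict documented in check_generate_llms_text_status below):
+ #
+ #     result = app.generate_llms_text("https://docs.firecrawl.dev", max_urls=5)
+ #     if result.success and result.data:
+ #         print(result.data.get("llmstxt", "")[:200])
+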
2058
+ def async_generate_llms_text(
2059
+ self,
2060
+ url: str,
2061
+ *,
2062
+ max_urls: Optional[int] = None,
2063
+ show_full_text: Optional[bool] = None,
2064
+ cache: Optional[bool] = None,
2065
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
2066
+ """
2067
+ Initiate an asynchronous LLMs.txt generation operation.
2068
+
2069
+ Args:
2070
+ url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
2071
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
2072
+ show_full_text (Optional[bool]): Include full text in output (default: False)
2073
+ cache (Optional[bool]): Whether to use cached content if available (default: True)
2074
+ experimental_stream (Optional[bool]): Enable experimental streaming
2075
+
2076
+ Returns:
2077
+ GenerateLLMsTextResponse: A response containing:
2078
+ * success (bool): Whether the generation initiation was successful
2079
+ * id (str): The unique identifier for the generation job
2080
+ * error (str, optional): Error message if initiation failed
2081
+
2082
+ Raises:
2083
+ Exception: If the generation job initiation fails.
2084
+ """
2085
+ params = GenerateLLMsTextParams(
2086
+ maxUrls=max_urls,
2087
+ showFullText=show_full_text,
2088
+ cache=cache,
2089
+ __experimental_stream=experimental_stream
2090
+ )
2091
+
2092
+ headers = self._prepare_headers()
2093
+ json_data = {'url': url, **params.dict(exclude_none=True)}
2094
+ json_data['origin'] = f"python-sdk@{version}"
2095
+
2096
+ try:
2097
+ req = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
2098
+ response = req.json()
2099
+ print("json_data", json_data)
2100
+ print("response", response)
2101
+ if response.get('success'):
2102
+ try:
2103
+ return GenerateLLMsTextResponse(**response)
2104
+ except:
2105
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2106
+ else:
2107
+ self._handle_error(response, 'start LLMs.txt generation')
2108
+ except Exception as e:
2109
+ raise ValueError(str(e))
2110
+
2111
+ return GenerateLLMsTextResponse(
2112
+ success=False,
2113
+ error='Internal server error'
2114
+ )
2115
+
2116
+ def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
2117
+ """
2118
+ Check the status of a LLMs.txt generation operation.
2119
+
2120
+ Args:
2121
+ id (str): The unique identifier of the LLMs.txt generation job to check status for.
2122
+
2123
+ Returns:
2124
+ GenerateLLMsTextStatusResponse: A response containing:
2125
+ * success (bool): Whether the generation was successful
2126
+ * status (str): Status of generation ("processing", "completed", "failed")
2127
+ * data (Dict[str, str], optional): Generated text with fields:
2128
+ * llmstxt (str): Generated LLMs.txt content
2129
+ * llmsfulltxt (str, optional): Full version if requested
2130
+ * error (str, optional): Error message if generation failed
2131
+ * expiresAt (str): When the generated data expires
2132
+
2133
+ Raises:
2134
+ Exception: If the status check fails.
2135
+ """
2136
+ headers = self._prepare_headers()
2137
+ try:
2138
+ response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers)
2139
+ if response.status_code == 200:
2140
+ try:
2141
+ json_data = response.json()
2142
+ return GenerateLLMsTextStatusResponse(**json_data)
2143
+ except Exception as e:
2144
+ raise Exception(f'Failed to parse Firecrawl response as GenerateLLMsTextStatusResponse: {str(e)}')
2145
+ elif response.status_code == 404:
2146
+ raise Exception('LLMs.txt generation job not found')
2147
+ else:
2148
+ self._handle_error(response, 'check LLMs.txt generation status')
2149
+ except Exception as e:
2150
+ raise ValueError(str(e))
2151
+
2152
+ return GenerateLLMsTextStatusResponse(success=False, error='Internal server error', status='failed', expiresAt='')
2153
+
2154
+ def _prepare_headers(
2155
+ self,
2156
+ idempotency_key: Optional[str] = None) -> Dict[str, str]:
2157
+ """
2158
+ Prepare the headers for API requests.
2159
+
2160
+ Args:
2161
+ idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
2162
+
2163
+ Returns:
2164
+ Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
2165
+ """
2166
+ if idempotency_key:
2167
+ return {
2168
+ 'Content-Type': 'application/json',
2169
+ 'Authorization': f'Bearer {self.api_key}',
2170
+ 'x-idempotency-key': idempotency_key
2171
+ }
2172
+
2173
+ return {
2174
+ 'Content-Type': 'application/json',
2175
+ 'Authorization': f'Bearer {self.api_key}',
2176
+ }
2177
+
2178
+ def _post_request(
2179
+ self,
2180
+ url: str,
2181
+ data: Dict[str, Any],
2182
+ headers: Dict[str, str],
2183
+ retries: int = 3,
2184
+ backoff_factor: float = 0.5) -> requests.Response:
2185
+ """
2186
+ Make a POST request with retries.
2187
+
2188
+ Args:
2189
+ url (str): The URL to send the POST request to.
2190
+ data (Dict[str, Any]): The JSON data to include in the POST request.
2191
+ headers (Dict[str, str]): The headers to include in the POST request.
2192
+ retries (int): Number of retries for the request.
2193
+ backoff_factor (float): Backoff factor for retries.
2194
+
2195
+ Returns:
2196
+ requests.Response: The response from the POST request.
2197
+
2198
+ Raises:
2199
+ requests.RequestException: If the request fails after the specified retries.
2200
+ """
2201
+ for attempt in range(retries):
2202
+ response = requests.post(url, headers=headers, json=data, timeout=((data["timeout"] + 5000) if "timeout" in data else None))
2203
+ if response.status_code == 502:
2204
+ time.sleep(backoff_factor * (2 ** attempt))
2205
+ else:
2206
+ return response
2207
+ return response
2208
+
2209
+ def _get_request(
2210
+ self,
2211
+ url: str,
2212
+ headers: Dict[str, str],
2213
+ retries: int = 3,
2214
+ backoff_factor: float = 0.5) -> requests.Response:
2215
+ """
2216
+ Make a GET request with retries.
2217
+
2218
+ Args:
2219
+ url (str): The URL to send the GET request to.
2220
+ headers (Dict[str, str]): The headers to include in the GET request.
2221
+ retries (int): Number of retries for the request.
2222
+ backoff_factor (float): Backoff factor for retries.
2223
+
2224
+ Returns:
2225
+ requests.Response: The response from the GET request.
2226
+
2227
+ Raises:
2228
+ requests.RequestException: If the request fails after the specified retries.
2229
+ """
2230
+ for attempt in range(retries):
2231
+ response = requests.get(url, headers=headers)
2232
+ if response.status_code == 502:
2233
+ time.sleep(backoff_factor * (2 ** attempt))
2234
+ else:
2235
+ return response
2236
+ return response
2237
+
2238
+ def _delete_request(
2239
+ self,
2240
+ url: str,
2241
+ headers: Dict[str, str],
2242
+ retries: int = 3,
2243
+ backoff_factor: float = 0.5) -> requests.Response:
2244
+ """
2245
+ Make a DELETE request with retries.
2246
+
2247
+ Args:
2248
+ url (str): The URL to send the DELETE request to.
2249
+ headers (Dict[str, str]): The headers to include in the DELETE request.
2250
+ retries (int): Number of retries for the request.
2251
+ backoff_factor (float): Backoff factor for retries.
2252
+
2253
+ Returns:
2254
+ requests.Response: The response from the DELETE request.
2255
+
2256
+ Raises:
2257
+ requests.RequestException: If the request fails after the specified retries.
2258
+ """
2259
+ for attempt in range(retries):
2260
+ response = requests.delete(url, headers=headers)
2261
+ if response.status_code == 502:
2262
+ time.sleep(backoff_factor * (2 ** attempt))
2263
+ else:
2264
+ return response
2265
+ return response
2266
+
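+ # Retry timing for the three request helpers above: a 502 response triggers a
+ # sleep of backoff_factor * (2 ** attempt) before the next attempt, so with the
+ # defaults (retries=3, backoff_factor=0.5) the waits are 0.5s, 1.0s and 2.0s;
+ # any non-502 response is returned to the caller immediately.
+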
2267
+ def _monitor_job_status(
2268
+ self,
2269
+ id: str,
2270
+ headers: Dict[str, str],
2271
+ poll_interval: int) -> CrawlStatusResponse:
2272
+ """
2273
+ Monitor the status of a crawl job until completion.
2274
+
2275
+ Args:
2276
+ id (str): The ID of the crawl job.
2277
+ headers (Dict[str, str]): The headers to include in the status check requests.
2278
+ poll_interval (int): Seconds between status checks.
2279
+
2280
+ Returns:
2281
+ CrawlStatusResponse: The crawl results if the job is completed successfully.
2282
+
2283
+ Raises:
2284
+ Exception: If the job fails or an error occurs during status checks.
2285
+ """
2286
+ while True:
2287
+ api_url = f'{self.api_url}/v1/crawl/{id}'
2288
+
2289
+ status_response = self._get_request(api_url, headers)
2290
+ if status_response.status_code == 200:
2291
+ try:
2292
+ status_data = status_response.json()
2293
+ except:
2294
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
2295
+ if status_data['status'] == 'completed':
2296
+ if 'data' in status_data:
2297
+ data = status_data['data']
2298
+ while 'next' in status_data:
2299
+ if len(status_data['data']) == 0:
2300
+ break
2301
+ status_response = self._get_request(status_data['next'], headers)
2302
+ try:
2303
+ status_data = status_response.json()
2304
+ except:
2305
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
2306
+ data.extend(status_data.get('data', []))
2307
+ status_data['data'] = data
2308
+ return CrawlStatusResponse(**status_data)
2309
+ else:
2310
+ raise Exception('Crawl job completed but no data was returned')
2311
+ elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
2312
+ poll_interval = max(poll_interval, 2)
2313
+ time.sleep(poll_interval) # Wait for the specified interval before checking again
2314
+ else:
2315
+ raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
2316
+ else:
2317
+ self._handle_error(status_response, 'check crawl status')
2318
+
2319
+ def _handle_error(
2320
+ self,
2321
+ response: requests.Response,
2322
+ action: str) -> None:
2323
+ """
2324
+ Handle errors from API responses.
2325
+
2326
+ Args:
2327
+ response (requests.Response): The response object from the API request.
2328
+ action (str): Description of the action that was being performed.
2329
+
2330
+ Raises:
2331
+ Exception: An exception with a message containing the status code and error details from the response.
2332
+ """
2333
+ try:
2334
+ error_message = response.json().get('error', 'No error message provided.')
2335
+ error_details = response.json().get('details', 'No additional error details provided.')
2336
+ except:
2337
+ raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
2338
+
2339
+ message = self._get_error_message(response.status_code, action, error_message, error_details)
2340
+
2341
+ # Raise an HTTPError with the custom message and attach the response
2342
+ raise requests.exceptions.HTTPError(message, response=response)
2343
+
2344
+ def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2345
+ """
2346
+ Generate a standardized error message based on HTTP status code.
2347
+
2348
+ Args:
2349
+ status_code (int): The HTTP status code from the response
2350
+ action (str): Description of the action that was being performed
2351
+ error_message (str): The error message from the API response
2352
+ error_details (str): Additional error details from the API response
2353
+
2354
+ Returns:
2355
+ str: A formatted error message
2356
+ """
2357
+ if status_code == 402:
2358
+ return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
2359
+ elif status_code == 403:
2360
+ message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
2361
+ elif status_code == 408:
2362
+ return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
2363
+ elif status_code == 409:
2364
+ return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
2365
+ elif status_code == 500:
2366
+ return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
2367
+ else:
2368
+ return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
2369
+
2370
+ def deep_research(
2371
+ self,
2372
+ query: str,
2373
+ *,
2374
+ max_depth: Optional[int] = None,
2375
+ time_limit: Optional[int] = None,
2376
+ max_urls: Optional[int] = None,
2377
+ analysis_prompt: Optional[str] = None,
2378
+ system_prompt: Optional[str] = None,
2379
+ __experimental_stream_steps: Optional[bool] = None,
2380
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
2381
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
2382
+ """
2383
+ Initiates a deep research operation on a given query and polls until completion.
2384
+
2385
+ Args:
2386
+ query (str): Research query or topic to investigate
2387
+ max_depth (Optional[int]): Maximum depth of research exploration
2388
+ time_limit (Optional[int]): Time limit in seconds for research
2389
+ max_urls (Optional[int]): Maximum number of URLs to process
2390
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2391
+ system_prompt (Optional[str]): Custom system prompt
2392
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2393
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
2394
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
2395
+
2396
+ Returns:
2397
+ DeepResearchStatusResponse containing:
2398
+ * success (bool): Whether research completed successfully
2399
+ * status (str): Current state (processing/completed/failed)
2400
+ * error (Optional[str]): Error message if failed
2401
+ * id (str): Unique identifier for the research job
2402
+ * data (Any): Research findings and analysis
2403
+ * sources (List[Dict]): List of discovered sources
2404
+ * activities (List[Dict]): Research progress log
2405
+ * summaries (List[str]): Generated research summaries
2406
+
2407
+ Raises:
2408
+ Exception: If research fails
2409
+ """
2410
+ research_params = {}
2411
+ if max_depth is not None:
2412
+ research_params['maxDepth'] = max_depth
2413
+ if time_limit is not None:
2414
+ research_params['timeLimit'] = time_limit
2415
+ if max_urls is not None:
2416
+ research_params['maxUrls'] = max_urls
2417
+ if analysis_prompt is not None:
2418
+ research_params['analysisPrompt'] = analysis_prompt
2419
+ if system_prompt is not None:
2420
+ research_params['systemPrompt'] = system_prompt
2421
+ if __experimental_stream_steps is not None:
2422
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2423
+ research_params = DeepResearchParams(**research_params)
2424
+
2425
+ response = self.async_deep_research(
2426
+ query,
2427
+ max_depth=max_depth,
2428
+ time_limit=time_limit,
2429
+ max_urls=max_urls,
2430
+ analysis_prompt=analysis_prompt,
2431
+ system_prompt=system_prompt
2432
+ )
2433
+ if not response.get('success') or 'id' not in response:
2434
+ return response
2435
+
2436
+ job_id = response['id']
2437
+ last_activity_count = 0
2438
+ last_source_count = 0
2439
+
2440
+ while True:
2441
+ status = self.check_deep_research_status(job_id)
2442
+
2443
+ if on_activity and 'activities' in status:
2444
+ new_activities = status['activities'][last_activity_count:]
2445
+ for activity in new_activities:
2446
+ on_activity(activity)
2447
+ last_activity_count = len(status['activities'])
2448
+
2449
+ if on_source and 'sources' in status:
2450
+ new_sources = status['sources'][last_source_count:]
2451
+ for source in new_sources:
2452
+ on_source(source)
2453
+ last_source_count = len(status['sources'])
2454
+
2455
+ if status['status'] == 'completed':
2456
+ return status
2457
+ elif status['status'] == 'failed':
2458
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
2459
+ elif status['status'] != 'processing':
2460
+ break
2461
+
2462
+ time.sleep(2) # Polling interval
2463
+
2464
+ return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
2465
+
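+ # Illustrative deep_research call with a progress callback (sketch; the status
+ # objects handled here are plain dicts, as returned by check_deep_research_status):
+ #
+ #     def log_activity(activity):
+ #         print(f"[{activity['type']}] {activity['message']}")
+ #
+ #     research = app.deep_research(
+ #         "What are the latest developments in open-source LLMs?",
+ #         max_depth=3,
+ #         time_limit=120,
+ #         on_activity=log_activity,
+ #     )
+ #     if research["status"] == "completed":
+ #         print(research["data"])
+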
2466
+ def async_deep_research(
2467
+ self,
2468
+ query: str,
2469
+ *,
2470
+ max_depth: Optional[int] = None,
2471
+ time_limit: Optional[int] = None,
2472
+ max_urls: Optional[int] = None,
2473
+ analysis_prompt: Optional[str] = None,
2474
+ system_prompt: Optional[str] = None,
2475
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
2476
+ """
2477
+ Initiates an asynchronous deep research operation.
2478
+
2479
+ Args:
2480
+ query (str): Research query or topic to investigate
2481
+ max_depth (Optional[int]): Maximum depth of research exploration
2482
+ time_limit (Optional[int]): Time limit in seconds for research
2483
+ max_urls (Optional[int]): Maximum number of URLs to process
2484
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2485
+ system_prompt (Optional[str]): Custom system prompt
2486
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2487
+
2488
+ Returns:
2489
+ Dict[str, Any]: A response containing:
2490
+ * success (bool): Whether the research initiation was successful
2491
+ * id (str): The unique identifier for the research job
2492
+ * error (str, optional): Error message if initiation failed
2493
+
2494
+ Raises:
2495
+ Exception: If the research initiation fails.
2496
+ """
2497
+ research_params = {}
2498
+ if max_depth is not None:
2499
+ research_params['maxDepth'] = max_depth
2500
+ if time_limit is not None:
2501
+ research_params['timeLimit'] = time_limit
2502
+ if max_urls is not None:
2503
+ research_params['maxUrls'] = max_urls
2504
+ if analysis_prompt is not None:
2505
+ research_params['analysisPrompt'] = analysis_prompt
2506
+ if system_prompt is not None:
2507
+ research_params['systemPrompt'] = system_prompt
2508
+ if __experimental_stream_steps is not None:
2509
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2510
+ research_params = DeepResearchParams(**research_params)
2511
+
2512
+ headers = self._prepare_headers()
2513
+
2514
+ json_data = {'query': query, **research_params.dict(exclude_none=True)}
2515
+ json_data['origin'] = f"python-sdk@{version}"
2516
+
2517
+ # Handle json options schema if present
2518
+ if 'jsonOptions' in json_data:
2519
+ json_opts = json_data['jsonOptions']
2520
+ if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
2521
+ json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
2522
+
2523
+ try:
2524
+ response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
2525
+ if response.status_code == 200:
2526
+ try:
2527
+ return response.json()
2528
+ except:
2529
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2530
+ else:
2531
+ self._handle_error(response, 'start deep research')
2532
+ except Exception as e:
2533
+ raise ValueError(str(e))
2534
+
2535
+ return {'success': False, 'error': 'Internal server error'}
2536
+
2537
+ def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
2538
+ """
2539
+ Check the status of a deep research operation.
2540
+
2541
+ Args:
2542
+ id (str): The ID of the deep research operation.
2543
+
2544
+ Returns:
2545
+ DeepResearchStatusResponse containing:
2546
+
2547
+ Status:
2548
+ * success - Whether research completed successfully
2549
+ * status - Current state (processing/completed/failed)
2550
+ * error - Error message if failed
2551
+
2552
+ Results:
2553
+ * id - Unique identifier for the research job
2554
+ * data - Research findings and analysis
2555
+ * sources - List of discovered sources
2556
+ * activities - Research progress log
2557
+ * summaries - Generated research summaries
2558
+
2559
+ Raises:
2560
+ Exception: If the status check fails.
2561
+ """
2562
+ headers = self._prepare_headers()
2563
+ try:
2564
+ response = self._get_request(f'{self.api_url}/v1/deep-research/{id}', headers)
2565
+ if response.status_code == 200:
2566
+ try:
2567
+ return response.json()
2568
+ except:
2569
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2570
+ elif response.status_code == 404:
2571
+ raise Exception('Deep research job not found')
2572
+ else:
2573
+ self._handle_error(response, 'check deep research status')
2574
+ except Exception as e:
2575
+ raise ValueError(str(e))
2576
+
2577
+ return {'success': False, 'error': 'Internal server error'}
2578
+
2579
+ def _validate_kwargs(self, kwargs: Dict[str, Any], method_name: str) -> None:
2580
+ """
2581
+ Validate additional keyword arguments before they are passed to the API.
2582
+ This provides early validation before the Pydantic model validation.
2583
+
2584
+ Args:
2585
+ kwargs (Dict[str, Any]): Additional keyword arguments to validate
2586
+ method_name (str): Name of the method these kwargs are for
2587
+
2588
+ Raises:
2589
+ ValueError: If kwargs contain invalid or unsupported parameters
2590
+ """
2591
+ if not kwargs:
2592
+ return
2593
+
2594
+ # Known parameter mappings for each method
2595
+ method_params = {
2596
+ "scrape_url": {"formats", "include_tags", "exclude_tags", "only_main_content", "wait_for",
2597
+ "timeout", "location", "mobile", "skip_tls_verification", "remove_base64_images",
2598
+ "block_ads", "proxy", "extract", "json_options", "actions", "change_tracking_options", "integration"},
2599
+ "search": {"limit", "tbs", "filter", "lang", "country", "location", "timeout", "scrape_options", "integration"},
2600
+ "crawl_url": {"include_paths", "exclude_paths", "max_depth", "max_discovery_depth", "limit",
2601
+ "allow_backward_links", "allow_external_links", "ignore_sitemap", "scrape_options",
2602
+ "webhook", "deduplicate_similar_urls", "ignore_query_parameters", "regex_on_full_url", "integration"},
2603
+ "map_url": {"search", "ignore_sitemap", "include_subdomains", "sitemap_only", "limit", "timeout", "integration"},
2604
+ "extract": {"prompt", "schema", "system_prompt", "allow_external_links", "enable_web_search", "show_sources", "agent", "integration"},
2605
+ "batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2606
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2607
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2608
+ "actions", "agent", "webhook"},
2609
+ "async_batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2610
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2611
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2612
+ "actions", "agent", "webhook"},
2613
+ "batch_scrape_urls_and_watch": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2614
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2615
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2616
+ "actions", "agent", "webhook"}
2617
+ }
2618
+
2619
+ # Get allowed parameters for this method
2620
+ allowed_params = method_params.get(method_name, set())
2621
+
2622
+ # Check for unknown parameters
2623
+ unknown_params = set(kwargs.keys()) - allowed_params
2624
+ if unknown_params:
2625
+ raise ValueError(f"Unsupported parameter(s) for {method_name}: {', '.join(unknown_params)}. Please refer to the API documentation for the correct parameters.")
2626
+
2627
+ # Additional type validation can be added here if needed
2628
+ # For now, we rely on Pydantic models for detailed type validation
2629
+
2630
+ def _ensure_schema_dict(self, schema):
2631
+ """
2632
+ Utility to ensure a schema is a dict, not a Pydantic model class. Recursively checks dicts and lists.
2633
+ """
2634
+ if schema is None:
2635
+ return schema
2636
+ if isinstance(schema, type):
2637
+ # Pydantic v1/v2 model class
2638
+ if hasattr(schema, 'model_json_schema'):
2639
+ return schema.model_json_schema()
2640
+ elif hasattr(schema, 'schema'):
2641
+ return schema.schema()
2642
+ if isinstance(schema, dict):
2643
+ return {k: self._ensure_schema_dict(v) for k, v in schema.items()}
2644
+ if isinstance(schema, (list, tuple)):
2645
+ return [self._ensure_schema_dict(v) for v in schema]
2646
+ return schema
2647
+
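+ # Behaviour sketch for _ensure_schema_dict: a Pydantic model class is converted
+ # to its JSON schema, while dicts and lists are walked recursively:
+ #
+ #     class Article(pydantic.BaseModel):
+ #         title: str
+ #
+ #     app._ensure_schema_dict(Article)
+ #     # -> {"properties": {"title": {...}}, "required": ["title"], "type": "object", ...}
+ #     app._ensure_schema_dict({"schema": Article})
+ #     # -> {"schema": {...the same JSON schema...}}
+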
2648
+ class CrawlWatcher:
2649
+ """
2650
+ A class to watch and handle crawl job events via WebSocket connection.
2651
+
2652
+ Attributes:
2653
+ id (str): The ID of the crawl job to watch
2654
+ app (FirecrawlApp): The FirecrawlApp instance
2655
+ data (List[Dict[str, Any]]): List of crawled documents/data
2656
+ status (str): Current status of the crawl job
2657
+ ws_url (str): WebSocket URL for the crawl job
2658
+ event_handlers (dict): Dictionary of event type to list of handler functions
2659
+ """
2660
+ def __init__(self, id: str, app: FirecrawlApp):
2661
+ self.id = id
2662
+ self.app = app
2663
+ self.data: List[Dict[str, Any]] = []
2664
+ self.status = "scraping"
2665
+ self.ws_url = f"{app.api_url.replace('http', 'ws')}/v1/crawl/{id}"
2666
+ self.event_handlers = {
2667
+ 'done': [],
2668
+ 'error': [],
2669
+ 'document': []
2670
+ }
2671
+
2672
+ async def connect(self) -> None:
2673
+ """
2674
+ Establishes WebSocket connection and starts listening for messages.
2675
+ """
2676
+ async with websockets.connect(
2677
+ self.ws_url,
2678
+ max_size=None,
2679
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
2680
+ ) as websocket:
2681
+ await self._listen(websocket)
2682
+
2683
+ async def _listen(self, websocket) -> None:
2684
+ """
2685
+ Listens for incoming WebSocket messages and handles them.
2686
+
2687
+ Args:
2688
+ websocket: The WebSocket connection object
2689
+ """
2690
+ async for message in websocket:
2691
+ msg = json.loads(message)
2692
+ await self._handle_message(msg)
2693
+
2694
+ def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
2695
+ """
2696
+ Adds an event handler function for a specific event type.
2697
+
2698
+ Args:
2699
+ event_type (str): Type of event to listen for ('done', 'error', or 'document')
2700
+ handler (Callable): Function to handle the event
2701
+ """
2702
+ if event_type in self.event_handlers:
2703
+ self.event_handlers[event_type].append(handler)
2704
+
2705
+ def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
2706
+ """
2707
+ Dispatches an event to all registered handlers for that event type.
2708
+
2709
+ Args:
2710
+ event_type (str): Type of event to dispatch
2711
+ detail (Dict[str, Any]): Event details/data to pass to handlers
2712
+ """
2713
+ if event_type in self.event_handlers:
2714
+ for handler in self.event_handlers[event_type]:
2715
+ handler(detail)
2716
+
2717
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
2718
+ """
2719
+ Handles incoming WebSocket messages based on their type.
2720
+
2721
+ Args:
2722
+ msg (Dict[str, Any]): The message to handle
2723
+ """
2724
+ if msg['type'] == 'done':
2725
+ self.status = 'completed'
2726
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
2727
+ elif msg['type'] == 'error':
2728
+ self.status = 'failed'
2729
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
2730
+ elif msg['type'] == 'catchup':
2731
+ self.status = msg['data']['status']
2732
+ self.data.extend(msg['data'].get('data', []))
2733
+ for doc in self.data:
2734
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
2735
+ elif msg['type'] == 'document':
2736
+ self.data.append(msg['data'])
2737
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
2738
+
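+ # Event-handling sketch for CrawlWatcher (assumes an existing FirecrawlApp `app`
+ # and the id of a previously started crawl or batch scrape job):
+ #
+ #     watcher = CrawlWatcher(job_id, app)
+ #     watcher.add_event_listener("document", lambda e: print("document from", e["id"]))
+ #     watcher.add_event_listener("error", lambda e: print("failed:", e["error"]))
+ #     await watcher.connect()   # inside a running event loop / async function
+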
2739
+ class AsyncFirecrawlApp(FirecrawlApp):
2740
+ """
2741
+ Asynchronous version of FirecrawlApp that implements async methods using aiohttp.
2742
+ Provides non-blocking alternatives to all FirecrawlApp operations.
2743
+ """
2744
+
2745
+ async def _async_request(
2746
+ self,
2747
+ method: str,
2748
+ url: str,
2749
+ headers: Dict[str, str],
2750
+ data: Optional[Dict[str, Any]] = None,
2751
+ retries: int = 3,
2752
+ backoff_factor: float = 0.5) -> Dict[str, Any]:
2753
+ """
2754
+ Generic async request method with exponential backoff retry logic.
2755
+
2756
+ Args:
2757
+ method (str): The HTTP method to use (e.g., "GET" or "POST").
2758
+ url (str): The URL to send the request to.
2759
+ headers (Dict[str, str]): Headers to include in the request.
2760
+ data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
2761
+ retries (int): Maximum number of retry attempts (default: 3).
2762
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2763
+ Delay will be backoff_factor * (2 ** retry_count).
2764
+
2765
+ Returns:
2766
+ Dict[str, Any]: The parsed JSON response from the server.
2767
+
2768
+ Raises:
2769
+ aiohttp.ClientError: If the request fails after all retries.
2770
+ Exception: If max retries are exceeded or other errors occur.
2771
+ """
2772
+ ssl_context = ssl.create_default_context(cafile=certifi.where())
2773
+ async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=ssl_context)) as session:
2774
+ for attempt in range(retries):
2775
+ try:
2776
+ async with session.request(
2777
+ method=method, url=url, headers=headers, json=data
2778
+ ) as response:
2779
+ if response.status == 502:
2780
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2781
+ continue
2782
+ if response.status >= 300:
2783
+ await self._handle_error(response, f"make {method} request")
2784
+ return await response.json()
2785
+ except aiohttp.ClientError as e:
2786
+ if attempt == retries - 1:
2787
+ raise e
2788
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2789
+ raise Exception("Max retries exceeded")
2790
+
2791
+ async def _async_post_request(
2792
+ self, url: str, data: Dict[str, Any], headers: Dict[str, str],
2793
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2794
+ """
2795
+ Make an async POST request with exponential backoff retry logic.
2796
+
2797
+ Args:
2798
+ url (str): The URL to send the POST request to.
2799
+ data (Dict[str, Any]): The JSON data to include in the request body.
2800
+ headers (Dict[str, str]): Headers to include in the request.
2801
+ retries (int): Maximum number of retry attempts (default: 3).
2802
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2803
+ Delay will be backoff_factor * (2 ** retry_count).
2804
+
2805
+ Returns:
2806
+ Dict[str, Any]: The parsed JSON response from the server.
2807
+
2808
+ Raises:
2809
+ aiohttp.ClientError: If the request fails after all retries.
2810
+ Exception: If max retries are exceeded or other errors occur.
2811
+ """
2812
+ return await self._async_request("POST", url, headers, data, retries, backoff_factor)
2813
+
2814
+ async def _async_get_request(
2815
+ self, url: str, headers: Dict[str, str],
2816
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2817
+ """
2818
+ Make an async GET request with exponential backoff retry logic.
2819
+
2820
+ Args:
2821
+ url (str): The URL to send the GET request to.
2822
+ headers (Dict[str, str]): Headers to include in the request.
2823
+ retries (int): Maximum number of retry attempts (default: 3).
2824
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2825
+ Delay will be backoff_factor * (2 ** retry_count).
2826
+
2827
+ Returns:
2828
+ Dict[str, Any]: The parsed JSON response from the server.
2829
+
2830
+ Raises:
2831
+ aiohttp.ClientError: If the request fails after all retries.
2832
+ Exception: If max retries are exceeded or other errors occur.
2833
+ """
2834
+ return await self._async_request("GET", url, headers, None, retries, backoff_factor)
2835
+
2836
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
2837
+ """
2838
+ Handle errors from async API responses with detailed error messages.
2839
+
2840
+ Args:
2841
+ response (aiohttp.ClientResponse): The response object from the failed request
2842
+ action (str): Description of the action that was being attempted
2843
+
2844
+ Raises:
2845
+ aiohttp.ClientError: With a detailed error message based on the response status:
2846
+ - 402: Payment Required
2847
+ - 408: Request Timeout
2848
+ - 409: Conflict
2849
+ - 500: Internal Server Error
2850
+ - Other: Unexpected error with status code
2851
+ """
2852
+ try:
2853
+ error_data = await response.json()
2854
+ error_message = error_data.get('error', 'No error message provided.')
2855
+ error_details = error_data.get('details', 'No additional error details provided.')
2856
+ except:
2857
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
2858
+
2859
+ message = await self._get_async_error_message(response.status, action, error_message, error_details)
2860
+
2861
+ raise aiohttp.ClientError(message)
2862
+
2863
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2864
+ """
2865
+ Generate a standardized error message based on HTTP status code for async operations.
2866
+
2867
+ Args:
2868
+ status_code (int): The HTTP status code from the response
2869
+ action (str): Description of the action that was being performed
2870
+ error_message (str): The error message from the API response
2871
+ error_details (str): Additional error details from the API response
2872
+
2873
+ Returns:
2874
+ str: A formatted error message
2875
+ """
2876
+ return self._get_error_message(status_code, action, error_message, error_details)
2877
+
2878
+ async def crawl_url_and_watch(
2879
+ self,
2880
+ url: str,
2881
+ params: Optional[CrawlParams] = None,
2882
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2883
+ """
2884
+ Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket.
2885
+
2886
+ Args:
2887
+ url (str): Target URL to start crawling from
2888
+ params (Optional[CrawlParams]): See CrawlParams model for configuration:
2889
+ URL Discovery:
2890
+ * includePaths - Patterns of URLs to include
2891
+ * excludePaths - Patterns of URLs to exclude
2892
+ * maxDepth - Maximum crawl depth
2893
+ * maxDiscoveryDepth - Maximum depth for finding new URLs
2894
+ * limit - Maximum pages to crawl
2895
+
2896
+ Link Following:
2897
+ * allowBackwardLinks - DEPRECATED: Use crawlEntireDomain instead
2898
+ * crawlEntireDomain - Follow parent directory links
2899
+ * allowExternalLinks - Follow external domain links
2900
+ * ignoreSitemap - Skip sitemap.xml processing
2901
+
2902
+ Advanced:
2903
+ * scrapeOptions - Page scraping configuration
2904
+ * webhook - Notification webhook settings
2905
+ * deduplicateSimilarURLs - Remove similar URLs
2906
+ * ignoreQueryParameters - Ignore URL parameters
2907
+ * regexOnFullURL - Apply regex to full URLs
2908
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2909
+
2910
+ Returns:
2911
+ AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket
2912
+
2913
+ Raises:
2914
+ Exception: If crawl job fails to start
2915
+ """
2916
+ crawl_response = await self.async_crawl_url(url, params, idempotency_key)
2917
+ if crawl_response.get('success') and 'id' in crawl_response:
2918
+ return AsyncCrawlWatcher(crawl_response['id'], self)
2919
+ else:
2920
+ raise Exception("Crawl job failed to start")
2921
+
2922
+ async def batch_scrape_urls_and_watch(
2923
+ self,
2924
+ urls: List[str],
2925
+ params: Optional[ScrapeParams] = None,
2926
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2927
+ """
2928
+ Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress.
2929
+
2930
+ Args:
2931
+ urls (List[str]): List of URLs to scrape
2932
+ params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
2933
+
2934
+ Content Options:
2935
+ * formats - Content formats to retrieve
2936
+ * includeTags - HTML tags to include
2937
+ * excludeTags - HTML tags to exclude
2938
+ * onlyMainContent - Extract main content only
2939
+
2940
+ Request Options:
2941
+ * headers - Custom HTTP headers
2942
+ * timeout - Request timeout (ms)
2943
+ * mobile - Use mobile user agent
2944
+ * proxy - Proxy type
2945
+
2946
+ Extraction Options:
2947
+ * extract - Content extraction config
2948
+ * jsonOptions - JSON extraction config
2949
+ * actions - Actions to perform
2950
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2951
+
2952
+ Returns:
2953
+ AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket
2954
+
2955
+ Raises:
2956
+ Exception: If batch scrape job fails to start
2957
+ """
2958
+ batch_response = await self.async_batch_scrape_urls(urls, params, idempotency_key)
2959
+ if batch_response.get('success') and 'id' in batch_response:
2960
+ return AsyncCrawlWatcher(batch_response['id'], self)
2961
+ else:
2962
+ raise Exception("Batch scrape job failed to start")
2963
+
2964
+ async def scrape_url(
2965
+ self,
2966
+ url: str,
2967
+ *,
2968
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
2969
+ include_tags: Optional[List[str]] = None,
2970
+ exclude_tags: Optional[List[str]] = None,
2971
+ only_main_content: Optional[bool] = None,
2972
+ wait_for: Optional[int] = None,
2973
+ timeout: Optional[int] = None,
2974
+ location: Optional[LocationConfig] = None,
2975
+ mobile: Optional[bool] = None,
2976
+ skip_tls_verification: Optional[bool] = None,
2977
+ remove_base64_images: Optional[bool] = None,
2978
+ block_ads: Optional[bool] = None,
2979
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
2980
+ parse_pdf: Optional[bool] = None,
2981
+ extract: Optional[JsonConfig] = None,
2982
+ json_options: Optional[JsonConfig] = None,
2983
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
2984
+ **kwargs) -> ScrapeResponse[Any]:
2985
+ """
2986
+ Scrape a single URL asynchronously.
2987
+
2988
+ Args:
2989
+ url (str): Target URL to scrape
2990
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]]): Content types to retrieve (markdown/html/etc.)
2991
+ include_tags (Optional[List[str]]): HTML tags to include
2992
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
2993
+ only_main_content (Optional[bool]): Extract main content only
2994
+ wait_for (Optional[int]): Time in milliseconds to wait before capturing the page
2995
+ timeout (Optional[int]): Request timeout (ms)
2996
+ location (Optional[LocationConfig]): Location configuration
2997
+ mobile (Optional[bool]): Use mobile user agent
2998
+ skip_tls_verification (Optional[bool]): Skip TLS verification
2999
+ remove_base64_images (Optional[bool]): Remove base64 images
3000
+ block_ads (Optional[bool]): Block ads
3001
+ proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
3002
+ extract (Optional[JsonConfig]): Content extraction settings
3003
+ json_options (Optional[JsonConfig]): JSON extraction settings
3004
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
3005
+ **kwargs: Additional parameters to pass to the API
3006
+
3007
+ Returns:
3008
+ ScrapeResponse with:
3009
+ * success - Whether scrape was successful
3010
+ * markdown - Markdown content if requested
3011
+ * html - HTML content if requested
3012
+ * rawHtml - Raw HTML content if requested
3013
+ * links - Extracted links if requested
3014
+ * screenshot - Screenshot if requested
3015
+ * extract - Extracted data if requested
3016
+ * json - JSON data if requested
3017
+ * error - Error message if scrape failed
3018
+
3019
+ Raises:
3020
+ Exception: If scraping fails
3021
+ """
3022
+ # Validate any additional kwargs
3023
+ self._validate_kwargs(kwargs, "scrape_url")
3024
+
3025
+ headers = self._prepare_headers()
3026
+
3027
+ # Build scrape parameters
3028
+ scrape_params = {
3029
+ 'url': url,
3030
+ 'origin': f"python-sdk@{version}"
3031
+ }
3032
+
3033
+ # Add optional parameters if provided and not None
3034
+ if formats:
3035
+ scrape_params['formats'] = formats
3036
+ if include_tags:
3037
+ scrape_params['includeTags'] = include_tags
3038
+ if exclude_tags:
3039
+ scrape_params['excludeTags'] = exclude_tags
3040
+ if only_main_content is not None:
3041
+ scrape_params['onlyMainContent'] = only_main_content
3042
+ if wait_for:
3043
+ scrape_params['waitFor'] = wait_for
3044
+ if timeout:
3045
+ scrape_params['timeout'] = timeout
3046
+ if location:
3047
+ scrape_params['location'] = location.dict(exclude_none=True)
3048
+ if mobile is not None:
3049
+ scrape_params['mobile'] = mobile
3050
+ if skip_tls_verification is not None:
3051
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3052
+ if remove_base64_images is not None:
3053
+ scrape_params['removeBase64Images'] = remove_base64_images
3054
+ if block_ads is not None:
3055
+ scrape_params['blockAds'] = block_ads
3056
+ if proxy:
3057
+ scrape_params['proxy'] = proxy
3058
+ if parse_pdf is not None:
3059
+ scrape_params['parsePDF'] = parse_pdf
3060
+ if extract is not None:
3061
+ extract = self._ensure_schema_dict(extract)
3062
+ if isinstance(extract, dict) and "schema" in extract:
3063
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
3064
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3065
+ if json_options is not None:
3066
+ json_options = self._ensure_schema_dict(json_options)
3067
+ if isinstance(json_options, dict) and "schema" in json_options:
3068
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3069
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3070
+ if actions:
3071
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
3072
+
3073
+ if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
3074
+ scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
3075
+ if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
3076
+ scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
3077
+
3078
+ # Make async request
3079
+ endpoint = f'/v1/scrape'
3080
+ response = await self._async_post_request(
3081
+ f'{self.api_url}{endpoint}',
3082
+ scrape_params,
3083
+ headers
3084
+ )
3085
+
3086
+ if response.get('success') and 'data' in response:
3087
+ return ScrapeResponse(**response['data'])
3088
+ elif "error" in response:
3089
+ raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
3090
+ else:
3091
+ # Use the response content directly if possible, otherwise a generic message
3092
+ error_content = response.get('error', str(response))
3093
+ raise Exception(f'Failed to scrape URL. Error: {error_content}')
3094
+
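A minimal usage sketch for the async scrape_url above. It assumes the package re-exports AsyncFirecrawlApp at the top level and that the inherited FirecrawlApp constructor accepts an api_key; the key and URL are placeholders.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed top-level re-export

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    doc = await app.scrape_url(
        "https://example.com",            # placeholder URL
        formats=["markdown", "links"],
        only_main_content=True,
    )
    print(doc.markdown)

asyncio.run(main())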
3095
+ async def batch_scrape_urls(
3096
+ self,
3097
+ urls: List[str],
3098
+ *,
3099
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3100
+ headers: Optional[Dict[str, str]] = None,
3101
+ include_tags: Optional[List[str]] = None,
3102
+ exclude_tags: Optional[List[str]] = None,
3103
+ only_main_content: Optional[bool] = None,
3104
+ wait_for: Optional[int] = None,
3105
+ timeout: Optional[int] = None,
3106
+ location: Optional[LocationConfig] = None,
3107
+ mobile: Optional[bool] = None,
3108
+ skip_tls_verification: Optional[bool] = None,
3109
+ remove_base64_images: Optional[bool] = None,
3110
+ block_ads: Optional[bool] = None,
3111
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
3112
+ extract: Optional[JsonConfig] = None,
3113
+ json_options: Optional[JsonConfig] = None,
3114
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
3115
+ agent: Optional[AgentOptions] = None,
3116
+ poll_interval: Optional[int] = 2,
3117
+ idempotency_key: Optional[str] = None,
3118
+ **kwargs
3119
+ ) -> BatchScrapeStatusResponse:
3120
+ """
3121
+ Asynchronously scrape multiple URLs and monitor until completion.
3122
+
3123
+ Args:
3124
+ urls (List[str]): URLs to scrape
3125
+ formats (Optional[List[Literal]]): Content formats to retrieve
3126
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
3127
+ include_tags (Optional[List[str]]): HTML tags to include
3128
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
3129
+ only_main_content (Optional[bool]): Extract main content only
3130
+ wait_for (Optional[int]): Wait time in milliseconds
3131
+ timeout (Optional[int]): Request timeout in milliseconds
3132
+ location (Optional[LocationConfig]): Location configuration
3133
+ mobile (Optional[bool]): Use mobile user agent
3134
+ skip_tls_verification (Optional[bool]): Skip TLS verification
3135
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
3136
+ block_ads (Optional[bool]): Block advertisements
3137
+ proxy (Optional[Literal]): Proxy type to use
3138
+ extract (Optional[JsonConfig]): Content extraction config
3139
+ json_options (Optional[JsonConfig]): JSON extraction config
3140
+ actions (Optional[List[Union]]): Actions to perform
3141
+ agent (Optional[AgentOptions]): Agent configuration
3142
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
3143
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3144
+ **kwargs: Additional parameters to pass to the API
3145
+
3146
+ Returns:
3147
+ BatchScrapeStatusResponse with:
3148
+ * Scraping status and progress
3149
+ * Scraped content for each URL
3150
+ * Success/error information
3151
+
3152
+ Raises:
3153
+ Exception: If batch scrape fails
3154
+ """
3155
+ # Validate any additional kwargs
3156
+ self._validate_kwargs(kwargs, "batch_scrape_urls")
3157
+
3158
+ scrape_params = {}
3159
+
3160
+ # Add individual parameters
3161
+ if formats is not None:
3162
+ scrape_params['formats'] = formats
3163
+ if headers is not None:
3164
+ scrape_params['headers'] = headers
3165
+ if include_tags is not None:
3166
+ scrape_params['includeTags'] = include_tags
3167
+ if exclude_tags is not None:
3168
+ scrape_params['excludeTags'] = exclude_tags
3169
+ if only_main_content is not None:
3170
+ scrape_params['onlyMainContent'] = only_main_content
3171
+ if wait_for is not None:
3172
+ scrape_params['waitFor'] = wait_for
3173
+ if timeout is not None:
3174
+ scrape_params['timeout'] = timeout
3175
+ if location is not None:
3176
+ scrape_params['location'] = location.dict(exclude_none=True)
3177
+ if mobile is not None:
3178
+ scrape_params['mobile'] = mobile
3179
+ if skip_tls_verification is not None:
3180
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3181
+ if remove_base64_images is not None:
3182
+ scrape_params['removeBase64Images'] = remove_base64_images
3183
+ if block_ads is not None:
3184
+ scrape_params['blockAds'] = block_ads
3185
+ if proxy is not None:
3186
+ scrape_params['proxy'] = proxy
3187
+ if extract is not None:
3188
+ extract = self._ensure_schema_dict(extract)
3189
+ if isinstance(extract, dict) and "schema" in extract:
3190
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
3191
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3192
+ if json_options is not None:
3193
+ json_options = self._ensure_schema_dict(json_options)
3194
+ if isinstance(json_options, dict) and "schema" in json_options:
3195
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3196
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3197
+ if actions is not None:
3198
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3199
+ if agent is not None:
3200
+ scrape_params['agent'] = agent.dict(exclude_none=True)
3201
+
3202
+ # Add any additional kwargs
3203
+ scrape_params.update(kwargs)
3204
+
3205
+ # Create final params object
3206
+ final_params = ScrapeParams(**scrape_params)
3207
+ params_dict = final_params.dict(exclude_none=True)
3208
+ params_dict['urls'] = urls
3209
+ params_dict['origin'] = f"python-sdk@{version}"
3210
+
3211
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3212
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3213
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3214
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3215
+
3216
+ # Make request
3217
+ headers = self._prepare_headers(idempotency_key)
3218
+ response = await self._async_post_request(
3219
+ f'{self.api_url}/v1/batch/scrape',
3220
+ params_dict,
3221
+ headers
3222
+ )
3223
+
3224
+ if response.get('success'):
3225
+ try:
3226
+ id = response.get('id')
3227
+ except:
3228
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3229
+ return await self._async_monitor_job_status(id, headers, poll_interval)
3230
+ else:
3231
+ raise Exception(f'Failed to start batch scrape job. Error: {response.get("error", response)}')
3232
+
3233
+
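An illustrative call to the waiting batch_scrape_urls above, under the same import and constructor assumptions as the earlier sketch; it blocks (asynchronously) until the whole batch finishes.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed top-level re-export

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    job = await app.batch_scrape_urls(
        ["https://example.com/a", "https://example.com/b"],  # placeholder URLs
        formats=["markdown"],
        poll_interval=5,
    )
    print(job.status, job.completed, "of", job.total, "pages scraped")

asyncio.run(main())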
3234
+ async def async_batch_scrape_urls(
3235
+ self,
3236
+ urls: List[str],
3237
+ *,
3238
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3239
+ headers: Optional[Dict[str, str]] = None,
3240
+ include_tags: Optional[List[str]] = None,
3241
+ exclude_tags: Optional[List[str]] = None,
3242
+ only_main_content: Optional[bool] = None,
3243
+ wait_for: Optional[int] = None,
3244
+ timeout: Optional[int] = None,
3245
+ location: Optional[LocationConfig] = None,
3246
+ mobile: Optional[bool] = None,
3247
+ skip_tls_verification: Optional[bool] = None,
3248
+ remove_base64_images: Optional[bool] = None,
3249
+ block_ads: Optional[bool] = None,
3250
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
3251
+ extract: Optional[JsonConfig] = None,
3252
+ json_options: Optional[JsonConfig] = None,
3253
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
3254
+ agent: Optional[AgentOptions] = None,
3255
+ zero_data_retention: Optional[bool] = None,
3256
+ idempotency_key: Optional[str] = None,
3257
+ **kwargs
3258
+ ) -> BatchScrapeResponse:
3259
+ """
3260
+ Initiate a batch scrape job asynchronously.
3261
+
3262
+ Args:
3263
+ urls (List[str]): URLs to scrape
3264
+ formats (Optional[List[Literal]]): Content formats to retrieve
3265
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
3266
+ include_tags (Optional[List[str]]): HTML tags to include
3267
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
3268
+ only_main_content (Optional[bool]): Extract main content only
3269
+ wait_for (Optional[int]): Wait time in milliseconds
3270
+ timeout (Optional[int]): Request timeout in milliseconds
3271
+ location (Optional[LocationConfig]): Location configuration
3272
+ mobile (Optional[bool]): Use mobile user agent
3273
+ skip_tls_verification (Optional[bool]): Skip TLS verification
3274
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
3275
+ block_ads (Optional[bool]): Block advertisements
3276
+ proxy (Optional[Literal]): Proxy type to use
3277
+ extract (Optional[JsonConfig]): Content extraction config
3278
+ json_options (Optional[JsonConfig]): JSON extraction config
3279
+ actions (Optional[List[Union]]): Actions to perform
3280
+ agent (Optional[AgentOptions]): Agent configuration
3281
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
3282
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3283
+ **kwargs: Additional parameters to pass to the API
3284
+
3285
+ Returns:
3286
+ BatchScrapeResponse with:
3287
+ * success - Whether job started successfully
3288
+ * id - Unique identifier for the job
3289
+ * url - Status check URL
3290
+ * error - Error message if start failed
3291
+
3292
+ Raises:
3293
+ Exception: If job initiation fails
3294
+ """
3295
+ # Validate any additional kwargs
3296
+ self._validate_kwargs(kwargs, "async_batch_scrape_urls")
3297
+
3298
+ scrape_params = {}
3299
+
3300
+ # Add individual parameters
3301
+ if formats is not None:
3302
+ scrape_params['formats'] = formats
3303
+ if headers is not None:
3304
+ scrape_params['headers'] = headers
3305
+ if include_tags is not None:
3306
+ scrape_params['includeTags'] = include_tags
3307
+ if exclude_tags is not None:
3308
+ scrape_params['excludeTags'] = exclude_tags
3309
+ if only_main_content is not None:
3310
+ scrape_params['onlyMainContent'] = only_main_content
3311
+ if wait_for is not None:
3312
+ scrape_params['waitFor'] = wait_for
3313
+ if timeout is not None:
3314
+ scrape_params['timeout'] = timeout
3315
+ if location is not None:
3316
+ scrape_params['location'] = location.dict(exclude_none=True)
3317
+ if mobile is not None:
3318
+ scrape_params['mobile'] = mobile
3319
+ if skip_tls_verification is not None:
3320
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3321
+ if remove_base64_images is not None:
3322
+ scrape_params['removeBase64Images'] = remove_base64_images
3323
+ if block_ads is not None:
3324
+ scrape_params['blockAds'] = block_ads
3325
+ if proxy is not None:
3326
+ scrape_params['proxy'] = proxy
3327
+ if extract is not None:
3328
+ extract = self._ensure_schema_dict(extract)
3329
+ if isinstance(extract, dict) and "schema" in extract:
3330
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
3331
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3332
+ if json_options is not None:
3333
+ json_options = self._ensure_schema_dict(json_options)
3334
+ if isinstance(json_options, dict) and "schema" in json_options:
3335
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3336
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3337
+ if actions is not None:
3338
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3339
+ if agent is not None:
3340
+ scrape_params['agent'] = agent.dict(exclude_none=True)
3341
+ if zero_data_retention is not None:
3342
+ scrape_params['zeroDataRetention'] = zero_data_retention
3343
+
3344
+ # Add any additional kwargs
3345
+ scrape_params.update(kwargs)
3346
+
3347
+ # Create final params object
3348
+ final_params = ScrapeParams(**scrape_params)
3349
+ params_dict = final_params.dict(exclude_none=True)
3350
+ params_dict['urls'] = urls
3351
+ params_dict['origin'] = f"python-sdk@{version}"
3352
+
3353
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3354
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3355
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3356
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3357
+
3358
+ # Make request
3359
+ headers = self._prepare_headers(idempotency_key)
3360
+ response = await self._async_post_request(
3361
+ f'{self.api_url}/v1/batch/scrape',
3362
+ params_dict,
3363
+ headers
3364
+ )
3365
+
3366
+ if response.get('success'):
3367
+ try:
3368
+ return BatchScrapeResponse(**response)
3369
+ except:
3370
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3371
+ else:
3372
+ raise Exception(f'Failed to start batch scrape job. Error: {response.get("error", response)}')
3373
+
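By contrast, async_batch_scrape_urls only starts the job and returns its id, leaving polling to the caller. A sketch of that pattern, under the same assumptions as the earlier examples and with an arbitrary polling cadence:

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed top-level re-export

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    started = await app.async_batch_scrape_urls(["https://example.com"], formats=["markdown"])
    while True:
        status = await app.check_batch_scrape_status(started.id)
        if status.status in ("completed", "failed", "cancelled"):
            break
        await asyncio.sleep(5)  # arbitrary polling cadence
    print(status.status)

asyncio.run(main())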
3374
+ async def crawl_url(
3375
+ self,
3376
+ url: str,
3377
+ *,
3378
+ include_paths: Optional[List[str]] = None,
3379
+ exclude_paths: Optional[List[str]] = None,
3380
+ max_depth: Optional[int] = None,
3381
+ max_discovery_depth: Optional[int] = None,
3382
+ limit: Optional[int] = None,
3383
+ allow_backward_links: Optional[bool] = None,
3384
+ crawl_entire_domain: Optional[bool] = None,
3385
+ allow_external_links: Optional[bool] = None,
3386
+ ignore_sitemap: Optional[bool] = None,
3387
+ scrape_options: Optional[ScrapeOptions] = None,
3388
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3389
+ deduplicate_similar_urls: Optional[bool] = None,
3390
+ ignore_query_parameters: Optional[bool] = None,
3391
+ regex_on_full_url: Optional[bool] = None,
3392
+ delay: Optional[int] = None,
3393
+ allow_subdomains: Optional[bool] = None,
3394
+ poll_interval: Optional[int] = 2,
3395
+ idempotency_key: Optional[str] = None,
3396
+ **kwargs
3397
+ ) -> CrawlStatusResponse:
3398
+ """
3399
+ Crawl a website starting from a URL.
3400
+
3401
+ Args:
3402
+ url (str): Target URL to start crawling from
3403
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3404
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3405
+ max_depth (Optional[int]): Maximum crawl depth
3406
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3407
+ limit (Optional[int]): Maximum pages to crawl
3408
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
3409
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
3410
+ allow_external_links (Optional[bool]): Follow external domain links
3411
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3412
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3413
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3414
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3415
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3416
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3417
+ delay (Optional[int]): Delay in seconds between scrapes
3418
+ allow_subdomains (Optional[bool]): Follow subdomains
3419
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
3420
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3421
+ **kwargs: Additional parameters to pass to the API
3422
+
3423
+ Returns:
3424
+ CrawlStatusResponse with:
3425
+ * Crawling status and progress
3426
+ * Crawled page contents
3427
+ * Success/error information
3428
+
3429
+ Raises:
3430
+ Exception: If crawl fails
3431
+ """
3432
+ # Validate any additional kwargs
3433
+ self._validate_kwargs(kwargs, "crawl_url")
3434
+
3435
+ crawl_params = {}
3436
+
3437
+ # Add individual parameters
3438
+ if include_paths is not None:
3439
+ crawl_params['includePaths'] = include_paths
3440
+ if exclude_paths is not None:
3441
+ crawl_params['excludePaths'] = exclude_paths
3442
+ if max_depth is not None:
3443
+ crawl_params['maxDepth'] = max_depth
3444
+ if max_discovery_depth is not None:
3445
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3446
+ if limit is not None:
3447
+ crawl_params['limit'] = limit
3448
+ if crawl_entire_domain is not None:
3449
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
3450
+ elif allow_backward_links is not None:
3451
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3452
+ if allow_external_links is not None:
3453
+ crawl_params['allowExternalLinks'] = allow_external_links
3454
+ if ignore_sitemap is not None:
3455
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3456
+ if scrape_options is not None:
3457
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3458
+ if webhook is not None:
3459
+ crawl_params['webhook'] = webhook
3460
+ if deduplicate_similar_urls is not None:
3461
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3462
+ if ignore_query_parameters is not None:
3463
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3464
+ if regex_on_full_url is not None:
3465
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3466
+ if delay is not None:
3467
+ crawl_params['delay'] = delay
3468
+ if allow_subdomains is not None:
3469
+ crawl_params['allowSubdomains'] = allow_subdomains
3470
+
3471
+ # Add any additional kwargs
3472
+ crawl_params.update(kwargs)
3473
+
3474
+ # Create final params object
3475
+ final_params = CrawlParams(**crawl_params)
3476
+ params_dict = final_params.dict(exclude_none=True)
3477
+ params_dict['url'] = url
3478
+ params_dict['origin'] = f"python-sdk@{version}"
3479
+ # Make request
3480
+ headers = self._prepare_headers(idempotency_key)
3481
+ response = await self._async_post_request(
3482
+ f'{self.api_url}/v1/crawl', params_dict, headers)
3483
+
3484
+ if response.get('success'):
3485
+ try:
3486
+ id = response.get('id')
3487
+ except:
3488
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3489
+ return await self._async_monitor_job_status(id, headers, poll_interval)
3490
+ else:
3491
+ raise Exception(f'Failed to start crawl job. Error: {response.get("error", response)}')
3492
+
3493
+
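A crawl_url sketch under the same assumptions; the limit and exclude_paths values are arbitrary illustrative choices.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed top-level re-export

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    crawl = await app.crawl_url(
        "https://example.com",     # placeholder URL
        limit=10,
        exclude_paths=["/blog/.*"],
        poll_interval=5,
    )
    print(crawl.status, len(crawl.data or []), "pages crawled")

asyncio.run(main())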
3494
+ async def async_crawl_url(
3495
+ self,
3496
+ url: str,
3497
+ *,
3498
+ include_paths: Optional[List[str]] = None,
3499
+ exclude_paths: Optional[List[str]] = None,
3500
+ max_depth: Optional[int] = None,
3501
+ max_discovery_depth: Optional[int] = None,
3502
+ limit: Optional[int] = None,
3503
+ allow_backward_links: Optional[bool] = None,
3504
+ crawl_entire_domain: Optional[bool] = None,
3505
+ allow_external_links: Optional[bool] = None,
3506
+ ignore_sitemap: Optional[bool] = None,
3507
+ scrape_options: Optional[ScrapeOptions] = None,
3508
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3509
+ deduplicate_similar_urls: Optional[bool] = None,
3510
+ ignore_query_parameters: Optional[bool] = None,
3511
+ regex_on_full_url: Optional[bool] = None,
3512
+ delay: Optional[int] = None,
3513
+ allow_subdomains: Optional[bool] = None,
3514
+ poll_interval: Optional[int] = 2,
3515
+ idempotency_key: Optional[str] = None,
3516
+ **kwargs
3517
+ ) -> CrawlResponse:
3518
+ """
3519
+ Start an asynchronous crawl job.
3520
+
3521
+ Args:
3522
+ url (str): Target URL to start crawling from
3523
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3524
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3525
+ max_depth (Optional[int]): Maximum crawl depth
3526
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3527
+ limit (Optional[int]): Maximum pages to crawl
3528
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
3529
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
3530
+ allow_external_links (Optional[bool]): Follow external domain links
3531
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3532
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3533
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3534
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3535
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3536
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3537
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3538
+ **kwargs: Additional parameters to pass to the API
3539
+
3540
+ Returns:
3541
+ CrawlResponse with:
3542
+ * success - Whether crawl started successfully
3543
+ * id - Unique identifier for the crawl job
3544
+ * url - Status check URL for the crawl
3545
+ * error - Error message if start failed
3546
+
3547
+ Raises:
3548
+ Exception: If crawl initiation fails
3549
+ """
3550
+ crawl_params = {}
3551
+
3552
+ # Add individual parameters
3553
+ if include_paths is not None:
3554
+ crawl_params['includePaths'] = include_paths
3555
+ if exclude_paths is not None:
3556
+ crawl_params['excludePaths'] = exclude_paths
3557
+ if max_depth is not None:
3558
+ crawl_params['maxDepth'] = max_depth
3559
+ if max_discovery_depth is not None:
3560
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3561
+ if limit is not None:
3562
+ crawl_params['limit'] = limit
3563
+ if crawl_entire_domain is not None:
3564
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
3565
+ elif allow_backward_links is not None:
3566
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3567
+ if allow_external_links is not None:
3568
+ crawl_params['allowExternalLinks'] = allow_external_links
3569
+ if ignore_sitemap is not None:
3570
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3571
+ if scrape_options is not None:
3572
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3573
+ if webhook is not None:
3574
+ crawl_params['webhook'] = webhook
3575
+ if deduplicate_similar_urls is not None:
3576
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3577
+ if ignore_query_parameters is not None:
3578
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3579
+ if regex_on_full_url is not None:
3580
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3581
+ if delay is not None:
3582
+ crawl_params['delay'] = delay
3583
+ if allow_subdomains is not None:
3584
+ crawl_params['allowSubdomains'] = allow_subdomains
3585
+
3586
+ # Add any additional kwargs
3587
+ crawl_params.update(kwargs)
3588
+
3589
+ # Create final params object
3590
+ final_params = CrawlParams(**crawl_params)
3591
+ params_dict = final_params.dict(exclude_none=True)
3592
+ params_dict['url'] = url
3593
+ params_dict['origin'] = f"python-sdk@{version}"
3594
+
3595
+ # Make request
3596
+ headers = self._prepare_headers(idempotency_key)
3597
+ response = await self._async_post_request(
3598
+ f'{self.api_url}/v1/crawl',
3599
+ params_dict,
3600
+ headers
3601
+ )
3602
+
3603
+ if response.get('success'):
3604
+ try:
3605
+ return CrawlResponse(**response)
3606
+ except:
3607
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3608
+ else:
3609
+ raise Exception(f'Failed to start crawl job. Error: {response.get("error", response)}')
3610
+
3611
+ async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
3612
+ """
3613
+ Check the status and results of an asynchronous crawl job.
3614
+
3615
+ Args:
3616
+ id (str): Unique identifier for the crawl job
3617
+
3618
+ Returns:
3619
+ CrawlStatusResponse containing:
3620
+ Status Information:
3621
+ * status - Current state (scraping/completed/failed/cancelled)
3622
+ * completed - Number of pages crawled
3623
+ * total - Total pages to crawl
3624
+ * creditsUsed - API credits consumed
3625
+ * expiresAt - Data expiration timestamp
3626
+
3627
+ Results:
3628
+ * data - List of crawled documents
3629
+ * next - URL for next page of results (if paginated)
3630
+ * success - Whether status check succeeded
3631
+ * error - Error message if failed
3632
+
3633
+ Raises:
3634
+ Exception: If status check fails
3635
+ """
3636
+ headers = self._prepare_headers()
3637
+ endpoint = f'/v1/crawl/{id}'
3638
+
3639
+ status_data = await self._async_get_request(
3640
+ f'{self.api_url}{endpoint}',
3641
+ headers
3642
+ )
3643
+
3644
+ if status_data.get('status') == 'completed':
3645
+ if 'data' in status_data:
3646
+ data = status_data['data']
3647
+ while 'next' in status_data:
3648
+ if len(status_data['data']) == 0:
3649
+ break
3650
+ next_url = status_data.get('next')
3651
+ if not next_url:
3652
+ logger.warning("Expected 'next' URL is missing.")
3653
+ break
3654
+ next_data = await self._async_get_request(next_url, headers)
3655
+ data.extend(next_data.get('data', []))
3656
+ status_data = next_data
3657
+ status_data['data'] = data
3658
+ # Create CrawlStatusResponse object from status data
3659
+ response = CrawlStatusResponse(
3660
+ status=status_data.get('status'),
3661
+ total=status_data.get('total'),
3662
+ completed=status_data.get('completed'),
3663
+ creditsUsed=status_data.get('creditsUsed'),
3664
+ expiresAt=status_data.get('expiresAt'),
3665
+ data=status_data.get('data'),
3666
+ success=False if 'error' in status_data else True
3667
+ )
3668
+
3669
+ if 'error' in status_data:
3670
+ response.error = status_data.get('error')
3671
+
3672
+ if 'next' in status_data:
3673
+ response.next = status_data.get('next')
3674
+
3675
+ return response
3676
+
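The non-blocking counterpart: async_crawl_url starts the job, check_crawl_status inspects it, and cancel_crawl (defined further below) can stop it. A sketch under the same assumptions:

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed top-level re-export

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    started = await app.async_crawl_url("https://example.com", limit=25)
    status = await app.check_crawl_status(started.id)
    print(status.status, status.completed, "of", status.total)
    if status.status not in ("completed", "failed", "cancelled"):
        await app.cancel_crawl(started.id)  # stop the job early if it is no longer needed

asyncio.run(main())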
3677
+ async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
3678
+ """
3679
+ Monitor the status of an asynchronous job until completion.
3680
+
3681
+ Args:
3682
+ id (str): The ID of the job to monitor
3683
+ headers (Dict[str, str]): Headers to include in status check requests
3684
+ poll_interval (int): Seconds between status checks (default: 2)
3685
+
3686
+ Returns:
3687
+ CrawlStatusResponse: The job results if completed successfully
3688
+
3689
+ Raises:
3690
+ Exception: If the job fails or an error occurs during status checks
3691
+ """
3692
+ while True:
3693
+ status_data = await self._async_get_request(
3694
+ f'{self.api_url}/v1/crawl/{id}',
3695
+ headers
3696
+ )
3697
+
3698
+ if status_data.get('status') == 'completed':
3699
+ if 'data' in status_data:
3700
+ data = status_data['data']
3701
+ while 'next' in status_data:
3702
+ if len(status_data['data']) == 0:
3703
+ break
3704
+ next_url = status_data.get('next')
3705
+ if not next_url:
3706
+ logger.warning("Expected 'next' URL is missing.")
3707
+ break
3708
+ next_data = await self._async_get_request(next_url, headers)
3709
+ data.extend(next_data.get('data', []))
3710
+ status_data = next_data
3711
+ status_data['data'] = data
3712
+ return CrawlStatusResponse(**status_data)
3713
+ else:
3714
+ raise Exception('Job completed but no data was returned')
3715
+ elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
3716
+ await asyncio.sleep(max(poll_interval, 2))
3717
+ else:
3718
+ raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
3719
+
3720
+ async def map_url(
3721
+ self,
3722
+ url: str,
3723
+ *,
3724
+ search: Optional[str] = None,
3725
+ ignore_sitemap: Optional[bool] = None,
3726
+ include_subdomains: Optional[bool] = None,
3727
+ sitemap_only: Optional[bool] = None,
3728
+ limit: Optional[int] = None,
3729
+ timeout: Optional[int] = None,
3730
+ params: Optional[MapParams] = None) -> MapResponse:
3731
+ """
3732
+ Asynchronously map and discover links from a URL.
3733
+
3734
+ Args:
3735
+ url (str): Target URL to map
3736
+ params (Optional[MapParams]): See MapParams model (the same options can also be passed as individual keyword arguments):
3737
+ Discovery Options:
3738
+ * search - Filter pattern for URLs
3739
+ * ignoreSitemap - Skip sitemap.xml
3740
+ * includeSubdomains - Include subdomain links
3741
+ * sitemapOnly - Only use sitemap.xml
3742
+
3743
+ Limits:
3744
+ * limit - Max URLs to return
3745
+ * timeout - Request timeout (ms)
3746
+
3747
+ Returns:
3748
+ MapResponse with:
3749
+ * Discovered URLs
3750
+ * Success/error status
3751
+
3752
+ Raises:
3753
+ Exception: If mapping fails
3754
+ """
3755
+ map_params = {}
3756
+ if params:
3757
+ map_params.update(params.dict(exclude_none=True))
3758
+
3759
+ # Add individual parameters
3760
+ if search is not None:
3761
+ map_params['search'] = search
3762
+ if ignore_sitemap is not None:
3763
+ map_params['ignoreSitemap'] = ignore_sitemap
3764
+ if include_subdomains is not None:
3765
+ map_params['includeSubdomains'] = include_subdomains
3766
+ if sitemap_only is not None:
3767
+ map_params['sitemapOnly'] = sitemap_only
3768
+ if limit is not None:
3769
+ map_params['limit'] = limit
3770
+ if timeout is not None:
3771
+ map_params['timeout'] = timeout
3772
+
3773
+ # Create final params object
3774
+ final_params = MapParams(**map_params)
3775
+ params_dict = final_params.dict(exclude_none=True)
3776
+ params_dict['url'] = url
3777
+ params_dict['origin'] = f"python-sdk@{version}"
3778
+
3779
+ # Make request
3780
+ endpoint = f'/v1/map'
3781
+ response = await self._async_post_request(
3782
+ f'{self.api_url}{endpoint}',
3783
+ params_dict,
3784
+ headers={"Authorization": f"Bearer {self.api_key}"}
3785
+ )
3786
+
3787
+ if response.get('success') and 'links' in response:
3788
+ return MapResponse(**response)
3789
+ elif 'error' in response:
3790
+ raise Exception(f'Failed to map URL. Error: {response["error"]}')
3791
+ else:
3792
+ raise Exception(f'Failed to map URL. Error: {response}')
3793
+
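A map_url sketch under the same assumptions; the search filter and limit are arbitrary.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed top-level re-export

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    result = await app.map_url("https://example.com", search="docs", limit=100)
    for link in result.links or []:
        print(link)

asyncio.run(main())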
3794
+ async def extract(
3795
+ self,
3796
+ urls: Optional[List[str]] = None,
3797
+ *,
3798
+ prompt: Optional[str] = None,
3799
+ schema: Optional[Any] = None,
3800
+ system_prompt: Optional[str] = None,
3801
+ allow_external_links: Optional[bool] = False,
3802
+ enable_web_search: Optional[bool] = False,
3803
+ show_sources: Optional[bool] = False,
3804
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3805
+
3806
+ """
3807
+ Asynchronously extract structured information from URLs.
3808
+
3809
+ Args:
3810
+ urls (Optional[List[str]]): URLs to extract from
3811
+ prompt (Optional[str]): Custom extraction prompt
3812
+ schema (Optional[Any]): JSON schema/Pydantic model
3813
+ system_prompt (Optional[str]): System context
3814
+ allow_external_links (Optional[bool]): Follow external links
3815
+ enable_web_search (Optional[bool]): Enable web search
3816
+ show_sources (Optional[bool]): Include source URLs
3817
+ agent (Optional[Dict[str, Any]]): Agent configuration
3818
+
3819
+ Returns:
3820
+ ExtractResponse with:
3821
+ * Structured data matching schema
3822
+ * Source information if requested
3823
+ * Success/error status
3824
+
3825
+ Raises:
3826
+ ValueError: If prompt/schema missing or extraction fails
3827
+ """
3828
+ headers = self._prepare_headers()
3829
+
3830
+ if not prompt and not schema:
3831
+ raise ValueError("Either prompt or schema is required")
3832
+
3833
+ if not urls and not prompt:
3834
+ raise ValueError("Either urls or prompt is required")
3835
+
3836
+ if schema:
3837
+ schema = self._ensure_schema_dict(schema)
3838
+
3839
+ request_data = {
3840
+ 'urls': urls or [],
3841
+ 'allowExternalLinks': allow_external_links,
3842
+ 'enableWebSearch': enable_web_search,
3843
+ 'showSources': show_sources,
3844
+ 'schema': schema,
3845
+ 'origin': f'python-sdk@{version}'
3846
+ }
3847
+
3848
+ # Only add prompt and systemPrompt if they exist
3849
+ if prompt:
3850
+ request_data['prompt'] = prompt
3851
+ if system_prompt:
3852
+ request_data['systemPrompt'] = system_prompt
3853
+
3854
+ if agent:
3855
+ request_data['agent'] = agent
3856
+
3857
+ response = await self._async_post_request(
3858
+ f'{self.api_url}/v1/extract',
3859
+ request_data,
3860
+ headers
3861
+ )
3862
+
3863
+ if response.get('success'):
3864
+ job_id = response.get('id')
3865
+ if not job_id:
3866
+ raise Exception('Job ID not returned from extract request.')
3867
+
3868
+ while True:
3869
+ status_data = await self._async_get_request(
3870
+ f'{self.api_url}/v1/extract/{job_id}',
3871
+ headers
3872
+ )
3873
+
3874
+ if status_data['status'] == 'completed':
3875
+ return ExtractResponse(**status_data)
3876
+ elif status_data['status'] in ['failed', 'cancelled']:
3877
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data.get("error")}')
3878
+
3879
+ await asyncio.sleep(2)
3880
+ else:
3881
+ raise Exception(f'Failed to extract. Error: {response.get("error")}')
3882
+
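An extract sketch under the same assumptions; the schema here is an arbitrary JSON-schema dict (the docstring above notes that a Pydantic model is also accepted).

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed top-level re-export

PRICE_SCHEMA = {
    "type": "object",
    "properties": {"product": {"type": "string"}, "price": {"type": "number"}},
    "required": ["product", "price"],
}

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    result = await app.extract(
        ["https://example.com/product"],   # placeholder URL
        prompt="Extract the product name and price.",
        schema=PRICE_SCHEMA,
    )
    print(result.data)

asyncio.run(main())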
3883
+ async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
3884
+ """
3885
+ Check the status of an asynchronous batch scrape job.
3886
+
3887
+ Args:
3888
+ id (str): The ID of the batch scrape job
3889
+
3890
+ Returns:
3891
+ BatchScrapeStatusResponse containing:
3892
+ Status Information:
3893
+ * status - Current state (scraping/completed/failed/cancelled)
3894
+ * completed - Number of URLs scraped
3895
+ * total - Total URLs to scrape
3896
+ * creditsUsed - API credits consumed
3897
+ * expiresAt - Data expiration timestamp
3898
+
3899
+ Results:
3900
+ * data - List of scraped documents
3901
+ * next - URL for next page of results (if paginated)
3902
+ * success - Whether status check succeeded
3903
+ * error - Error message if failed
3904
+
3905
+ Raises:
3906
+ Exception: If status check fails
3907
+ """
3908
+ headers = self._prepare_headers()
3909
+ endpoint = f'/v1/batch/scrape/{id}'
3910
+
3911
+ status_data = await self._async_get_request(
3912
+ f'{self.api_url}{endpoint}',
3913
+ headers
3914
+ )
3915
+
3916
+ if status_data['status'] == 'completed':
3917
+ if 'data' in status_data:
3918
+ data = status_data['data']
3919
+ while 'next' in status_data:
3920
+ if len(status_data['data']) == 0:
3921
+ break
3922
+ next_url = status_data.get('next')
3923
+ if not next_url:
3924
+ logger.warning("Expected 'next' URL is missing.")
3925
+ break
3926
+ next_data = await self._async_get_request(next_url, headers)
3927
+ data.extend(next_data.get('data', []))
3928
+ status_data = next_data
3929
+ status_data['data'] = data
3930
+
3931
+ response = BatchScrapeStatusResponse(
3932
+ status=status_data.get('status'),
3933
+ total=status_data.get('total'),
3934
+ completed=status_data.get('completed'),
3935
+ creditsUsed=status_data.get('creditsUsed'),
3936
+ expiresAt=status_data.get('expiresAt'),
3937
+ data=status_data.get('data'),
3938
+ success=False if 'error' in status_data else True
+ )
3939
+
3940
+ if 'error' in status_data:
3941
+ response.error = status_data['error']
3942
+
3943
+ if 'next' in status_data:
3944
+ response.next = status_data['next']
3945
+
3946
+ return response
3950
+
3951
+ async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
3952
+ """
3953
+ Get information about errors from an asynchronous batch scrape job.
3954
+
3955
+ Args:
3956
+ id (str): The ID of the batch scrape job
3957
+
3958
+ Returns:
3959
+ CrawlErrorsResponse containing:
3960
+ errors (List[Dict[str, str]]): List of errors with fields:
3961
+ * id (str): Error ID
3962
+ * timestamp (str): When the error occurred
3963
+ * url (str): URL that caused the error
3964
+ * error (str): Error message
3965
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3966
+
3967
+ Raises:
3968
+ Exception: If error check fails
3969
+ """
3970
+ headers = self._prepare_headers()
3971
+ return await self._async_get_request(
3972
+ f'{self.api_url}/v1/batch/scrape/{id}/errors',
3973
+ headers
3974
+ )
3975
+
3976
+ async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
3977
+ """
3978
+ Get information about errors from an asynchronous crawl job.
3979
+
3980
+ Args:
3981
+ id (str): The ID of the crawl job
3982
+
3983
+ Returns:
3984
+ CrawlErrorsResponse containing:
3985
+ * errors (List[Dict[str, str]]): List of errors with fields:
3986
+ - id (str): Error ID
3987
+ - timestamp (str): When the error occurred
3988
+ - url (str): URL that caused the error
3989
+ - error (str): Error message
3990
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3991
+
3992
+ Raises:
3993
+ Exception: If error check fails
3994
+ """
3995
+ headers = self._prepare_headers()
3996
+ return await self._async_get_request(
3997
+ f'{self.api_url}/v1/crawl/{id}/errors',
3998
+ headers
3999
+ )
4000
+
4001
+ async def cancel_crawl(self, id: str) -> Dict[str, Any]:
4002
+ """
4003
+ Cancel an asynchronous crawl job.
4004
+
4005
+ Args:
4006
+ id (str): The ID of the crawl job to cancel
4007
+
4008
+ Returns:
4009
+ Dict[str, Any] containing:
4010
+ * success (bool): Whether cancellation was successful
4011
+ * error (str, optional): Error message if cancellation failed
4012
+
4013
+ Raises:
4014
+ Exception: If cancellation fails
4015
+ """
4016
+ headers = self._prepare_headers()
4017
+ async with aiohttp.ClientSession() as session:
4018
+ async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
4019
+ return await response.json()
4020
+
4021
+ async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
4022
+ """
4023
+ Check the status of an asynchronous extraction job.
4024
+
4025
+ Args:
4026
+ job_id (str): The ID of the extraction job
4027
+
4028
+ Returns:
4029
+ ExtractResponse[Any] with:
4030
+ * success (bool): Whether request succeeded
4031
+ * data (Optional[Any]): Extracted data matching schema
4032
+ * error (Optional[str]): Error message if any
4033
+ * warning (Optional[str]): Warning message if any
4034
+ * sources (Optional[List[str]]): Source URLs if requested
4035
+
4036
+ Raises:
4037
+ ValueError: If status check fails
4038
+ """
4039
+ headers = self._prepare_headers()
4040
+ try:
4041
+ return await self._async_get_request(
4042
+ f'{self.api_url}/v1/extract/{job_id}',
4043
+ headers
4044
+ )
4045
+ except Exception as e:
4046
+ raise ValueError(str(e))
4047
+
4048
+ async def async_extract(
4049
+ self,
4050
+ urls: Optional[List[str]] = None,
4051
+ *,
4052
+ prompt: Optional[str] = None,
4053
+ schema: Optional[Any] = None,
4054
+ system_prompt: Optional[str] = None,
4055
+ allow_external_links: Optional[bool] = False,
4056
+ enable_web_search: Optional[bool] = False,
4057
+ show_sources: Optional[bool] = False,
4058
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
4059
+ """
4060
+ Initiate an asynchronous extraction job without waiting for completion.
4061
+
4062
+ Args:
4063
+ urls (Optional[List[str]]): URLs to extract from
4064
+ prompt (Optional[str]): Custom extraction prompt
4065
+ schema (Optional[Any]): JSON schema/Pydantic model
4066
+ system_prompt (Optional[str]): System context
4067
+ allow_external_links (Optional[bool]): Follow external links
4068
+ enable_web_search (Optional[bool]): Enable web search
4069
+ show_sources (Optional[bool]): Include source URLs
4070
+ agent (Optional[Dict[str, Any]]): Agent configuration
4072
+
4073
+ Returns:
4074
+ ExtractResponse[Any] with:
4075
+ * success (bool): Whether request succeeded
4076
+ * data (Optional[Any]): Extracted data matching schema
4077
+ * error (Optional[str]): Error message if any
4078
+
4079
+ Raises:
4080
+ ValueError: If job initiation fails
4081
+ """
4082
+ headers = self._prepare_headers()
4083
+
4084
+ if not prompt and not schema:
4085
+ raise ValueError("Either prompt or schema is required")
4086
+
4087
+ if not urls and not prompt:
4088
+ raise ValueError("Either urls or prompt is required")
4089
+
4090
+ if schema:
4091
+ schema = self._ensure_schema_dict(schema)
4092
+
4093
+ request_data = {
4094
+ 'urls': urls or [],
4095
+ 'allowExternalLinks': allow_external_links,
4096
+ 'enableWebSearch': enable_web_search,
4097
+ 'showSources': show_sources,
4098
+ 'schema': schema,
4099
+ 'origin': f'python-sdk@{version}'
4100
+ }
4101
+
4102
+ if prompt:
4103
+ request_data['prompt'] = prompt
4104
+ if system_prompt:
4105
+ request_data['systemPrompt'] = system_prompt
4106
+ if agent:
4107
+ request_data['agent'] = agent
4108
+
4109
+ try:
4110
+ return await self._async_post_request(
4111
+ f'{self.api_url}/v1/extract',
4112
+ request_data,
4113
+ headers
4114
+ )
4115
+ except Exception as e:
4116
+ raise ValueError(str(e))
4117
+
4118
+ async def generate_llms_text(
4119
+ self,
4120
+ url: str,
4121
+ *,
4122
+ max_urls: Optional[int] = None,
4123
+ show_full_text: Optional[bool] = None,
4124
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
4125
+ """
4126
+ Generate LLMs.txt for a given URL and monitor until completion.
4127
+
4128
+ Args:
4129
+ url (str): Target URL to generate LLMs.txt from
4130
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
4131
+ show_full_text (Optional[bool]): Include full text in output (default: False)
4132
+ experimental_stream (Optional[bool]): Enable experimental streaming
4133
+
4134
+ Returns:
4135
+ GenerateLLMsTextStatusResponse containing:
4136
+ * success (bool): Whether generation completed successfully
4137
+ * status (str): Status of generation (processing/completed/failed)
4138
+ * data (Dict[str, str], optional): Generated text with fields:
4139
+ - llmstxt (str): Generated LLMs.txt content
4140
+ - llmsfulltxt (str, optional): Full version if requested
4141
+ * error (str, optional): Error message if generation failed
4142
+ * expiresAt (str): When the generated data expires
4143
+
4144
+ Raises:
4145
+ Exception: If generation fails
4146
+ """
4147
+ params = {}
4148
+ if max_urls is not None:
4149
+ params['maxUrls'] = max_urls
4150
+ if show_full_text is not None:
4151
+ params['showFullText'] = show_full_text
4152
+ if experimental_stream is not None:
4153
+ params['__experimental_stream'] = experimental_stream
4154
+
4155
+ response = await self.async_generate_llms_text(
4156
+ url,
4157
+ max_urls=max_urls,
4158
+ show_full_text=show_full_text,
4160
+ experimental_stream=experimental_stream
4161
+ )
4162
+ if not response.get('success') or 'id' not in response:
4163
+ return response
4164
+
4165
+ job_id = response['id']
4166
+ while True:
4167
+ status = await self.check_generate_llms_text_status(job_id)
4168
+
4169
+ if status['status'] == 'completed':
4170
+ return status
4171
+ elif status['status'] == 'failed':
4172
+ raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
4173
+ elif status['status'] != 'processing':
4174
+ break
4175
+
4176
+ await asyncio.sleep(2)
4177
+
4178
+ return GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly')
4179
+
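A generate_llms_text sketch under the same assumptions. Note that on completion the method, as written above, returns the parsed status payload, so the generated text is read dict-style from data['llmstxt'].

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed top-level re-export

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    status = await app.generate_llms_text("https://example.com", max_urls=5)
    if isinstance(status, dict) and status.get("status") == "completed":
        print(status["data"]["llmstxt"])

asyncio.run(main())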
4180
+ async def async_generate_llms_text(
4181
+ self,
4182
+ url: str,
4183
+ *,
4184
+ max_urls: Optional[int] = None,
4185
+ show_full_text: Optional[bool] = None,
4186
+ cache: Optional[bool] = None,
4187
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
4188
+ """
4189
+ Initiate an asynchronous LLMs.txt generation job without waiting for completion.
4190
+
4191
+ Args:
4192
+ url (str): Target URL to generate LLMs.txt from
4193
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
4194
+ show_full_text (Optional[bool]): Include full text in output (default: False)
4195
+ cache (Optional[bool]): Whether to use cached content if available (default: True)
4196
+ experimental_stream (Optional[bool]): Enable experimental streaming
4197
+
4198
+ Returns:
4199
+ GenerateLLMsTextResponse containing:
4200
+ * success (bool): Whether job started successfully
4201
+ * id (str): Unique identifier for the job
4202
+ * error (str, optional): Error message if start failed
4203
+
4204
+ Raises:
4205
+ ValueError: If job initiation fails
4206
+ """
4215
+ params = GenerateLLMsTextParams(
4216
+ maxUrls=max_urls,
4217
+ showFullText=show_full_text,
4218
+ cache=cache,
4219
+ __experimental_stream=experimental_stream
4220
+ )
4221
+
4222
+ headers = self._prepare_headers()
4223
+ json_data = {'url': url, **params.dict(exclude_none=True)}
4224
+ json_data['origin'] = f"python-sdk@{version}"
4225
+
4226
+ try:
4227
+ return await self._async_post_request(
4228
+ f'{self.api_url}/v1/llmstxt',
4229
+ json_data,
4230
+ headers
4231
+ )
4232
+ except Exception as e:
4233
+ raise ValueError(str(e))
4234
+
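+ # --- Usage sketch (illustrative comment only) ---
+ # Fire-and-forget job start; `app` is assumed to be an AsyncFirecrawlApp
+ # instance as in the sketch after generate_llms_text. The returned id feeds
+ # check_generate_llms_text_status below.
+ #
+ #     started = await app.async_generate_llms_text(
+ #         "https://example.com", max_urls=10, cache=True)
+ #     job_id = started["id"] if started.get("success") else None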
4235
+ async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
4236
+ """
4237
+ Check the status of an asynchronous LLMs.txt generation job.
4238
+
4239
+ Args:
4240
+ id (str): The ID of the generation job
4241
+
4242
+ Returns:
4243
+ GenerateLLMsTextStatusResponse containing:
4244
+ * success (bool): Whether generation completed successfully
4245
+ * status (str): Status of generation (processing/completed/failed)
4246
+ * data (Dict[str, str], optional): Generated text with fields:
4247
+ - llmstxt (str): Generated LLMs.txt content
4248
+ - llmsfulltxt (str, optional): Full version if requested
4249
+ * error (str, optional): Error message if generation failed
4250
+ * expiresAt (str): When the generated data expires
4251
+
4252
+ Raises:
4253
+ ValueError: If status check fails
4254
+ """
4255
+ headers = self._prepare_headers()
4256
+ try:
4257
+ return await self._async_get_request(
4258
+ f'{self.api_url}/v1/llmstxt/{id}',
4259
+ headers
4260
+ )
4261
+ except Exception as e:
4262
+ raise ValueError(str(e))
4263
+
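+ # --- Usage sketch (illustrative comment only) ---
+ # Manual polling against the status endpoint, equivalent to what
+ # generate_llms_text does internally; `app` and `job_id` are assumed to come
+ # from the async_generate_llms_text sketch above.
+ #
+ #     while True:
+ #         status = await app.check_generate_llms_text_status(job_id)
+ #         if status["status"] in ("completed", "failed"):
+ #             break
+ #         await asyncio.sleep(2)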
4264
+ async def deep_research(
4265
+ self,
4266
+ query: str,
4267
+ *,
4268
+ max_depth: Optional[int] = None,
4269
+ time_limit: Optional[int] = None,
4270
+ max_urls: Optional[int] = None,
4271
+ analysis_prompt: Optional[str] = None,
4272
+ system_prompt: Optional[str] = None,
4273
+ __experimental_stream_steps: Optional[bool] = None,
4274
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
4275
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
4276
+ """
4277
+ Initiates a deep research operation on a given query and polls until completion.
4278
+
4279
+ Args:
4280
+ query (str): Research query or topic to investigate
4281
+ max_depth (Optional[int]): Maximum depth of research exploration
4282
+ time_limit (Optional[int]): Time limit in seconds for research
4283
+ max_urls (Optional[int]): Maximum number of URLs to process
4284
+ analysis_prompt (Optional[str]): Custom prompt for analysis
4285
+ system_prompt (Optional[str]): Custom system prompt
4286
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4287
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
4288
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
4289
+
4290
+ Returns:
4291
+ DeepResearchStatusResponse containing:
4292
+ * success (bool): Whether research completed successfully
4293
+ * status (str): Current state (processing/completed/failed)
4294
+ * error (Optional[str]): Error message if failed
4295
+ * id (str): Unique identifier for the research job
4296
+ * data (Any): Research findings and analysis
4297
+ * sources (List[Dict]): List of discovered sources
4298
+ * activities (List[Dict]): Research progress log
4299
+ * summaries (List[str]): Generated research summaries
4300
+
4301
+ Raises:
4302
+ Exception: If research fails
4303
+ """
4304
+ research_params = {}
4305
+ if max_depth is not None:
4306
+ research_params['maxDepth'] = max_depth
4307
+ if time_limit is not None:
4308
+ research_params['timeLimit'] = time_limit
4309
+ if max_urls is not None:
4310
+ research_params['maxUrls'] = max_urls
4311
+ if analysis_prompt is not None:
4312
+ research_params['analysisPrompt'] = analysis_prompt
4313
+ if system_prompt is not None:
4314
+ research_params['systemPrompt'] = system_prompt
4315
+ if __experimental_stream_steps is not None:
4316
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
4317
+ research_params = DeepResearchParams(**research_params)
4318
+
4319
+ response = await self.async_deep_research(
4320
+ query,
4321
+ max_depth=max_depth,
4322
+ time_limit=time_limit,
4323
+ max_urls=max_urls,
4324
+ analysis_prompt=analysis_prompt,
4325
+ system_prompt=system_prompt
4326
+ )
4327
+ if not response.get('success') or 'id' not in response:
4328
+ return response
4329
+
4330
+ job_id = response['id']
4331
+ last_activity_count = 0
4332
+ last_source_count = 0
4333
+
4334
+ while True:
4335
+ status = await self.check_deep_research_status(job_id)
4336
+
4337
+ if on_activity and 'activities' in status:
4338
+ new_activities = status['activities'][last_activity_count:]
4339
+ for activity in new_activities:
4340
+ on_activity(activity)
4341
+ last_activity_count = len(status['activities'])
4342
+
4343
+ if on_source and 'sources' in status:
4344
+ new_sources = status['sources'][last_source_count:]
4345
+ for source in new_sources:
4346
+ on_source(source)
4347
+ last_source_count = len(status['sources'])
4348
+
4349
+ if status['status'] == 'completed':
4350
+ return status
4351
+ elif status['status'] == 'failed':
4352
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
4353
+ elif status['status'] != 'processing':
4354
+ break
4355
+
4356
+ await asyncio.sleep(2)
4357
+
4358
+ return DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')
4359
+
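+ # --- Usage sketch (illustrative comment only) ---
+ # Runs a deep research job with a progress callback. The query and limits are
+ # placeholders; the activity keys follow the docstring above.
+ #
+ #     def log_activity(activity):
+ #         print(activity.get("type"), activity.get("message"))
+ #
+ #     research = await app.deep_research(
+ #         "state of open-source web crawlers",
+ #         max_depth=3, time_limit=120, max_urls=20,
+ #         on_activity=log_activity)
+ #     if research.get("status") == "completed":
+ #         print(research["data"])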
4360
+ async def async_deep_research(
4361
+ self,
4362
+ query: str,
4363
+ *,
4364
+ max_depth: Optional[int] = None,
4365
+ time_limit: Optional[int] = None,
4366
+ max_urls: Optional[int] = None,
4367
+ analysis_prompt: Optional[str] = None,
4368
+ system_prompt: Optional[str] = None,
4369
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
4370
+ """
4371
+ Initiates an asynchronous deep research operation.
4372
+
4373
+ Args:
4374
+ query (str): Research query or topic to investigate
4375
+ max_depth (Optional[int]): Maximum depth of research exploration
4376
+ time_limit (Optional[int]): Time limit in seconds for research
4377
+ max_urls (Optional[int]): Maximum number of URLs to process
4378
+ analysis_prompt (Optional[str]): Custom prompt for analysis
4379
+ system_prompt (Optional[str]): Custom system prompt
4380
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4381
+
4382
+ Returns:
4383
+ Dict[str, Any]: A response containing:
4384
+ * success (bool): Whether the research initiation was successful
4385
+ * id (str): The unique identifier for the research job
4386
+ * error (str, optional): Error message if initiation failed
4387
+
4388
+ Raises:
4389
+ Exception: If the research initiation fails.
4390
+ """
4391
+ research_params = {}
4392
+ if max_depth is not None:
4393
+ research_params['maxDepth'] = max_depth
4394
+ if time_limit is not None:
4395
+ research_params['timeLimit'] = time_limit
4396
+ if max_urls is not None:
4397
+ research_params['maxUrls'] = max_urls
4398
+ if analysis_prompt is not None:
4399
+ research_params['analysisPrompt'] = analysis_prompt
4400
+ if system_prompt is not None:
4401
+ research_params['systemPrompt'] = system_prompt
4402
+ if __experimental_stream_steps is not None:
4403
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
4404
+ research_params = DeepResearchParams(**research_params)
4405
+
4406
+ headers = self._prepare_headers()
4407
+
4408
+ json_data = {'query': query, **research_params.dict(exclude_none=True)}
4409
+ json_data['origin'] = f"python-sdk@{version}"
4410
+
4411
+ try:
4412
+ return await self._async_post_request(
4413
+ f'{self.api_url}/v1/deep-research',
4414
+ json_data,
4415
+ headers
4416
+ )
4417
+ except Exception as e:
4418
+ raise ValueError(str(e))
4419
+
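+ # --- Usage sketch (illustrative comment only) ---
+ # Starts a research job without blocking; pair the returned id with
+ # check_deep_research_status below.
+ #
+ #     kickoff = await app.async_deep_research(
+ #         "state of open-source web crawlers", max_depth=2)
+ #     research_id = kickoff["id"] if kickoff.get("success") else None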
4420
+ async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
4421
+ """
4422
+ Check the status of a deep research operation.
4423
+
4424
+ Args:
4425
+ id (str): The ID of the deep research operation.
4426
+
4427
+ Returns:
4428
+ DeepResearchStatusResponse containing:
4429
+
4430
+ Status:
4431
+ * success - Whether research completed successfully
4432
+ * status - Current state (processing/completed/failed)
4433
+ * error - Error message if failed
4434
+
4435
+ Results:
4436
+ * id - Unique identifier for the research job
4437
+ * data - Research findings and analysis
4438
+ * sources - List of discovered sources
4439
+ * activities - Research progress log
4440
+ * summaries - Generated research summaries
4441
+
4442
+ Raises:
4443
+ Exception: If the status check fails.
4444
+ """
4445
+ headers = self._prepare_headers()
4446
+ try:
4447
+ return await self._async_get_request(
4448
+ f'{self.api_url}/v1/deep-research/{id}',
4449
+ headers
4450
+ )
4451
+ except Exception as e:
4452
+ raise ValueError(str(e))
4453
+
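+ # --- Usage sketch (illustrative comment only) ---
+ # Polls a research job started above and lists the sources discovered so far.
+ #
+ #     status = await app.check_deep_research_status(research_id)
+ #     for source in status.get("sources", []):
+ #         print(source.get("url"))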
4454
+ async def search(
4455
+ self,
4456
+ query: str,
4457
+ *,
4458
+ limit: Optional[int] = None,
4459
+ tbs: Optional[str] = None,
4460
+ filter: Optional[str] = None,
4461
+ lang: Optional[str] = None,
4462
+ country: Optional[str] = None,
4463
+ location: Optional[str] = None,
4464
+ timeout: Optional[int] = None,
4465
+ scrape_options: Optional[ScrapeOptions] = None,
4466
+ params: Optional[Union[Dict[str, Any], SearchParams]] = None,
4467
+ **kwargs) -> SearchResponse:
4468
+ """
4469
+ Asynchronously search for content using Firecrawl.
4470
+
4471
+ Args:
4472
+ query (str): Search query string
4473
+ limit (Optional[int]): Max results (default: 5)
4474
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
4475
+ filter (Optional[str]): Custom result filter
4476
+ lang (Optional[str]): Language code (default: "en")
4477
+ country (Optional[str]): Country code (default: "us")
4478
+ location (Optional[str]): Geo-targeting
4479
+ timeout (Optional[int]): Request timeout in milliseconds
4480
+ scrape_options (Optional[ScrapeOptions]): Result scraping configuration
4481
+ params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
4482
+ **kwargs: Additional keyword arguments for future compatibility
4483
+
4484
+ Returns:
4485
+ SearchResponse: Response containing:
4486
+ * success (bool): Whether request succeeded
4487
+ * data (List[FirecrawlDocument]): Search results
4488
+ * warning (Optional[str]): Warning message if any
4489
+ * error (Optional[str]): Error message if any
4490
+
4491
+ Raises:
4492
+ Exception: If search fails or response cannot be parsed
4493
+ """
4494
+ # Build search parameters
4495
+ search_params = {}
4496
+ if params:
4497
+ if isinstance(params, dict):
4498
+ search_params.update(params)
4499
+ else:
4500
+ search_params.update(params.dict(exclude_none=True))
4501
+
4502
+ # Add individual parameters
4503
+ if limit is not None:
4504
+ search_params['limit'] = limit
4505
+ if tbs is not None:
4506
+ search_params['tbs'] = tbs
4507
+ if filter is not None:
4508
+ search_params['filter'] = filter
4509
+ if lang is not None:
4510
+ search_params['lang'] = lang
4511
+ if country is not None:
4512
+ search_params['country'] = country
4513
+ if location is not None:
4514
+ search_params['location'] = location
4515
+ if timeout is not None:
4516
+ search_params['timeout'] = timeout
4517
+ if scrape_options is not None:
4518
+ search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
4519
+
4520
+ # Add any additional kwargs
4521
+ search_params.update(kwargs)
4522
+
4523
+ # Create final params object
4524
+ final_params = SearchParams(query=query, **search_params)
4525
+ params_dict = final_params.dict(exclude_none=True)
4526
+ params_dict['origin'] = f"python-sdk@{version}"
4527
+
4528
+ return await self._async_post_request(
4529
+ f"{self.api_url}/v1/search",
4530
+ params_dict,
4531
+ {"Authorization": f"Bearer {self.api_key}"}
4532
+ )
4533
+
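+ # --- Usage sketch (illustrative comment only) ---
+ # A minimal search call. Only query is required; the formats field on
+ # ScrapeOptions is an assumption about that model, shown because the method
+ # calls .dict() on scrape_options. Dict-style access assumes the raw JSON
+ # response is returned, as in the other helpers above.
+ #
+ #     results = await app.search(
+ #         "firecrawl python sdk", limit=3,
+ #         scrape_options=ScrapeOptions(formats=["markdown"]))
+ #     for doc in results.get("data", []):
+ #         print(doc.get("url"))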
4534
+ class AsyncCrawlWatcher(CrawlWatcher):
4535
+ """
4536
+ Async version of CrawlWatcher that connects to the crawl WebSocket and dispatches events from an asyncio event loop.
4537
+ """
4538
+ def __init__(self, id: str, app: AsyncFirecrawlApp):
4539
+ super().__init__(id, app)
4540
+
4541
+ async def connect(self) -> None:
4542
+ """
4543
+ Establishes async WebSocket connection and starts listening for messages.
4544
+ """
4545
+ async with websockets.connect(
4546
+ self.ws_url,
4547
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
4548
+ ) as websocket:
4549
+ await self._listen(websocket)
4550
+
4551
+ async def _listen(self, websocket) -> None:
4552
+ """
4553
+ Listens for incoming WebSocket messages and handles them asynchronously.
4554
+
4555
+ Args:
4556
+ websocket: The WebSocket connection object
4557
+ """
4558
+ async for message in websocket:
4559
+ msg = json.loads(message)
4560
+ await self._handle_message(msg)
4561
+
4562
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
4563
+ """
4564
+ Handles incoming WebSocket messages based on their type asynchronously.
4565
+
4566
+ Args:
4567
+ msg (Dict[str, Any]): The message to handle
4568
+ """
4569
+ if msg['type'] == 'done':
4570
+ self.status = 'completed'
4571
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
4572
+ elif msg['type'] == 'error':
4573
+ self.status = 'failed'
4574
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
4575
+ elif msg['type'] == 'catchup':
4576
+ self.status = msg['data']['status']
4577
+ self.data.extend(msg['data'].get('data', []))
4578
+ for doc in self.data:
4579
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
4580
+ elif msg['type'] == 'document':
4581
+ self.data.append(msg['data'])
4582
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
4583
+
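+ # --- Usage sketch (illustrative comment only) ---
+ # Watches a crawl over WebSocket. Assumes the crawl id comes from an async
+ # crawl kickoff elsewhere in this module, and that add_event_listener() is
+ # inherited from CrawlWatcher and passes the detail dicts built by
+ # dispatch_event to each listener.
+ #
+ #     watcher = AsyncCrawlWatcher(crawl_id, app)
+ #     watcher.add_event_listener("document", lambda detail: print(detail["data"]))
+ #     watcher.add_event_listener("done", lambda detail: print("status:", detail["status"]))
+ #     await watcher.connect()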
4584
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
4585
+ """
4586
+ Handle errors from async API responses.
4587
+ """
4588
+ try:
4589
+ error_data = await response.json()
4590
+ error_message = error_data.get('error', 'No error message provided.')
4591
+ error_details = error_data.get('details', 'No additional error details provided.')
4592
+ except Exception:
4593
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
4594
+
4595
+ # Use the app's method to get the error message
4596
+ message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
4597
+
4598
+ raise aiohttp.ClientError(message)
4599
+
4600
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
4601
+ """
4602
+ Generate a standardized error message based on HTTP status code for async operations.
4603
+
4604
+ Args:
4605
+ status_code (int): The HTTP status code from the response
4606
+ action (str): Description of the action that was being performed
4607
+ error_message (str): The error message from the API response
4608
+ error_details (str): Additional error details from the API response
4609
+
4610
+ Returns:
4611
+ str: A formatted error message
4612
+ """
4613
+ return self._get_error_message(status_code, action, error_message, error_details)