firecrawl-py 2.11.0-py3-none-any.whl → 2.13.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.

Potentially problematic release.


This version of firecrawl-py might be problematic.

@@ -0,0 +1,4561 @@
1
+ """
2
+ FirecrawlApp Module
3
+
4
+ This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
5
+ It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
6
+ and check the status of these jobs. The module uses requests for HTTP communication
7
+ and handles retries for certain HTTP status codes.
8
+
9
+ Classes:
10
+ - FirecrawlApp: Main class for interacting with the Firecrawl API.
11
+ """
12
+ import logging
13
+ import os
14
+ import time
15
+ from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
16
+ import json
17
+ from datetime import datetime
18
+ import re
19
+ import warnings
20
+ import requests
21
+ import pydantic
22
+ import websockets
23
+ import aiohttp
24
+ import asyncio
25
+ from pydantic import Field
26
+
27
+ # Suppress Pydantic warnings about attribute shadowing
28
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
29
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
30
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
31
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
32
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingOptions\" shadows an attribute in parent \"BaseModel\"")
33
+
34
+ def get_version():
35
+ try:
36
+ from pathlib import Path
37
+ package_path = os.path.dirname(__file__)
38
+ version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
39
+ version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
40
+ if version_match:
41
+ return version_match.group(1).strip()
42
+ except Exception:
43
+ print("Failed to get version from __init__.py")
44
+ return None
45
+
46
+ version = get_version()
47
+
48
+ logger : logging.Logger = logging.getLogger("firecrawl")
49
+
50
+ T = TypeVar('T')
51
+
52
+ # class FirecrawlDocumentMetadata(pydantic.BaseModel):
53
+ # """Metadata for a Firecrawl document."""
54
+ # title: Optional[str] = None
55
+ # description: Optional[str] = None
56
+ # language: Optional[str] = None
57
+ # keywords: Optional[str] = None
58
+ # robots: Optional[str] = None
59
+ # ogTitle: Optional[str] = None
60
+ # ogDescription: Optional[str] = None
61
+ # ogUrl: Optional[str] = None
62
+ # ogImage: Optional[str] = None
63
+ # ogAudio: Optional[str] = None
64
+ # ogDeterminer: Optional[str] = None
65
+ # ogLocale: Optional[str] = None
66
+ # ogLocaleAlternate: Optional[List[str]] = None
67
+ # ogSiteName: Optional[str] = None
68
+ # ogVideo: Optional[str] = None
69
+ # dctermsCreated: Optional[str] = None
70
+ # dcDateCreated: Optional[str] = None
71
+ # dcDate: Optional[str] = None
72
+ # dctermsType: Optional[str] = None
73
+ # dcType: Optional[str] = None
74
+ # dctermsAudience: Optional[str] = None
75
+ # dctermsSubject: Optional[str] = None
76
+ # dcSubject: Optional[str] = None
77
+ # dcDescription: Optional[str] = None
78
+ # dctermsKeywords: Optional[str] = None
79
+ # modifiedTime: Optional[str] = None
80
+ # publishedTime: Optional[str] = None
81
+ # articleTag: Optional[str] = None
82
+ # articleSection: Optional[str] = None
83
+ # sourceURL: Optional[str] = None
84
+ # statusCode: Optional[int] = None
85
+ # error: Optional[str] = None
86
+
87
+ class AgentOptions(pydantic.BaseModel):
88
+ """Configuration for the agent."""
89
+ model: Literal["FIRE-1"] = "FIRE-1"
90
+ prompt: Optional[str] = None
91
+
92
+ class AgentOptionsExtract(pydantic.BaseModel):
93
+ """Configuration for the agent in extract operations."""
94
+ model: Literal["FIRE-1"] = "FIRE-1"
95
+
96
+ class ActionsResult(pydantic.BaseModel):
97
+ """Result of actions performed during scraping."""
98
+ screenshots: List[str]
99
+ pdfs: List[str]
100
+
101
+ class ChangeTrackingData(pydantic.BaseModel):
102
+ """
103
+ Data for the change tracking format.
104
+ """
105
+ previousScrapeAt: Optional[str] = None
106
+ changeStatus: str # "new" | "same" | "changed" | "removed"
107
+ visibility: str # "visible" | "hidden"
108
+ diff: Optional[Dict[str, Any]] = None
109
+ json: Optional[Any] = None
110
+
111
+ class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
112
+ """Document retrieved or processed by Firecrawl."""
113
+ url: Optional[str] = None
114
+ markdown: Optional[str] = None
115
+ html: Optional[str] = None
116
+ rawHtml: Optional[str] = None
117
+ links: Optional[List[str]] = None
118
+ extract: Optional[T] = None
119
+ json: Optional[T] = None
120
+ screenshot: Optional[str] = None
121
+ metadata: Optional[Any] = None
122
+ actions: Optional[ActionsResult] = None
123
+ title: Optional[str] = None # v1 search only
124
+ description: Optional[str] = None # v1 search only
125
+ changeTracking: Optional[ChangeTrackingData] = None
126
+
127
+ class LocationConfig(pydantic.BaseModel):
128
+ """Location configuration for scraping."""
129
+ country: Optional[str] = None
130
+ languages: Optional[List[str]] = None
131
+
132
+ class WebhookConfig(pydantic.BaseModel):
133
+ """Configuration for webhooks."""
134
+ url: str
135
+ headers: Optional[Dict[str, str]] = None
136
+ metadata: Optional[Dict[str, str]] = None
137
+ events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
138
+
139
+ class ChangeTrackingOptions(pydantic.BaseModel):
140
+ """Configuration for change tracking."""
141
+ modes: Optional[List[Literal["git-diff", "json"]]] = None
142
+ schema: Optional[Any] = None
143
+ prompt: Optional[str] = None
144
+ tag: Optional[str] = None
145
+
146
+ class ScrapeOptions(pydantic.BaseModel):
147
+ """Parameters for scraping operations."""
148
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
149
+ headers: Optional[Dict[str, str]] = None
150
+ includeTags: Optional[List[str]] = None
151
+ excludeTags: Optional[List[str]] = None
152
+ onlyMainContent: Optional[bool] = None
153
+ waitFor: Optional[int] = None
154
+ timeout: Optional[int] = None
155
+ location: Optional[LocationConfig] = None
156
+ mobile: Optional[bool] = None
157
+ skipTlsVerification: Optional[bool] = None
158
+ removeBase64Images: Optional[bool] = None
159
+ blockAds: Optional[bool] = None
160
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None
161
+ changeTrackingOptions: Optional[ChangeTrackingOptions] = None
162
+ maxAge: Optional[int] = None
163
+ storeInCache: Optional[bool] = None
164
+ parsePDF: Optional[bool] = None
165
+
166
+ class WaitAction(pydantic.BaseModel):
167
+ """Wait action to perform during scraping."""
168
+ type: Literal["wait"]
169
+ milliseconds: Optional[int] = None
170
+ selector: Optional[str] = None
171
+
172
+ class ScreenshotAction(pydantic.BaseModel):
173
+ """Screenshot action to perform during scraping."""
174
+ type: Literal["screenshot"]
175
+ fullPage: Optional[bool] = None
176
+ quality: Optional[int] = None
177
+
178
+ class ClickAction(pydantic.BaseModel):
179
+ """Click action to perform during scraping."""
180
+ type: Literal["click"]
181
+ selector: str
182
+
183
+ class WriteAction(pydantic.BaseModel):
184
+ """Write action to perform during scraping."""
185
+ type: Literal["write"]
186
+ text: str
187
+
188
+ class PressAction(pydantic.BaseModel):
189
+ """Press action to perform during scraping."""
190
+ type: Literal["press"]
191
+ key: str
192
+
193
+ class ScrollAction(pydantic.BaseModel):
194
+ """Scroll action to perform during scraping."""
195
+ type: Literal["scroll"]
196
+ direction: Literal["up", "down"]
197
+ selector: Optional[str] = None
198
+
199
+ class ScrapeAction(pydantic.BaseModel):
200
+ """Scrape action to perform during scraping."""
201
+ type: Literal["scrape"]
202
+
203
+ class ExecuteJavascriptAction(pydantic.BaseModel):
204
+ """Execute javascript action to perform during scraping."""
205
+ type: Literal["executeJavascript"]
206
+ script: str
207
+
208
+ class PDFAction(pydantic.BaseModel):
209
+ """PDF action to perform during scraping."""
210
+ type: Literal["pdf"]
211
+ format: Optional[Literal["A0", "A1", "A2", "A3", "A4", "A5", "A6", "Letter", "Legal", "Tabloid", "Ledger"]] = None
212
+ landscape: Optional[bool] = None
213
+ scale: Optional[float] = None
214
+
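As an illustration only (not part of the packaged file), the action models above compose into the `actions` list accepted by `scrape_url` and `batch_scrape_urls` later in this module; the selectors and timings are placeholders, and the import assumes this module path (firecrawl.firecrawl):

    from firecrawl.firecrawl import WaitAction, ClickAction, ScreenshotAction

    # Hypothetical sequence: let the page settle, click a "load more"
    # control, then capture a full-page screenshot.
    actions = [
        WaitAction(type="wait", milliseconds=2000),
        ClickAction(type="click", selector="#load-more"),      # placeholder selector
        ScreenshotAction(type="screenshot", fullPage=True),
    ]
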
215
+ class ExtractAgent(pydantic.BaseModel):
216
+ """Configuration for the agent in extract operations."""
217
+ model: Literal["FIRE-1"] = "FIRE-1"
218
+
219
+ class JsonConfig(pydantic.BaseModel):
220
+ """Configuration for extraction."""
221
+ prompt: Optional[str] = None
222
+ schema: Optional[Any] = None
223
+ systemPrompt: Optional[str] = None
224
+ agent: Optional[ExtractAgent] = None
225
+
226
+ class ScrapeParams(ScrapeOptions):
227
+ """Parameters for scraping operations."""
228
+ extract: Optional[JsonConfig] = None
229
+ jsonOptions: Optional[JsonConfig] = None
230
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None
231
+ agent: Optional[AgentOptions] = None
232
+ webhook: Optional[WebhookConfig] = None
233
+
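A minimal sketch (not from the package) of how JsonConfig pairs a prompt with a schema for the "json"/"extract" formats; the Article model is hypothetical, and the schema is passed here as a plain JSON-schema dict, one of the forms the client-side helper _ensure_schema_dict (used further down) accepts:

    from typing import Optional
    import pydantic
    from firecrawl.firecrawl import JsonConfig

    class Article(pydantic.BaseModel):          # hypothetical target shape
        title: str
        author: Optional[str] = None

    json_options = JsonConfig(
        prompt="Extract the article title and author.",
        schema=Article.model_json_schema(),     # plain dict JSON schema (pydantic v2)
    )
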
234
+ class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
235
+ """Response from scraping operations."""
236
+ success: bool = True
237
+ warning: Optional[str] = None
238
+ error: Optional[str] = None
239
+
240
+ class BatchScrapeResponse(pydantic.BaseModel):
241
+ """Response from batch scrape operations."""
242
+ id: Optional[str] = None
243
+ url: Optional[str] = None
244
+ success: bool = True
245
+ error: Optional[str] = None
246
+ invalidURLs: Optional[List[str]] = None
247
+
248
+ class BatchScrapeStatusResponse(pydantic.BaseModel):
249
+ """Response from batch scrape status checks."""
250
+ success: bool = True
251
+ status: Literal["scraping", "completed", "failed", "cancelled"]
252
+ completed: int
253
+ total: int
254
+ creditsUsed: int
255
+ expiresAt: datetime
256
+ next: Optional[str] = None
257
+ data: List[FirecrawlDocument]
258
+
259
+ class CrawlParams(pydantic.BaseModel):
260
+ """Parameters for crawling operations."""
261
+ includePaths: Optional[List[str]] = None
262
+ excludePaths: Optional[List[str]] = None
263
+ maxDepth: Optional[int] = None
264
+ maxDiscoveryDepth: Optional[int] = None
265
+ limit: Optional[int] = None
266
+ allowBackwardLinks: Optional[bool] = None
267
+ allowExternalLinks: Optional[bool] = None
268
+ ignoreSitemap: Optional[bool] = None
269
+ scrapeOptions: Optional[ScrapeOptions] = None
270
+ webhook: Optional[Union[str, WebhookConfig]] = None
271
+ deduplicateSimilarURLs: Optional[bool] = None
272
+ ignoreQueryParameters: Optional[bool] = None
273
+ regexOnFullURL: Optional[bool] = None
274
+ delay: Optional[int] = None # Delay in seconds between scrapes
275
+ maxConcurrency: Optional[int] = None
276
+ allowSubdomains: Optional[bool] = None
277
+
278
+ class CrawlResponse(pydantic.BaseModel):
279
+ """Response from crawling operations."""
280
+ id: Optional[str] = None
281
+ url: Optional[str] = None
282
+ success: bool = True
283
+ error: Optional[str] = None
284
+
285
+ class CrawlStatusResponse(pydantic.BaseModel):
286
+ """Response from crawl status checks."""
287
+ success: bool = True
288
+ status: Literal["scraping", "completed", "failed", "cancelled"]
289
+ completed: int
290
+ total: int
291
+ creditsUsed: int
292
+ expiresAt: datetime
293
+ next: Optional[str] = None
294
+ data: List[FirecrawlDocument]
295
+
296
+ class CrawlErrorsResponse(pydantic.BaseModel):
297
+ """Response from crawl/batch scrape error monitoring."""
298
+ errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str}
299
+ robotsBlocked: List[str]
300
+
301
+ class MapParams(pydantic.BaseModel):
302
+ """Parameters for mapping operations."""
303
+ search: Optional[str] = None
304
+ ignoreSitemap: Optional[bool] = None
305
+ includeSubdomains: Optional[bool] = None
306
+ sitemapOnly: Optional[bool] = None
307
+ limit: Optional[int] = None
308
+ timeout: Optional[int] = None
309
+ useIndex: Optional[bool] = None
310
+
311
+ class MapResponse(pydantic.BaseModel):
312
+ """Response from mapping operations."""
313
+ success: bool = True
314
+ links: Optional[List[str]] = None
315
+ error: Optional[str] = None
316
+
317
+ class ExtractParams(pydantic.BaseModel):
318
+ """Parameters for extracting information from URLs."""
319
+ prompt: Optional[str] = None
320
+ schema: Optional[Any] = None
321
+ systemPrompt: Optional[str] = None
322
+ allowExternalLinks: Optional[bool] = None
323
+ enableWebSearch: Optional[bool] = None
324
+ includeSubdomains: Optional[bool] = None
325
+ origin: Optional[str] = None
326
+ showSources: Optional[bool] = None
327
+ scrapeOptions: Optional[ScrapeOptions] = None
328
+
329
+ class ExtractResponse(pydantic.BaseModel, Generic[T]):
330
+ """Response from extract operations."""
331
+ id: Optional[str] = None
332
+ status: Optional[Literal["processing", "completed", "failed"]] = None
333
+ expiresAt: Optional[datetime] = None
334
+ success: bool = True
335
+ data: Optional[T] = None
336
+ error: Optional[str] = None
337
+ warning: Optional[str] = None
338
+ sources: Optional[Dict[Any, Any]] = None
339
+
340
+ class SearchParams(pydantic.BaseModel):
341
+ query: str
342
+ limit: Optional[int] = 5
343
+ tbs: Optional[str] = None
344
+ filter: Optional[str] = None
345
+ lang: Optional[str] = "en"
346
+ country: Optional[str] = "us"
347
+ location: Optional[str] = None
348
+ origin: Optional[str] = "api"
349
+ timeout: Optional[int] = 60000
350
+ scrapeOptions: Optional[ScrapeOptions] = None
351
+
352
+ class SearchResponse(pydantic.BaseModel):
353
+ """Response from search operations."""
354
+ success: bool = True
355
+ data: List[FirecrawlDocument]
356
+ warning: Optional[str] = None
357
+ error: Optional[str] = None
358
+
359
+ class GenerateLLMsTextParams(pydantic.BaseModel):
360
+ """
361
+ Parameters for the LLMs.txt generation operation.
362
+ """
363
+ maxUrls: Optional[int] = 10
364
+ showFullText: Optional[bool] = False
365
+ cache: Optional[bool] = True
366
+ __experimental_stream: Optional[bool] = None
367
+
368
+ class DeepResearchParams(pydantic.BaseModel):
369
+ """
370
+ Parameters for the deep research operation.
371
+ """
372
+ maxDepth: Optional[int] = 7
373
+ timeLimit: Optional[int] = 270
374
+ maxUrls: Optional[int] = 20
375
+ analysisPrompt: Optional[str] = None
376
+ systemPrompt: Optional[str] = None
377
+ __experimental_streamSteps: Optional[bool] = None
378
+
379
+ class DeepResearchResponse(pydantic.BaseModel):
380
+ """
381
+ Response from the deep research operation.
382
+ """
383
+ success: bool
384
+ id: str
385
+ error: Optional[str] = None
386
+
387
+ class DeepResearchStatusResponse(pydantic.BaseModel):
388
+ """
389
+ Status response from the deep research operation.
390
+ """
391
+ success: bool
392
+ data: Optional[Dict[str, Any]] = None
393
+ status: str
394
+ error: Optional[str] = None
395
+ expiresAt: str
396
+ currentDepth: int
397
+ maxDepth: int
398
+ activities: List[Dict[str, Any]]
399
+ sources: List[Dict[str, Any]]
400
+ summaries: List[str]
401
+
402
+ class GenerateLLMsTextResponse(pydantic.BaseModel):
403
+ """Response from LLMs.txt generation operations."""
404
+ success: bool = True
405
+ id: str
406
+ error: Optional[str] = None
407
+
408
+ class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
409
+ llmstxt: str
410
+ llmsfulltxt: Optional[str] = None
411
+
412
+ class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
413
+ """Status response from LLMs.txt generation operations."""
414
+ success: bool = True
415
+ data: Optional[GenerateLLMsTextStatusResponseData] = None
416
+ status: Literal["processing", "completed", "failed"]
417
+ error: Optional[str] = None
418
+ expiresAt: str
419
+
420
+ class SearchResponse(pydantic.BaseModel):
421
+ """
422
+ Response from the search operation.
423
+ """
424
+ success: bool
425
+ data: List[Dict[str, Any]]
426
+ warning: Optional[str] = None
427
+ error: Optional[str] = None
428
+
429
+ class ExtractParams(pydantic.BaseModel):
430
+ """
431
+ Parameters for the extract operation.
432
+ """
433
+ prompt: Optional[str] = None
434
+ schema: Optional[Any] = pydantic.Field(None, alias='schema')
435
+ system_prompt: Optional[str] = None
436
+ allow_external_links: Optional[bool] = False
437
+ enable_web_search: Optional[bool] = False
438
+ # Just for backwards compatibility
439
+ enableWebSearch: Optional[bool] = False
440
+ show_sources: Optional[bool] = False
441
+ agent: Optional[Dict[str, Any]] = None
442
+
443
+ class FirecrawlApp:
444
+ def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
445
+ """
446
+ Initialize the FirecrawlApp instance with API key, API URL.
447
+
448
+ Args:
449
+ api_key (Optional[str]): API key for authenticating with the Firecrawl API.
450
+ api_url (Optional[str]): Base URL for the Firecrawl API.
451
+ """
452
+ self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
453
+ self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
454
+
455
+ # Only require API key when using cloud service
456
+ if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
457
+ logger.warning("No API key provided for cloud service")
458
+ raise ValueError('No API key provided')
459
+
460
+ logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
461
+
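A minimal usage sketch (the key is a placeholder; FIRECRAWL_API_KEY from the environment is used when api_key is omitted):

    from firecrawl import FirecrawlApp

    # Cloud service: an API key is required.
    app = FirecrawlApp(api_key="fc-YOUR-KEY")

    # Self-hosted instance: no key is needed when api_url is not api.firecrawl.dev.
    local_app = FirecrawlApp(api_url="http://localhost:3002")
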
462
+ def scrape_url(
463
+ self,
464
+ url: str,
465
+ *,
466
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
467
+ include_tags: Optional[List[str]] = None,
468
+ exclude_tags: Optional[List[str]] = None,
469
+ only_main_content: Optional[bool] = None,
470
+ wait_for: Optional[int] = None,
471
+ timeout: Optional[int] = None,
472
+ location: Optional[LocationConfig] = None,
473
+ mobile: Optional[bool] = None,
474
+ skip_tls_verification: Optional[bool] = None,
475
+ remove_base64_images: Optional[bool] = None,
476
+ block_ads: Optional[bool] = None,
477
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
478
+ parse_pdf: Optional[bool] = None,
479
+ extract: Optional[JsonConfig] = None,
480
+ json_options: Optional[JsonConfig] = None,
481
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
482
+ change_tracking_options: Optional[ChangeTrackingOptions] = None,
483
+ max_age: Optional[int] = None,
484
+ store_in_cache: Optional[bool] = None,
485
+ **kwargs) -> ScrapeResponse[Any]:
486
+ """
487
+ Scrape and extract content from a URL.
488
+
489
+ Args:
490
+ url (str): Target URL to scrape
491
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
492
+ include_tags (Optional[List[str]]): HTML tags to include
493
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
494
+ only_main_content (Optional[bool]): Extract main content only
495
+ wait_for (Optional[int]): Wait for a specific element to appear
496
+ timeout (Optional[int]): Request timeout (ms)
497
+ location (Optional[LocationConfig]): Location configuration
498
+ mobile (Optional[bool]): Use mobile user agent
499
+ skip_tls_verification (Optional[bool]): Skip TLS verification
500
+ remove_base64_images (Optional[bool]): Remove base64 images
501
+ block_ads (Optional[bool]): Block ads
502
+ proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
503
+ extract (Optional[JsonConfig]): Content extraction settings
504
+ json_options (Optional[JsonConfig]): JSON extraction settings
505
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
506
+ change_tracking_options (Optional[ChangeTrackingOptions]): Change tracking settings
507
+
508
+
509
+ Returns:
510
+ ScrapeResponse with:
511
+ * Requested content formats
512
+ * Page metadata
513
+ * Extraction results
514
+ * Success/error status
515
+
516
+ Raises:
517
+ Exception: If scraping fails
518
+ """
519
+ headers = self._prepare_headers()
520
+
521
+ # Build scrape parameters
522
+ scrape_params = {
523
+ 'url': url,
524
+ 'origin': f"python-sdk@{version}"
525
+ }
526
+
527
+ # Add optional parameters if provided
528
+ if formats:
529
+ scrape_params['formats'] = formats
530
+ if include_tags:
531
+ scrape_params['includeTags'] = include_tags
532
+ if exclude_tags:
533
+ scrape_params['excludeTags'] = exclude_tags
534
+ if only_main_content is not None:
535
+ scrape_params['onlyMainContent'] = only_main_content
536
+ if wait_for:
537
+ scrape_params['waitFor'] = wait_for
538
+ if timeout:
539
+ scrape_params['timeout'] = timeout
540
+ if location:
541
+ scrape_params['location'] = location.dict(exclude_none=True)
542
+ if mobile is not None:
543
+ scrape_params['mobile'] = mobile
544
+ if skip_tls_verification is not None:
545
+ scrape_params['skipTlsVerification'] = skip_tls_verification
546
+ if remove_base64_images is not None:
547
+ scrape_params['removeBase64Images'] = remove_base64_images
548
+ if block_ads is not None:
549
+ scrape_params['blockAds'] = block_ads
550
+ if proxy:
551
+ scrape_params['proxy'] = proxy
552
+ if parse_pdf is not None:
553
+ scrape_params['parsePDF'] = parse_pdf
554
+ if extract is not None:
555
+ extract = self._ensure_schema_dict(extract)
556
+ if isinstance(extract, dict) and "schema" in extract:
557
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
558
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
559
+ if json_options is not None:
560
+ json_options = self._ensure_schema_dict(json_options)
561
+ if isinstance(json_options, dict) and "schema" in json_options:
562
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
563
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
564
+ if actions:
565
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
566
+ if change_tracking_options:
567
+ scrape_params['changeTrackingOptions'] = change_tracking_options if isinstance(change_tracking_options, dict) else change_tracking_options.dict(exclude_none=True)
568
+ if max_age is not None:
569
+ scrape_params['maxAge'] = max_age
570
+ if store_in_cache is not None:
571
+ scrape_params['storeInCache'] = store_in_cache
572
+
573
+ scrape_params.update(kwargs)
574
+
575
+ if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
576
+ scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
577
+ if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
578
+ scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
579
+
580
+ # Make request
581
+ response = requests.post(
582
+ f'{self.api_url}/v1/scrape',
583
+ headers=headers,
584
+ json=scrape_params,
585
+ timeout=(timeout + 5000 if timeout else None)
586
+ )
587
+
588
+ if response.status_code == 200:
589
+ try:
590
+ response_json = response.json()
591
+ if response_json.get('success') and 'data' in response_json:
592
+ return ScrapeResponse(**response_json['data'])
593
+ elif "error" in response_json:
594
+ raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
595
+ else:
596
+ raise Exception(f'Failed to scrape URL. Error: {response_json}')
597
+ except ValueError:
598
+ raise Exception('Failed to parse Firecrawl response as JSON.')
599
+ else:
600
+ self._handle_error(response, 'scrape URL')
601
+
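A usage sketch for scrape_url (URL and options illustrative; `app` is the FirecrawlApp instance from the earlier sketch):

    result = app.scrape_url(
        "https://example.com",
        formats=["markdown", "links"],
        only_main_content=True,
        timeout=30000,                  # milliseconds
    )
    print(result.markdown)              # requested formats come back as fields
    print(result.metadata)
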
602
+ def search(
603
+ self,
604
+ query: str,
605
+ *,
606
+ limit: Optional[int] = None,
607
+ tbs: Optional[str] = None,
608
+ filter: Optional[str] = None,
609
+ lang: Optional[str] = None,
610
+ country: Optional[str] = None,
611
+ location: Optional[str] = None,
612
+ timeout: Optional[int] = None,
613
+ scrape_options: Optional[ScrapeOptions] = None,
614
+ **kwargs) -> SearchResponse:
615
+ """
616
+ Search for content using Firecrawl.
617
+
618
+ Args:
619
+ query (str): Search query string
620
+ limit (Optional[int]): Max results (default: 5)
621
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
622
+ filter (Optional[str]): Custom result filter
623
+ lang (Optional[str]): Language code (default: "en")
624
+ country (Optional[str]): Country code (default: "us")
625
+ location (Optional[str]): Geo-targeting
626
+ timeout (Optional[int]): Request timeout in milliseconds
627
+ scrape_options (Optional[ScrapeOptions]): Result scraping configuration
628
+ **kwargs: Additional keyword arguments for future compatibility
629
+
630
+ Returns:
631
+ SearchResponse: Response containing:
632
+ * success (bool): Whether request succeeded
633
+ * data (List[FirecrawlDocument]): Search results
634
+ * warning (Optional[str]): Warning message if any
635
+ * error (Optional[str]): Error message if any
636
+
637
+ Raises:
638
+ Exception: If search fails or response cannot be parsed
639
+ """
640
+ # Validate any additional kwargs
641
+ self._validate_kwargs(kwargs, "search")
642
+
643
+ # Build search parameters
644
+ search_params = {}
645
+
646
+ # Add individual parameters
647
+ if limit is not None:
648
+ search_params['limit'] = limit
649
+ if tbs is not None:
650
+ search_params['tbs'] = tbs
651
+ if filter is not None:
652
+ search_params['filter'] = filter
653
+ if lang is not None:
654
+ search_params['lang'] = lang
655
+ if country is not None:
656
+ search_params['country'] = country
657
+ if location is not None:
658
+ search_params['location'] = location
659
+ if timeout is not None:
660
+ search_params['timeout'] = timeout
661
+ if scrape_options is not None:
662
+ search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
663
+
664
+ # Add any additional kwargs
665
+ search_params.update(kwargs)
666
+
667
+ # Create final params object
668
+ final_params = SearchParams(query=query, **search_params)
669
+ params_dict = final_params.dict(exclude_none=True)
670
+ params_dict['origin'] = f"python-sdk@{version}"
671
+
672
+ # Make request
673
+ response = requests.post(
674
+ f"{self.api_url}/v1/search",
675
+ headers={"Authorization": f"Bearer {self.api_key}"},
676
+ json=params_dict
677
+ )
678
+
679
+ if response.status_code == 200:
680
+ try:
681
+ response_json = response.json()
682
+ if response_json.get('success') and 'data' in response_json:
683
+ return SearchResponse(**response_json)
684
+ elif "error" in response_json:
685
+ raise Exception(f'Search failed. Error: {response_json["error"]}')
686
+ else:
687
+ raise Exception(f'Search failed. Error: {response_json}')
688
+ except ValueError:
689
+ raise Exception('Failed to parse Firecrawl response as JSON.')
690
+ else:
691
+ self._handle_error(response, 'search')
692
+
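A usage sketch for search (query illustrative); passing scrape_options asks Firecrawl to also scrape each hit, and the import again assumes this module path:

    from firecrawl.firecrawl import ScrapeOptions

    results = app.search(
        "firecrawl python sdk",
        limit=3,
        scrape_options=ScrapeOptions(formats=["markdown"]),
    )
    for item in results.data:           # one entry per search result
        print(item)
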
693
+ def crawl_url(
694
+ self,
695
+ url: str,
696
+ *,
697
+ include_paths: Optional[List[str]] = None,
698
+ exclude_paths: Optional[List[str]] = None,
699
+ max_depth: Optional[int] = None,
700
+ max_discovery_depth: Optional[int] = None,
701
+ limit: Optional[int] = None,
702
+ allow_backward_links: Optional[bool] = None,
703
+ crawl_entire_domain: Optional[bool] = None,
704
+ allow_external_links: Optional[bool] = None,
705
+ ignore_sitemap: Optional[bool] = None,
706
+ scrape_options: Optional[ScrapeOptions] = None,
707
+ webhook: Optional[Union[str, WebhookConfig]] = None,
708
+ deduplicate_similar_urls: Optional[bool] = None,
709
+ ignore_query_parameters: Optional[bool] = None,
710
+ regex_on_full_url: Optional[bool] = None,
711
+ delay: Optional[int] = None,
712
+ allow_subdomains: Optional[bool] = None,
713
+ max_concurrency: Optional[int] = None,
714
+ poll_interval: Optional[int] = 2,
715
+ idempotency_key: Optional[str] = None,
716
+ **kwargs
717
+ ) -> CrawlStatusResponse:
718
+ """
719
+ Crawl a website starting from a URL.
720
+
721
+ Args:
722
+ url (str): Target URL to start crawling from
723
+ include_paths (Optional[List[str]]): Patterns of URLs to include
724
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
725
+ max_depth (Optional[int]): Maximum crawl depth
726
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
727
+ limit (Optional[int]): Maximum pages to crawl
728
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
729
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
730
+ allow_external_links (Optional[bool]): Follow external domain links
731
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
732
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
733
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
734
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
735
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
736
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
737
+ delay (Optional[int]): Delay in seconds between scrapes
738
+ allow_subdomains (Optional[bool]): Follow subdomains
739
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
740
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
741
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
742
+ **kwargs: Additional parameters to pass to the API
743
+
744
+ Returns:
745
+ CrawlStatusResponse with:
746
+ * Crawling status and progress
747
+ * Crawled page contents
748
+ * Success/error information
749
+
750
+ Raises:
751
+ Exception: If crawl fails
752
+ """
753
+ # Validate any additional kwargs
754
+ self._validate_kwargs(kwargs, "crawl_url")
755
+
756
+ crawl_params = {}
757
+
758
+ # Add individual parameters
759
+ if include_paths is not None:
760
+ crawl_params['includePaths'] = include_paths
761
+ if exclude_paths is not None:
762
+ crawl_params['excludePaths'] = exclude_paths
763
+ if max_depth is not None:
764
+ crawl_params['maxDepth'] = max_depth
765
+ if max_discovery_depth is not None:
766
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
767
+ if limit is not None:
768
+ crawl_params['limit'] = limit
769
+ if crawl_entire_domain is not None:
770
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
771
+ elif allow_backward_links is not None:
772
+ crawl_params['allowBackwardLinks'] = allow_backward_links
773
+ if allow_external_links is not None:
774
+ crawl_params['allowExternalLinks'] = allow_external_links
775
+ if ignore_sitemap is not None:
776
+ crawl_params['ignoreSitemap'] = ignore_sitemap
777
+ if scrape_options is not None:
778
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
779
+ if webhook is not None:
780
+ crawl_params['webhook'] = webhook
781
+ if deduplicate_similar_urls is not None:
782
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
783
+ if ignore_query_parameters is not None:
784
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
785
+ if regex_on_full_url is not None:
786
+ crawl_params['regexOnFullURL'] = regex_on_full_url
787
+ if delay is not None:
788
+ crawl_params['delay'] = delay
789
+ if allow_subdomains is not None:
790
+ crawl_params['allowSubdomains'] = allow_subdomains
791
+ if max_concurrency is not None:
792
+ crawl_params['maxConcurrency'] = max_concurrency
793
+
794
+ # Add any additional kwargs
795
+ crawl_params.update(kwargs)
796
+
797
+ # Create final params object
798
+ final_params = CrawlParams(**crawl_params)
799
+ params_dict = final_params.dict(exclude_none=True)
800
+ params_dict['url'] = url
801
+ params_dict['origin'] = f"python-sdk@{version}"
802
+
803
+ # Make request
804
+ headers = self._prepare_headers(idempotency_key)
805
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
806
+
807
+ if response.status_code == 200:
808
+ try:
809
+ id = response.json().get('id')
810
+ except:
811
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
812
+ return self._monitor_job_status(id, headers, poll_interval)
813
+ else:
814
+ self._handle_error(response, 'start crawl job')
815
+
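A usage sketch for the blocking crawl above (site and limits are placeholders); the call submits the job and then polls every poll_interval seconds until it finishes:

    from firecrawl.firecrawl import ScrapeOptions

    crawl_result = app.crawl_url(
        "https://docs.example.com",
        limit=10,
        max_depth=2,
        scrape_options=ScrapeOptions(formats=["markdown"]),
        poll_interval=5,
    )
    print(crawl_result.status, f"{crawl_result.completed}/{crawl_result.total}")
    for doc in crawl_result.data:
        print(doc.metadata)
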
816
+ def async_crawl_url(
817
+ self,
818
+ url: str,
819
+ *,
820
+ include_paths: Optional[List[str]] = None,
821
+ exclude_paths: Optional[List[str]] = None,
822
+ max_depth: Optional[int] = None,
823
+ max_discovery_depth: Optional[int] = None,
824
+ limit: Optional[int] = None,
825
+ allow_backward_links: Optional[bool] = None,
826
+ crawl_entire_domain: Optional[bool] = None,
827
+ allow_external_links: Optional[bool] = None,
828
+ ignore_sitemap: Optional[bool] = None,
829
+ scrape_options: Optional[ScrapeOptions] = None,
830
+ webhook: Optional[Union[str, WebhookConfig]] = None,
831
+ deduplicate_similar_urls: Optional[bool] = None,
832
+ ignore_query_parameters: Optional[bool] = None,
833
+ regex_on_full_url: Optional[bool] = None,
834
+ delay: Optional[int] = None,
835
+ allow_subdomains: Optional[bool] = None,
836
+ max_concurrency: Optional[int] = None,
837
+ idempotency_key: Optional[str] = None,
838
+ **kwargs
839
+ ) -> CrawlResponse:
840
+ """
841
+ Start an asynchronous crawl job.
842
+
843
+ Args:
844
+ url (str): Target URL to start crawling from
845
+ include_paths (Optional[List[str]]): Patterns of URLs to include
846
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
847
+ max_depth (Optional[int]): Maximum crawl depth
848
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
849
+ limit (Optional[int]): Maximum pages to crawl
850
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
851
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
852
+ allow_external_links (Optional[bool]): Follow external domain links
853
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
854
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
855
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
856
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
857
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
858
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
859
+ delay (Optional[int]): Delay in seconds between scrapes
860
+ allow_subdomains (Optional[bool]): Follow subdomains
861
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
862
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
863
+ **kwargs: Additional parameters to pass to the API
864
+
865
+ Returns:
866
+ CrawlResponse with:
867
+ * success - Whether crawl started successfully
868
+ * id - Unique identifier for the crawl job
869
+ * url - Status check URL for the crawl
870
+ * error - Error message if start failed
871
+
872
+ Raises:
873
+ Exception: If crawl initiation fails
874
+ """
875
+ # Validate any additional kwargs
876
+ self._validate_kwargs(kwargs, "async_crawl_url")
877
+
878
+ crawl_params = {}
879
+
880
+ # Add individual parameters
881
+ if include_paths is not None:
882
+ crawl_params['includePaths'] = include_paths
883
+ if exclude_paths is not None:
884
+ crawl_params['excludePaths'] = exclude_paths
885
+ if max_depth is not None:
886
+ crawl_params['maxDepth'] = max_depth
887
+ if max_discovery_depth is not None:
888
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
889
+ if limit is not None:
890
+ crawl_params['limit'] = limit
891
+ if crawl_entire_domain is not None:
892
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
893
+ elif allow_backward_links is not None:
894
+ crawl_params['allowBackwardLinks'] = allow_backward_links
895
+ if allow_external_links is not None:
896
+ crawl_params['allowExternalLinks'] = allow_external_links
897
+ if ignore_sitemap is not None:
898
+ crawl_params['ignoreSitemap'] = ignore_sitemap
899
+ if scrape_options is not None:
900
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
901
+ if webhook is not None:
902
+ crawl_params['webhook'] = webhook
903
+ if deduplicate_similar_urls is not None:
904
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
905
+ if ignore_query_parameters is not None:
906
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
907
+ if regex_on_full_url is not None:
908
+ crawl_params['regexOnFullURL'] = regex_on_full_url
909
+ if delay is not None:
910
+ crawl_params['delay'] = delay
911
+ if allow_subdomains is not None:
912
+ crawl_params['allowSubdomains'] = allow_subdomains
913
+ if max_concurrency is not None:
914
+ crawl_params['maxConcurrency'] = max_concurrency
915
+
916
+ # Add any additional kwargs
917
+ crawl_params.update(kwargs)
918
+
919
+ # Create final params object
920
+ final_params = CrawlParams(**crawl_params)
921
+ params_dict = final_params.dict(exclude_none=True)
922
+ params_dict['url'] = url
923
+ params_dict['origin'] = f"python-sdk@{version}"
924
+
925
+ # Make request
926
+ headers = self._prepare_headers(idempotency_key)
927
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
928
+
929
+ if response.status_code == 200:
930
+ try:
931
+ return CrawlResponse(**response.json())
932
+ except:
933
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
934
+ else:
935
+ self._handle_error(response, 'start crawl job')
936
+
937
+ def check_crawl_status(self, id: str) -> CrawlStatusResponse:
938
+ """
939
+ Check the status and results of a crawl job.
940
+
941
+ Args:
942
+ id: Unique identifier for the crawl job
943
+
944
+ Returns:
945
+ CrawlStatusResponse containing:
946
+
947
+ Status Information:
948
+ * status - Current state (scraping/completed/failed/cancelled)
949
+ * completed - Number of pages crawled
950
+ * total - Total pages to crawl
951
+ * creditsUsed - API credits consumed
952
+ * expiresAt - Data expiration timestamp
953
+
954
+ Results:
955
+ * data - List of crawled documents
956
+ * next - URL for next page of results (if paginated)
957
+ * success - Whether status check succeeded
958
+ * error - Error message if failed
959
+
960
+ Raises:
961
+ Exception: If status check fails
962
+ """
963
+ endpoint = f'/v1/crawl/{id}'
964
+
965
+ headers = self._prepare_headers()
966
+ response = self._get_request(f'{self.api_url}{endpoint}', headers)
967
+ if response.status_code == 200:
968
+ try:
969
+ status_data = response.json()
970
+ except:
971
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
972
+ if status_data['status'] == 'completed':
973
+ if 'data' in status_data:
974
+ data = status_data['data']
975
+ while 'next' in status_data:
976
+ if len(status_data['data']) == 0:
977
+ break
978
+ next_url = status_data.get('next')
979
+ if not next_url:
980
+ logger.warning("Expected 'next' URL is missing.")
981
+ break
982
+ try:
983
+ status_response = self._get_request(next_url, headers)
984
+ if status_response.status_code != 200:
985
+ logger.error(f"Failed to fetch next page: {status_response.status_code}")
986
+ break
987
+ try:
988
+ next_data = status_response.json()
989
+ except:
990
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
991
+ data.extend(next_data.get('data', []))
992
+ status_data = next_data
993
+ except Exception as e:
994
+ logger.error(f"Error during pagination request: {e}")
995
+ break
996
+ status_data['data'] = data
997
+
998
+ response = {
999
+ 'status': status_data.get('status'),
1000
+ 'total': status_data.get('total'),
1001
+ 'completed': status_data.get('completed'),
1002
+ 'creditsUsed': status_data.get('creditsUsed'),
1003
+ 'expiresAt': status_data.get('expiresAt'),
1004
+ 'data': status_data.get('data')
1005
+ }
1006
+
1007
+ if 'error' in status_data:
1008
+ response['error'] = status_data['error']
1009
+
1010
+ if 'next' in status_data:
1011
+ response['next'] = status_data['next']
1012
+
1013
+ return CrawlStatusResponse(
1014
+ success=False if 'error' in status_data else True,
1015
+ **response
1016
+ )
1017
+ else:
1018
+ self._handle_error(response, 'check crawl status')
1019
+
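For long-running jobs, a non-blocking sketch that pairs async_crawl_url with check_crawl_status (timing and limits illustrative):

    import time

    job = app.async_crawl_url("https://docs.example.com", limit=100)
    while True:
        status = app.check_crawl_status(job.id)
        if status.status in ("completed", "failed", "cancelled"):
            break
        time.sleep(10)                  # poll every 10 seconds
    print(f"{status.completed}/{status.total} pages, status={status.status}")
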
1020
+ def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
1021
+ """
1022
+ Returns information about crawl errors.
1023
+
1024
+ Args:
1025
+ id (str): The ID of the crawl job
1026
+
1027
+ Returns:
1028
+ CrawlErrorsResponse containing:
1029
+ * errors (List[Dict[str, str]]): List of errors with fields:
1030
+ - id (str): Error ID
1031
+ - timestamp (str): When the error occurred
1032
+ - url (str): URL that caused the error
1033
+ - error (str): Error message
1034
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1035
+
1036
+ Raises:
1037
+ Exception: If error check fails
1038
+ """
1039
+ headers = self._prepare_headers()
1040
+ response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
1041
+ if response.status_code == 200:
1042
+ try:
1043
+ return CrawlErrorsResponse(**response.json())
1044
+ except:
1045
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1046
+ else:
1047
+ self._handle_error(response, "check crawl errors")
1048
+
1049
+ def cancel_crawl(self, id: str) -> Dict[str, Any]:
1050
+ """
1051
+ Cancel an asynchronous crawl job.
1052
+
1053
+ Args:
1054
+ id (str): The ID of the crawl job to cancel
1055
+
1056
+ Returns:
1057
+ Dict[str, Any] containing:
1058
+ * success (bool): Whether cancellation was successful
1059
+ * error (str, optional): Error message if cancellation failed
1060
+
1061
+ Raises:
1062
+ Exception: If cancellation fails
1063
+ """
1064
+ headers = self._prepare_headers()
1065
+ response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
1066
+ if response.status_code == 200:
1067
+ try:
1068
+ return response.json()
1069
+ except:
1070
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1071
+ else:
1072
+ self._handle_error(response, "cancel crawl job")
1073
+
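The two helpers above can be combined into a simple guard, sketched here with an arbitrary error threshold and the `job` handle from the previous sketch:

    report = app.check_crawl_errors(job.id)
    if report.robotsBlocked:
        print("Blocked by robots.txt:", report.robotsBlocked)
    if len(report.errors) > 10:         # illustrative threshold
        outcome = app.cancel_crawl(job.id)
        print("Cancelled:", outcome.get("success"))
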
1074
+ def crawl_url_and_watch(
1075
+ self,
1076
+ url: str,
1077
+ *,
1078
+ include_paths: Optional[List[str]] = None,
1079
+ exclude_paths: Optional[List[str]] = None,
1080
+ max_depth: Optional[int] = None,
1081
+ max_discovery_depth: Optional[int] = None,
1082
+ limit: Optional[int] = None,
1083
+ allow_backward_links: Optional[bool] = None,
1084
+ crawl_entire_domain: Optional[bool] = None,
1085
+ allow_external_links: Optional[bool] = None,
1086
+ ignore_sitemap: Optional[bool] = None,
1087
+ scrape_options: Optional[ScrapeOptions] = None,
1088
+ webhook: Optional[Union[str, WebhookConfig]] = None,
1089
+ deduplicate_similar_urls: Optional[bool] = None,
1090
+ ignore_query_parameters: Optional[bool] = None,
1091
+ regex_on_full_url: Optional[bool] = None,
1092
+ delay: Optional[int] = None,
1093
+ allow_subdomains: Optional[bool] = None,
1094
+ max_concurrency: Optional[int] = None,
1095
+ idempotency_key: Optional[str] = None,
1096
+ **kwargs
1097
+ ) -> 'CrawlWatcher':
1098
+ """
1099
+ Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
1100
+
1101
+ Args:
1102
+ url (str): Target URL to start crawling from
1103
+ include_paths (Optional[List[str]]): Patterns of URLs to include
1104
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
1105
+ max_depth (Optional[int]): Maximum crawl depth
1106
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
1107
+ limit (Optional[int]): Maximum pages to crawl
1108
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
1109
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
1110
+ allow_external_links (Optional[bool]): Follow external domain links
1111
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1112
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
1113
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
1114
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
1115
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
1116
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
1117
+ delay (Optional[int]): Delay in seconds between scrapes
1118
+ allow_subdomains (Optional[bool]): Follow subdomains
1119
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1120
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1121
+ **kwargs: Additional parameters to pass to the API
1122
+
1123
+ Returns:
1124
+ CrawlWatcher: An instance to monitor the crawl job via WebSocket
1125
+
1126
+ Raises:
1127
+ Exception: If crawl job fails to start
1128
+ """
1129
+ crawl_response = self.async_crawl_url(
1130
+ url,
1131
+ include_paths=include_paths,
1132
+ exclude_paths=exclude_paths,
1133
+ max_depth=max_depth,
1134
+ max_discovery_depth=max_discovery_depth,
1135
+ limit=limit,
1136
+ allow_backward_links=allow_backward_links,
1137
+ allow_external_links=allow_external_links,
1138
+ ignore_sitemap=ignore_sitemap,
1139
+ scrape_options=scrape_options,
1140
+ webhook=webhook,
1141
+ deduplicate_similar_urls=deduplicate_similar_urls,
1142
+ ignore_query_parameters=ignore_query_parameters,
1143
+ regex_on_full_url=regex_on_full_url,
1144
+ delay=delay,
1145
+ allow_subdomains=allow_subdomains,
1146
+ max_concurrency=max_concurrency,
1147
+ idempotency_key=idempotency_key,
1148
+ **kwargs
1149
+ )
1150
+ if crawl_response.success and crawl_response.id:
1151
+ return CrawlWatcher(crawl_response.id, self)
1152
+ else:
1153
+ raise Exception("Crawl job failed to start")
1154
+
1155
+ def map_url(
1156
+ self,
1157
+ url: str,
1158
+ *,
1159
+ search: Optional[str] = None,
1160
+ ignore_sitemap: Optional[bool] = None,
1161
+ include_subdomains: Optional[bool] = None,
1162
+ sitemap_only: Optional[bool] = None,
1163
+ limit: Optional[int] = None,
1164
+ timeout: Optional[int] = None,
1165
+ use_index: Optional[bool] = None,
1166
+ **kwargs) -> MapResponse:
1167
+ """
1168
+ Map and discover links from a URL.
1169
+
1170
+ Args:
1171
+ url (str): Target URL to map
1172
+ search (Optional[str]): Filter pattern for URLs
1173
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1174
+ include_subdomains (Optional[bool]): Include subdomain links
1175
+ sitemap_only (Optional[bool]): Only use sitemap.xml
1176
+ limit (Optional[int]): Maximum URLs to return
1177
+ timeout (Optional[int]): Request timeout in milliseconds
1178
+ **kwargs: Additional parameters to pass to the API
1179
+
1180
+ Returns:
1181
+ MapResponse: Response containing:
1182
+ * success (bool): Whether request succeeded
1183
+ * links (List[str]): Discovered URLs
1184
+ * error (Optional[str]): Error message if any
1185
+
1186
+ Raises:
1187
+ Exception: If mapping fails or response cannot be parsed
1188
+ """
1189
+ # Validate any additional kwargs
1190
+ self._validate_kwargs(kwargs, "map_url")
1191
+
1192
+ # Build map parameters
1193
+ map_params = {}
1194
+
1195
+ # Add individual parameters
1196
+ if search is not None:
1197
+ map_params['search'] = search
1198
+ if ignore_sitemap is not None:
1199
+ map_params['ignoreSitemap'] = ignore_sitemap
1200
+ if include_subdomains is not None:
1201
+ map_params['includeSubdomains'] = include_subdomains
1202
+ if sitemap_only is not None:
1203
+ map_params['sitemapOnly'] = sitemap_only
1204
+ if limit is not None:
1205
+ map_params['limit'] = limit
1206
+ if timeout is not None:
1207
+ map_params['timeout'] = timeout
1208
+ if use_index is not None:
1209
+ map_params['useIndex'] = use_index
1210
+
1211
+ # Add any additional kwargs
1212
+ map_params.update(kwargs)
1213
+
1214
+ # Create final params object
1215
+ final_params = MapParams(**map_params)
1216
+ params_dict = final_params.dict(exclude_none=True)
1217
+ params_dict['url'] = url
1218
+ params_dict['origin'] = f"python-sdk@{version}"
1219
+
1220
+ # Make request
1221
+ response = requests.post(
1222
+ f"{self.api_url}/v1/map",
1223
+ headers={"Authorization": f"Bearer {self.api_key}"},
1224
+ json=params_dict
1225
+ )
1226
+
1227
+ if response.status_code == 200:
1228
+ try:
1229
+ response_json = response.json()
1230
+ if response_json.get('success') and 'links' in response_json:
1231
+ return MapResponse(**response_json)
1232
+ elif "error" in response_json:
1233
+ raise Exception(f'Map failed. Error: {response_json["error"]}')
1234
+ else:
1235
+ raise Exception(f'Map failed. Error: {response_json}')
1236
+ except ValueError:
1237
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1238
+ else:
1239
+ self._handle_error(response, 'map')
1240
+
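A usage sketch for map_url (domain and search filter are placeholders):

    mapped = app.map_url("https://example.com", search="blog", limit=50)
    if mapped.success:
        for link in mapped.links or []:
            print(link)
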
1241
+ def batch_scrape_urls(
1242
+ self,
1243
+ urls: List[str],
1244
+ *,
1245
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1246
+ headers: Optional[Dict[str, str]] = None,
1247
+ include_tags: Optional[List[str]] = None,
1248
+ exclude_tags: Optional[List[str]] = None,
1249
+ only_main_content: Optional[bool] = None,
1250
+ wait_for: Optional[int] = None,
1251
+ timeout: Optional[int] = None,
1252
+ location: Optional[LocationConfig] = None,
1253
+ mobile: Optional[bool] = None,
1254
+ skip_tls_verification: Optional[bool] = None,
1255
+ remove_base64_images: Optional[bool] = None,
1256
+ block_ads: Optional[bool] = None,
1257
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1258
+ extract: Optional[JsonConfig] = None,
1259
+ json_options: Optional[JsonConfig] = None,
1260
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1261
+ agent: Optional[AgentOptions] = None,
1262
+ poll_interval: Optional[int] = 2,
1263
+ max_concurrency: Optional[int] = None,
1264
+ idempotency_key: Optional[str] = None,
1265
+ **kwargs
1266
+ ) -> BatchScrapeStatusResponse:
1267
+ """
1268
+ Batch scrape multiple URLs and monitor until completion.
1269
+
1270
+ Args:
1271
+ urls (List[str]): URLs to scrape
1272
+ formats (Optional[List[Literal]]): Content formats to retrieve
1273
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1274
+ include_tags (Optional[List[str]]): HTML tags to include
1275
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1276
+ only_main_content (Optional[bool]): Extract main content only
1277
+ wait_for (Optional[int]): Wait time in milliseconds
1278
+ timeout (Optional[int]): Request timeout in milliseconds
1279
+ location (Optional[LocationConfig]): Location configuration
1280
+ mobile (Optional[bool]): Use mobile user agent
1281
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1282
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1283
+ block_ads (Optional[bool]): Block advertisements
1284
+ proxy (Optional[Literal]): Proxy type to use
1285
+ extract (Optional[JsonConfig]): Content extraction config
1286
+ json_options (Optional[JsonConfig]): JSON extraction config
1287
+ actions (Optional[List[Union]]): Actions to perform
1288
+ agent (Optional[AgentOptions]): Agent configuration
1289
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1290
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
1291
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1292
+ **kwargs: Additional parameters to pass to the API
1293
+
1294
+ Returns:
1295
+ BatchScrapeStatusResponse with:
1296
+ * Scraping status and progress
1297
+ * Scraped content for each URL
1298
+ * Success/error information
1299
+
1300
+ Raises:
1301
+ Exception: If batch scrape fails
1302
+ """
1303
+ # Validate any additional kwargs
1304
+ self._validate_kwargs(kwargs, "batch_scrape_urls")
1305
+
1306
+ scrape_params = {}
1307
+
1308
+ # Add individual parameters
1309
+ if formats is not None:
1310
+ scrape_params['formats'] = formats
1311
+ if headers is not None:
1312
+ scrape_params['headers'] = headers
1313
+ if include_tags is not None:
1314
+ scrape_params['includeTags'] = include_tags
1315
+ if exclude_tags is not None:
1316
+ scrape_params['excludeTags'] = exclude_tags
1317
+ if only_main_content is not None:
1318
+ scrape_params['onlyMainContent'] = only_main_content
1319
+ if wait_for is not None:
1320
+ scrape_params['waitFor'] = wait_for
1321
+ if timeout is not None:
1322
+ scrape_params['timeout'] = timeout
1323
+ if location is not None:
1324
+ scrape_params['location'] = location.dict(exclude_none=True)
1325
+ if mobile is not None:
1326
+ scrape_params['mobile'] = mobile
1327
+ if skip_tls_verification is not None:
1328
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1329
+ if remove_base64_images is not None:
1330
+ scrape_params['removeBase64Images'] = remove_base64_images
1331
+ if block_ads is not None:
1332
+ scrape_params['blockAds'] = block_ads
1333
+ if proxy is not None:
1334
+ scrape_params['proxy'] = proxy
1335
+ if extract is not None:
1336
+ extract = self._ensure_schema_dict(extract)
1337
+ if isinstance(extract, dict) and "schema" in extract:
1338
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1339
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1340
+ if json_options is not None:
1341
+ json_options = self._ensure_schema_dict(json_options)
1342
+ if isinstance(json_options, dict) and "schema" in json_options:
1343
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1344
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1345
+ if actions is not None:
1346
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1347
+ if agent is not None:
1348
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1349
+ if max_concurrency is not None:
1350
+ scrape_params['maxConcurrency'] = max_concurrency
1351
+
1352
+ # Add any additional kwargs
1353
+ scrape_params.update(kwargs)
1354
+
1355
+ # Create final params object
1356
+ final_params = ScrapeParams(**scrape_params)
1357
+ params_dict = final_params.dict(exclude_none=True)
1358
+ params_dict['urls'] = urls
1359
+ params_dict['origin'] = f"python-sdk@{version}"
1360
+
1361
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1362
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1363
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1364
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1365
+
1366
+ # Make request
1367
+ headers = self._prepare_headers(idempotency_key)
1368
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1369
+
1370
+ if response.status_code == 200:
1371
+ try:
1372
+ id = response.json().get('id')
1373
+ except:
1374
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1375
+ return self._monitor_job_status(id, headers, poll_interval)
1376
+ else:
1377
+ self._handle_error(response, 'start batch scrape job')
1378
+
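A usage sketch for the blocking batch scrape above (URLs illustrative); it waits until every URL has been processed and returns a BatchScrapeStatusResponse:

    batch = app.batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
        max_concurrency=2,
    )
    print(batch.status, "credits used:", batch.creditsUsed)
    for doc in batch.data:
        print(doc.metadata)
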
1379
+ def async_batch_scrape_urls(
1380
+ self,
1381
+ urls: List[str],
1382
+ *,
1383
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1384
+ headers: Optional[Dict[str, str]] = None,
1385
+ include_tags: Optional[List[str]] = None,
1386
+ exclude_tags: Optional[List[str]] = None,
1387
+ only_main_content: Optional[bool] = None,
1388
+ wait_for: Optional[int] = None,
1389
+ timeout: Optional[int] = None,
1390
+ location: Optional[LocationConfig] = None,
1391
+ mobile: Optional[bool] = None,
1392
+ skip_tls_verification: Optional[bool] = None,
1393
+ remove_base64_images: Optional[bool] = None,
1394
+ block_ads: Optional[bool] = None,
1395
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1396
+ extract: Optional[JsonConfig] = None,
1397
+ json_options: Optional[JsonConfig] = None,
1398
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1399
+ agent: Optional[AgentOptions] = None,
1400
+ max_concurrency: Optional[int] = None,
1401
+ idempotency_key: Optional[str] = None,
1402
+ **kwargs
1403
+ ) -> BatchScrapeResponse:
1404
+ """
1405
+ Initiate a batch scrape job without waiting for it to complete.
1406
+
1407
+ Args:
1408
+ urls (List[str]): URLs to scrape
1409
+ formats (Optional[List[Literal]]): Content formats to retrieve
1410
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1411
+ include_tags (Optional[List[str]]): HTML tags to include
1412
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1413
+ only_main_content (Optional[bool]): Extract main content only
1414
+ wait_for (Optional[int]): Wait time in milliseconds
1415
+ timeout (Optional[int]): Request timeout in milliseconds
1416
+ location (Optional[LocationConfig]): Location configuration
1417
+ mobile (Optional[bool]): Use mobile user agent
1418
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1419
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1420
+ block_ads (Optional[bool]): Block advertisements
1421
+ proxy (Optional[Literal]): Proxy type to use
1422
+ extract (Optional[JsonConfig]): Content extraction config
1423
+ json_options (Optional[JsonConfig]): JSON extraction config
1424
+ actions (Optional[List[Union]]): Actions to perform
1425
+ agent (Optional[AgentOptions]): Agent configuration
1426
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1427
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1428
+ **kwargs: Additional parameters to pass to the API
1429
+
1430
+ Returns:
1431
+ BatchScrapeResponse with:
1432
+ * success - Whether job started successfully
1433
+ * id - Unique identifier for the job
1434
+ * url - Status check URL
1435
+ * error - Error message if start failed
1436
+
1437
+ Raises:
1438
+ Exception: If job initiation fails
1439
+ """
1440
+ # Validate any additional kwargs
1441
+ self._validate_kwargs(kwargs, "async_batch_scrape_urls")
1442
+
1443
+ scrape_params = {}
1444
+
1445
+ # Add individual parameters
1446
+ if formats is not None:
1447
+ scrape_params['formats'] = formats
1448
+ if headers is not None:
1449
+ scrape_params['headers'] = headers
1450
+ if include_tags is not None:
1451
+ scrape_params['includeTags'] = include_tags
1452
+ if exclude_tags is not None:
1453
+ scrape_params['excludeTags'] = exclude_tags
1454
+ if only_main_content is not None:
1455
+ scrape_params['onlyMainContent'] = only_main_content
1456
+ if wait_for is not None:
1457
+ scrape_params['waitFor'] = wait_for
1458
+ if timeout is not None:
1459
+ scrape_params['timeout'] = timeout
1460
+ if location is not None:
1461
+ scrape_params['location'] = location.dict(exclude_none=True)
1462
+ if mobile is not None:
1463
+ scrape_params['mobile'] = mobile
1464
+ if skip_tls_verification is not None:
1465
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1466
+ if remove_base64_images is not None:
1467
+ scrape_params['removeBase64Images'] = remove_base64_images
1468
+ if block_ads is not None:
1469
+ scrape_params['blockAds'] = block_ads
1470
+ if proxy is not None:
1471
+ scrape_params['proxy'] = proxy
1472
+ if extract is not None:
1473
+ extract = self._ensure_schema_dict(extract)
1474
+ if isinstance(extract, dict) and "schema" in extract:
1475
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1476
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1477
+ if json_options is not None:
1478
+ json_options = self._ensure_schema_dict(json_options)
1479
+ if isinstance(json_options, dict) and "schema" in json_options:
1480
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1481
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1482
+ if actions is not None:
1483
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1484
+ if agent is not None:
1485
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1486
+ if max_concurrency is not None:
1487
+ scrape_params['maxConcurrency'] = max_concurrency
1488
+
1489
+ # Add any additional kwargs
1490
+ scrape_params.update(kwargs)
1491
+
1492
+ # Create final params object
1493
+ final_params = ScrapeParams(**scrape_params)
1494
+ params_dict = final_params.dict(exclude_none=True)
1495
+ params_dict['urls'] = urls
1496
+ params_dict['origin'] = f"python-sdk@{version}"
1497
+
1498
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1499
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1500
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1501
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1502
+
1503
+ # Make request
1504
+ headers = self._prepare_headers(idempotency_key)
1505
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1506
+
1507
+ if response.status_code == 200:
1508
+ try:
1509
+ return BatchScrapeResponse(**response.json())
1510
+ except:
1511
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1512
+ else:
1513
+ self._handle_error(response, 'start batch scrape job')
1514
+
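# Usage sketch: async_batch_scrape_urls only submits the job; the returned id is
# then polled with check_batch_scrape_status (defined below). `app` is assumed
# to be an existing FirecrawlApp instance; URLs are placeholders.
#
# import time
#
# job = app.async_batch_scrape_urls(["https://example.com"], formats=["markdown"])
# while True:
#     status = app.check_batch_scrape_status(job.id)
#     if status.status in ("completed", "failed"):
#         break
#     time.sleep(5)
# print(status.status, status.completed, "of", status.total)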
1515
+ def batch_scrape_urls_and_watch(
1516
+ self,
1517
+ urls: List[str],
1518
+ *,
1519
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1520
+ headers: Optional[Dict[str, str]] = None,
1521
+ include_tags: Optional[List[str]] = None,
1522
+ exclude_tags: Optional[List[str]] = None,
1523
+ only_main_content: Optional[bool] = None,
1524
+ wait_for: Optional[int] = None,
1525
+ timeout: Optional[int] = None,
1526
+ location: Optional[LocationConfig] = None,
1527
+ mobile: Optional[bool] = None,
1528
+ skip_tls_verification: Optional[bool] = None,
1529
+ remove_base64_images: Optional[bool] = None,
1530
+ block_ads: Optional[bool] = None,
1531
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1532
+ extract: Optional[JsonConfig] = None,
1533
+ json_options: Optional[JsonConfig] = None,
1534
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1535
+ agent: Optional[AgentOptions] = None,
1536
+ max_concurrency: Optional[int] = None,
1537
+ idempotency_key: Optional[str] = None,
1538
+ **kwargs
1539
+ ) -> 'CrawlWatcher':
1540
+ """
1541
+ Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
1542
+
1543
+ Args:
1544
+ urls (List[str]): URLs to scrape
1545
+ formats (Optional[List[Literal]]): Content formats to retrieve
1546
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1547
+ include_tags (Optional[List[str]]): HTML tags to include
1548
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1549
+ only_main_content (Optional[bool]): Extract main content only
1550
+ wait_for (Optional[int]): Wait time in milliseconds
1551
+ timeout (Optional[int]): Request timeout in milliseconds
1552
+ location (Optional[LocationConfig]): Location configuration
1553
+ mobile (Optional[bool]): Use mobile user agent
1554
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1555
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1556
+ block_ads (Optional[bool]): Block advertisements
1557
+ proxy (Optional[Literal]): Proxy type to use
1558
+ extract (Optional[JsonConfig]): Content extraction config
1559
+ json_options (Optional[JsonConfig]): JSON extraction config
1560
+ actions (Optional[List[Union]]): Actions to perform
1561
+ agent (Optional[AgentOptions]): Agent configuration
1562
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1563
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1564
+ **kwargs: Additional parameters to pass to the API
1565
+
1566
+ Returns:
1567
+ CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
1568
+
1569
+ Raises:
1570
+ Exception: If batch scrape job fails to start
1571
+ """
1572
+ # Validate any additional kwargs
1573
+ self._validate_kwargs(kwargs, "batch_scrape_urls_and_watch")
1574
+
1575
+ scrape_params = {}
1576
+
1577
+ # Add individual parameters
1578
+ if formats is not None:
1579
+ scrape_params['formats'] = formats
1580
+ if headers is not None:
1581
+ scrape_params['headers'] = headers
1582
+ if include_tags is not None:
1583
+ scrape_params['includeTags'] = include_tags
1584
+ if exclude_tags is not None:
1585
+ scrape_params['excludeTags'] = exclude_tags
1586
+ if only_main_content is not None:
1587
+ scrape_params['onlyMainContent'] = only_main_content
1588
+ if wait_for is not None:
1589
+ scrape_params['waitFor'] = wait_for
1590
+ if timeout is not None:
1591
+ scrape_params['timeout'] = timeout
1592
+ if location is not None:
1593
+ scrape_params['location'] = location.dict(exclude_none=True)
1594
+ if mobile is not None:
1595
+ scrape_params['mobile'] = mobile
1596
+ if skip_tls_verification is not None:
1597
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1598
+ if remove_base64_images is not None:
1599
+ scrape_params['removeBase64Images'] = remove_base64_images
1600
+ if block_ads is not None:
1601
+ scrape_params['blockAds'] = block_ads
1602
+ if proxy is not None:
1603
+ scrape_params['proxy'] = proxy
1604
+ if extract is not None:
1605
+ extract = self._ensure_schema_dict(extract)
1606
+ if isinstance(extract, dict) and "schema" in extract:
1607
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1608
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1609
+ if json_options is not None:
1610
+ json_options = self._ensure_schema_dict(json_options)
1611
+ if isinstance(json_options, dict) and "schema" in json_options:
1612
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1613
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1614
+ if actions is not None:
1615
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1616
+ if agent is not None:
1617
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1618
+ if max_concurrency is not None:
1619
+ scrape_params['maxConcurrency'] = max_concurrency
1620
+
1621
+ # Add any additional kwargs
1622
+ scrape_params.update(kwargs)
1623
+
1624
+ # Create final params object
1625
+ final_params = ScrapeParams(**scrape_params)
1626
+ params_dict = final_params.dict(exclude_none=True)
1627
+ params_dict['urls'] = urls
1628
+ params_dict['origin'] = f"python-sdk@{version}"
1629
+
1630
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1631
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1632
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1633
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1634
+
1635
+ # Make request
1636
+ headers = self._prepare_headers(idempotency_key)
1637
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1638
+
1639
+ if response.status_code == 200:
1640
+ try:
1641
+ crawl_response = BatchScrapeResponse(**response.json())
1642
+ except Exception:
1643
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1644
+ if crawl_response.success and crawl_response.id:
1645
+ return CrawlWatcher(crawl_response.id, self)
1646
+ else:
1647
+ raise Exception("Batch scrape job failed to start")
1648
+ else:
1649
+ self._handle_error(response, 'start batch scrape job')
1650
+
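# Usage sketch: the watcher variant streams results over WebSocket. Handlers are
# registered per event type and connect() drives the listening loop; `app` is an
# assumed FirecrawlApp instance and the URL is a placeholder.
#
# import asyncio
#
# watcher = app.batch_scrape_urls_and_watch(["https://example.com"], formats=["markdown"])
# watcher.add_event_listener("document", lambda detail: print("document for job", detail["id"]))
# watcher.add_event_listener("done", lambda detail: print("finished with", len(detail["data"]), "documents"))
# asyncio.run(watcher.connect())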
1651
+ def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
1652
+ """
1653
+ Check the status of a batch scrape job using the Firecrawl API.
1654
+
1655
+ Args:
1656
+ id (str): The ID of the batch scrape job.
1657
+
1658
+ Returns:
1659
+ BatchScrapeStatusResponse: The status of the batch scrape job.
1660
+
1661
+ Raises:
1662
+ Exception: If the status check request fails.
1663
+ """
1664
+ endpoint = f'/v1/batch/scrape/{id}'
1665
+
1666
+ headers = self._prepare_headers()
1667
+ response = self._get_request(f'{self.api_url}{endpoint}', headers)
1668
+ if response.status_code == 200:
1669
+ try:
1670
+ status_data = response.json()
1671
+ except:
1672
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1673
+ if status_data['status'] == 'completed':
1674
+ if 'data' in status_data:
1675
+ data = status_data['data']
1676
+ while 'next' in status_data:
1677
+ if len(status_data['data']) == 0:
1678
+ break
1679
+ next_url = status_data.get('next')
1680
+ if not next_url:
1681
+ logger.warning("Expected 'next' URL is missing.")
1682
+ break
1683
+ try:
1684
+ status_response = self._get_request(next_url, headers)
1685
+ if status_response.status_code != 200:
1686
+ logger.error(f"Failed to fetch next page: {status_response.status_code}")
1687
+ break
1688
+ try:
1689
+ next_data = status_response.json()
1690
+ except:
1691
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1692
+ data.extend(next_data.get('data', []))
1693
+ status_data = next_data
1694
+ except Exception as e:
1695
+ logger.error(f"Error during pagination request: {e}")
1696
+ break
1697
+ status_data['data'] = data
1698
+
1699
+ return BatchScrapeStatusResponse(**{
1700
+ 'success': 'error' not in status_data,
1701
+ 'status': status_data.get('status'),
1702
+ 'total': status_data.get('total'),
1703
+ 'completed': status_data.get('completed'),
1704
+ 'creditsUsed': status_data.get('creditsUsed'),
1705
+ 'expiresAt': status_data.get('expiresAt'),
1706
+ 'data': status_data.get('data'),
1707
+ 'next': status_data.get('next'),
1708
+ 'error': status_data.get('error')
1709
+ })
1710
+ else:
1711
+ self._handle_error(response, 'check batch scrape status')
1712
+
1713
+ def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
1714
+ """
1715
+ Returns information about batch scrape errors.
1716
+
1717
+ Args:
1718
+ id (str): The ID of the crawl job.
1719
+
1720
+ Returns:
1721
+ CrawlErrorsResponse containing:
1722
+ * errors (List[Dict[str, str]]): List of errors with fields:
1723
+ * id (str): Error ID
1724
+ * timestamp (str): When the error occurred
1725
+ * url (str): URL that caused the error
1726
+ * error (str): Error message
1727
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1728
+
1729
+ Raises:
1730
+ Exception: If the error check request fails
1731
+ """
1732
+ headers = self._prepare_headers()
1733
+ response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
1734
+ if response.status_code == 200:
1735
+ try:
1736
+ return CrawlErrorsResponse(**response.json())
1737
+ except:
1738
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1739
+ else:
1740
+ self._handle_error(response, "check batch scrape errors")
1741
+
1742
+ def extract(
1743
+ self,
1744
+ urls: Optional[List[str]] = None,
1745
+ *,
1746
+ prompt: Optional[str] = None,
1747
+ schema: Optional[Any] = None,
1748
+ system_prompt: Optional[str] = None,
1749
+ allow_external_links: Optional[bool] = False,
1750
+ enable_web_search: Optional[bool] = False,
1751
+ show_sources: Optional[bool] = False,
1752
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1753
+ """
1754
+ Extract structured information from URLs.
1755
+
1756
+ Args:
1757
+ urls (Optional[List[str]]): URLs to extract from
1758
+ prompt (Optional[str]): Custom extraction prompt
1759
+ schema (Optional[Any]): JSON schema/Pydantic model
1760
+ system_prompt (Optional[str]): System context
1761
+ allow_external_links (Optional[bool]): Follow external links
1762
+ enable_web_search (Optional[bool]): Enable web search
1763
+ show_sources (Optional[bool]): Include source URLs
1764
+ agent (Optional[Dict[str, Any]]): Agent configuration
1765
+
1766
+ Returns:
1767
+ ExtractResponse[Any] with:
1768
+ * success (bool): Whether request succeeded
1769
+ * data (Optional[Any]): Extracted data matching schema
1770
+ * error (Optional[str]): Error message if any
1771
+
1772
+ Raises:
1773
+ ValueError: If prompt/schema missing or extraction fails
1774
+ """
1775
+ headers = self._prepare_headers()
1776
+
1777
+ if not prompt and not schema:
1778
+ raise ValueError("Either prompt or schema is required")
1779
+
1780
+ if not urls and not prompt:
1781
+ raise ValueError("Either urls or prompt is required")
1782
+
1783
+ if schema:
1784
+ schema = self._ensure_schema_dict(schema)
1785
+
1786
+ request_data = {
1787
+ 'urls': urls or [],
1788
+ 'allowExternalLinks': allow_external_links,
1789
+ 'enableWebSearch': enable_web_search,
1790
+ 'showSources': show_sources,
1791
+ 'schema': schema,
1792
+ 'origin': f'python-sdk@{version}'
1793
+ }
1794
+
1795
+ # Only add prompt and systemPrompt if they exist
1796
+ if prompt:
1797
+ request_data['prompt'] = prompt
1798
+ if system_prompt:
1799
+ request_data['systemPrompt'] = system_prompt
1800
+
1801
+ if agent:
1802
+ request_data['agent'] = agent
1803
+
1804
+ try:
1805
+ # Send the initial extract request
1806
+ response = self._post_request(
1807
+ f'{self.api_url}/v1/extract',
1808
+ request_data,
1809
+ headers
1810
+ )
1811
+ if response.status_code == 200:
1812
+ try:
1813
+ data = response.json()
1814
+ except:
1815
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1816
+ if data['success']:
1817
+ job_id = data.get('id')
1818
+ if not job_id:
1819
+ raise Exception('Job ID not returned from extract request.')
1820
+
1821
+ # Poll for the extract status
1822
+ while True:
1823
+ status_response = self._get_request(
1824
+ f'{self.api_url}/v1/extract/{job_id}',
1825
+ headers
1826
+ )
1827
+ if status_response.status_code == 200:
1828
+ try:
1829
+ status_data = status_response.json()
1830
+ except:
1831
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1832
+ if status_data['status'] == 'completed':
1833
+ return ExtractResponse(**status_data)
1834
+ elif status_data['status'] in ['failed', 'cancelled']:
1835
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
1836
+ else:
1837
+ self._handle_error(status_response, "extract-status")
1838
+
1839
+ time.sleep(2) # Polling interval
1840
+ else:
1841
+ raise Exception(f'Failed to extract. Error: {data["error"]}')
1842
+ else:
1843
+ self._handle_error(response, "extract")
1844
+ except Exception as e:
1845
+ raise ValueError(str(e))
1846
+
1847
+ return ExtractResponse(success=False, error="Internal server error.")
1848
+
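# Usage sketch: extract() accepts a JSON-schema dict or a Pydantic model class as
# `schema` (normalized by _ensure_schema_dict below). The model and URL are
# illustrative placeholders; `app` is an assumed FirecrawlApp instance.
#
# from pydantic import BaseModel
#
# class ArticleInfo(BaseModel):
#     title: str
#     author: str
#
# res = app.extract(
#     urls=["https://example.com/post"],
#     prompt="Extract the article title and author.",
#     schema=ArticleInfo,
# )
# if res.success:
#     print(res.data)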
1849
+ def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
1850
+ """
1851
+ Retrieve the status of an extract job.
1852
+
1853
+ Args:
1854
+ job_id (str): The ID of the extract job.
1855
+
1856
+ Returns:
1857
+ ExtractResponse[Any]: The status of the extract job.
1858
+
1859
+ Raises:
1860
+ ValueError: If there is an error retrieving the status.
1861
+ """
1862
+ headers = self._prepare_headers()
1863
+ try:
1864
+ response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
1865
+ if response.status_code == 200:
1866
+ try:
1867
+ return ExtractResponse(**response.json())
1868
+ except:
1869
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1870
+ else:
1871
+ self._handle_error(response, "get extract status")
1872
+ except Exception as e:
1873
+ raise ValueError(str(e))
1874
+
1875
+ def async_extract(
1876
+ self,
1877
+ urls: Optional[List[str]] = None,
1878
+ *,
1879
+ prompt: Optional[str] = None,
1880
+ schema: Optional[Any] = None,
1881
+ system_prompt: Optional[str] = None,
1882
+ allow_external_links: Optional[bool] = False,
1883
+ enable_web_search: Optional[bool] = False,
1884
+ show_sources: Optional[bool] = False,
1885
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1886
+ """
1887
+ Initiate an asynchronous extract job.
1888
+
1889
+ Args:
1890
+ urls (Optional[List[str]]): URLs to extract information from
1891
+ prompt (Optional[str]): Custom extraction prompt
1892
+ schema (Optional[Any]): JSON schema/Pydantic model
1893
+ system_prompt (Optional[str]): System context
1894
+ allow_external_links (Optional[bool]): Follow external links
1895
+ enable_web_search (Optional[bool]): Enable web search
1896
+ show_sources (Optional[bool]): Include source URLs
1897
+ agent (Optional[Dict[str, Any]]): Agent configuration
1898
+
1900
+ Returns:
1901
+ ExtractResponse[Any] with:
1902
+ * success (bool): Whether request succeeded
1903
+ * data (Optional[Any]): Extracted data matching schema
1904
+ * error (Optional[str]): Error message if any
1905
+
1906
+ Raises:
1907
+ ValueError: If job initiation fails
1908
+ """
1909
+ headers = self._prepare_headers()
1910
+
1911
+ if schema:
1913
+ schema = self._ensure_schema_dict(schema)
1914
+
1915
+ request_data = {
1916
+ 'urls': urls or [],
1917
+ 'allowExternalLinks': allow_external_links,
1918
+ 'enableWebSearch': enable_web_search,
1919
+ 'showSources': show_sources,
1920
+ 'schema': schema,
1921
+ 'origin': f'python-sdk@{version}'
1922
+ }
1923
+
1924
+ if prompt:
1925
+ request_data['prompt'] = prompt
1926
+ if system_prompt:
1927
+ request_data['systemPrompt'] = system_prompt
1928
+ if agent:
1929
+ request_data['agent'] = agent
1930
+
1931
+ try:
1932
+ response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
1933
+ if response.status_code == 200:
1934
+ try:
1935
+ return ExtractResponse(**response.json())
1936
+ except:
1937
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1938
+ else:
1939
+ self._handle_error(response, "async extract")
1940
+ except Exception as e:
1941
+ raise ValueError(str(e))
1942
+
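# Usage sketch: async_extract submits the job and get_extract_status is polled
# separately. Assumes the ExtractResponse model exposes the `id` and `status`
# fields returned by the API; `app` is an assumed FirecrawlApp instance.
#
# import time
#
# job = app.async_extract(urls=["https://example.com"], prompt="Summarize the page.")
# while True:
#     state = app.get_extract_status(job.id)
#     if state.status in ("completed", "failed", "cancelled"):
#         break
#     time.sleep(2)
# print(state.data)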
1943
+ def generate_llms_text(
1944
+ self,
1945
+ url: str,
1946
+ *,
1947
+ max_urls: Optional[int] = None,
1948
+ show_full_text: Optional[bool] = None,
1949
+ cache: Optional[bool] = None,
1950
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
1951
+ """
1952
+ Generate LLMs.txt for a given URL and poll until completion.
1953
+
1954
+ Args:
1955
+ url (str): Target URL to generate LLMs.txt from
1956
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
1957
+ show_full_text (Optional[bool]): Include full text in output (default: False)
1958
+ cache (Optional[bool]): Whether to use cached content if available (default: True)
1959
+ experimental_stream (Optional[bool]): Enable experimental streaming
1960
+
1961
+ Returns:
1962
+ GenerateLLMsTextStatusResponse with:
1963
+ * Generated LLMs.txt content
1964
+ * Full version if requested
1965
+ * Generation status
1966
+ * Success/error information
1967
+
1968
+ Raises:
1969
+ Exception: If generation fails
1970
+ """
1971
+ params = GenerateLLMsTextParams(
1972
+ maxUrls=max_urls,
1973
+ showFullText=show_full_text,
1974
+ cache=cache,
1975
+ __experimental_stream=experimental_stream
1976
+ )
1977
+
1978
+ response = self.async_generate_llms_text(
1979
+ url,
1980
+ max_urls=max_urls,
1981
+ show_full_text=show_full_text,
1982
+ cache=cache,
1983
+ experimental_stream=experimental_stream
1984
+ )
1985
+
1986
+ if not response.success or not response.id:
1987
+ return GenerateLLMsTextStatusResponse(
1988
+ success=False,
1989
+ error='Failed to start LLMs.txt generation',
1990
+ status='failed',
1991
+ expiresAt=''
1992
+ )
1993
+
1994
+ job_id = response.id
1995
+ while True:
1996
+ status = self.check_generate_llms_text_status(job_id)
1997
+
1998
+ if status.status == 'completed':
1999
+ return status
2000
+ elif status.status == 'failed':
2001
+ return status
2002
+ elif status.status != 'processing':
2003
+ return GenerateLLMsTextStatusResponse(
2004
+ success=False,
2005
+ error='LLMs.txt generation job terminated unexpectedly',
2006
+ status='failed',
2007
+ expiresAt=''
2008
+ )
2009
+
2010
+ time.sleep(2) # Polling interval
2011
+
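# Usage sketch: generate_llms_text blocks until the job completes or fails and
# returns a GenerateLLMsTextStatusResponse. The URL is a placeholder; `app` is an
# assumed FirecrawlApp instance.
#
# res = app.generate_llms_text("https://example.com", max_urls=5, show_full_text=True)
# if res.success:
#     print(res.status)
#     print(res.data)  # includes 'llmstxt' (and 'llmsfulltxt' when show_full_text=True)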
2012
+ def async_generate_llms_text(
2013
+ self,
2014
+ url: str,
2015
+ *,
2016
+ max_urls: Optional[int] = None,
2017
+ show_full_text: Optional[bool] = None,
2018
+ cache: Optional[bool] = None,
2019
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
2020
+ """
2021
+ Initiate an asynchronous LLMs.txt generation operation.
2022
+
2023
+ Args:
2024
+ url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
2025
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
2026
+ show_full_text (Optional[bool]): Include full text in output (default: False)
2027
+ cache (Optional[bool]): Whether to use cached content if available (default: True)
2028
+ experimental_stream (Optional[bool]): Enable experimental streaming
2029
+
2030
+ Returns:
2031
+ GenerateLLMsTextResponse: A response containing:
2032
+ * success (bool): Whether the generation initiation was successful
2033
+ * id (str): The unique identifier for the generation job
2034
+ * error (str, optional): Error message if initiation failed
2035
+
2036
+ Raises:
2037
+ Exception: If the generation job initiation fails.
2038
+ """
2039
+ params = GenerateLLMsTextParams(
2040
+ maxUrls=max_urls,
2041
+ showFullText=show_full_text,
2042
+ cache=cache,
2043
+ __experimental_stream=experimental_stream
2044
+ )
2045
+
2046
+ headers = self._prepare_headers()
2047
+ json_data = {'url': url, **params.dict(exclude_none=True)}
2048
+ json_data['origin'] = f"python-sdk@{version}"
2049
+
2050
+ try:
2051
+ req = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
2052
+ response = req.json()
2053
+ print("json_data", json_data)
2054
+ print("response", response)
2055
+ if response.get('success'):
2056
+ try:
2057
+ return GenerateLLMsTextResponse(**response)
2058
+ except:
2059
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2060
+ else:
2061
+ self._handle_error(req, 'start LLMs.txt generation')
2062
+ except Exception as e:
2063
+ raise ValueError(str(e))
2064
+
2065
+ return GenerateLLMsTextResponse(
2066
+ success=False,
2067
+ error='Internal server error'
2068
+ )
2069
+
2070
+ def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
2071
+ """
2072
+ Check the status of a LLMs.txt generation operation.
2073
+
2074
+ Args:
2075
+ id (str): The unique identifier of the LLMs.txt generation job to check status for.
2076
+
2077
+ Returns:
2078
+ GenerateLLMsTextStatusResponse: A response containing:
2079
+ * success (bool): Whether the generation was successful
2080
+ * status (str): Status of generation ("processing", "completed", "failed")
2081
+ * data (Dict[str, str], optional): Generated text with fields:
2082
+ * llmstxt (str): Generated LLMs.txt content
2083
+ * llmsfulltxt (str, optional): Full version if requested
2084
+ * error (str, optional): Error message if generation failed
2085
+ * expiresAt (str): When the generated data expires
2086
+
2087
+ Raises:
2088
+ Exception: If the status check fails.
2089
+ """
2090
+ headers = self._prepare_headers()
2091
+ try:
2092
+ response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers)
2093
+ if response.status_code == 200:
2094
+ try:
2095
+ json_data = response.json()
2096
+ return GenerateLLMsTextStatusResponse(**json_data)
2097
+ except Exception as e:
2098
+ raise Exception(f'Failed to parse Firecrawl response as GenerateLLMsTextStatusResponse: {str(e)}')
2099
+ elif response.status_code == 404:
2100
+ raise Exception('LLMs.txt generation job not found')
2101
+ else:
2102
+ self._handle_error(response, 'check LLMs.txt generation status')
2103
+ except Exception as e:
2104
+ raise ValueError(str(e))
2105
+
2106
+ return GenerateLLMsTextStatusResponse(success=False, error='Internal server error', status='failed', expiresAt='')
2107
+
2108
+ def _prepare_headers(
2109
+ self,
2110
+ idempotency_key: Optional[str] = None) -> Dict[str, str]:
2111
+ """
2112
+ Prepare the headers for API requests.
2113
+
2114
+ Args:
2115
+ idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
2116
+
2117
+ Returns:
2118
+ Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
2119
+ """
2120
+ if idempotency_key:
2121
+ return {
2122
+ 'Content-Type': 'application/json',
2123
+ 'Authorization': f'Bearer {self.api_key}',
2124
+ 'x-idempotency-key': idempotency_key
2125
+ }
2126
+
2127
+ return {
2128
+ 'Content-Type': 'application/json',
2129
+ 'Authorization': f'Bearer {self.api_key}',
2130
+ }
2131
+
2132
+ def _post_request(
2133
+ self,
2134
+ url: str,
2135
+ data: Dict[str, Any],
2136
+ headers: Dict[str, str],
2137
+ retries: int = 3,
2138
+ backoff_factor: float = 0.5) -> requests.Response:
2139
+ """
2140
+ Make a POST request with retries.
2141
+
2142
+ Args:
2143
+ url (str): The URL to send the POST request to.
2144
+ data (Dict[str, Any]): The JSON data to include in the POST request.
2145
+ headers (Dict[str, str]): The headers to include in the POST request.
2146
+ retries (int): Number of retries for the request.
2147
+ backoff_factor (float): Backoff factor for retries.
2148
+
2149
+ Returns:
2150
+ requests.Response: The response from the POST request.
2151
+
2152
+ Raises:
2153
+ requests.RequestException: If the request fails after the specified retries.
2154
+ """
2155
+ for attempt in range(retries):
2156
+ response = requests.post(url, headers=headers, json=data, timeout=(((data["timeout"] + 5000) / 1000) if "timeout" in data else None))
2157
+ if response.status_code == 502:
2158
+ time.sleep(backoff_factor * (2 ** attempt))
2159
+ else:
2160
+ return response
2161
+ return response
2162
+
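# Note on the retry helpers: _post_request above (and the GET/DELETE helpers
# below) retry only on HTTP 502 and sleep backoff_factor * (2 ** attempt)
# seconds between attempts, so the defaults retries=3, backoff_factor=0.5 give
# delays of 0.5s, 1.0s and 2.0s:
#
# for attempt in range(3):
#     print(f"attempt {attempt}: wait {0.5 * (2 ** attempt)}s before retrying")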
2163
+ def _get_request(
2164
+ self,
2165
+ url: str,
2166
+ headers: Dict[str, str],
2167
+ retries: int = 3,
2168
+ backoff_factor: float = 0.5) -> requests.Response:
2169
+ """
2170
+ Make a GET request with retries.
2171
+
2172
+ Args:
2173
+ url (str): The URL to send the GET request to.
2174
+ headers (Dict[str, str]): The headers to include in the GET request.
2175
+ retries (int): Number of retries for the request.
2176
+ backoff_factor (float): Backoff factor for retries.
2177
+
2178
+ Returns:
2179
+ requests.Response: The response from the GET request.
2180
+
2181
+ Raises:
2182
+ requests.RequestException: If the request fails after the specified retries.
2183
+ """
2184
+ for attempt in range(retries):
2185
+ response = requests.get(url, headers=headers)
2186
+ if response.status_code == 502:
2187
+ time.sleep(backoff_factor * (2 ** attempt))
2188
+ else:
2189
+ return response
2190
+ return response
2191
+
2192
+ def _delete_request(
2193
+ self,
2194
+ url: str,
2195
+ headers: Dict[str, str],
2196
+ retries: int = 3,
2197
+ backoff_factor: float = 0.5) -> requests.Response:
2198
+ """
2199
+ Make a DELETE request with retries.
2200
+
2201
+ Args:
2202
+ url (str): The URL to send the DELETE request to.
2203
+ headers (Dict[str, str]): The headers to include in the DELETE request.
2204
+ retries (int): Number of retries for the request.
2205
+ backoff_factor (float): Backoff factor for retries.
2206
+
2207
+ Returns:
2208
+ requests.Response: The response from the DELETE request.
2209
+
2210
+ Raises:
2211
+ requests.RequestException: If the request fails after the specified retries.
2212
+ """
2213
+ for attempt in range(retries):
2214
+ response = requests.delete(url, headers=headers)
2215
+ if response.status_code == 502:
2216
+ time.sleep(backoff_factor * (2 ** attempt))
2217
+ else:
2218
+ return response
2219
+ return response
2220
+
2221
+ def _monitor_job_status(
2222
+ self,
2223
+ id: str,
2224
+ headers: Dict[str, str],
2225
+ poll_interval: int) -> CrawlStatusResponse:
2226
+ """
2227
+ Monitor the status of a crawl job until completion.
2228
+
2229
+ Args:
2230
+ id (str): The ID of the crawl job.
2231
+ headers (Dict[str, str]): The headers to include in the status check requests.
2232
+ poll_interval (int): Seconds between status checks.
2233
+
2234
+ Returns:
2235
+ CrawlStatusResponse: The crawl results if the job is completed successfully.
2236
+
2237
+ Raises:
2238
+ Exception: If the job fails or an error occurs during status checks.
2239
+ """
2240
+ while True:
2241
+ api_url = f'{self.api_url}/v1/crawl/{id}'
2242
+
2243
+ status_response = self._get_request(api_url, headers)
2244
+ if status_response.status_code == 200:
2245
+ try:
2246
+ status_data = status_response.json()
2247
+ except:
2248
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
2249
+ if status_data['status'] == 'completed':
2250
+ if 'data' in status_data:
2251
+ data = status_data['data']
2252
+ while 'next' in status_data:
2253
+ if len(status_data['data']) == 0:
2254
+ break
2255
+ status_response = self._get_request(status_data['next'], headers)
2256
+ try:
2257
+ status_data = status_response.json()
2258
+ except:
2259
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
2260
+ data.extend(status_data.get('data', []))
2261
+ status_data['data'] = data
2262
+ return CrawlStatusResponse(**status_data)
2263
+ else:
2264
+ raise Exception('Crawl job completed but no data was returned')
2265
+ elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
2266
+ poll_interval = max(poll_interval, 2)
2267
+ time.sleep(poll_interval) # Wait for the specified interval before checking again
2268
+ else:
2269
+ raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
2270
+ else:
2271
+ self._handle_error(status_response, 'check crawl status')
2272
+
2273
+ def _handle_error(
2274
+ self,
2275
+ response: requests.Response,
2276
+ action: str) -> None:
2277
+ """
2278
+ Handle errors from API responses.
2279
+
2280
+ Args:
2281
+ response (requests.Response): The response object from the API request.
2282
+ action (str): Description of the action that was being performed.
2283
+
2284
+ Raises:
2285
+ Exception: An exception with a message containing the status code and error details from the response.
2286
+ """
2287
+ try:
2288
+ error_message = response.json().get('error', 'No error message provided.')
2289
+ error_details = response.json().get('details', 'No additional error details provided.')
2290
+ except:
2291
+ raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
2292
+
2293
+ message = self._get_error_message(response.status_code, action, error_message, error_details)
2294
+
2295
+ # Raise an HTTPError with the custom message and attach the response
2296
+ raise requests.exceptions.HTTPError(message, response=response)
2297
+
2298
+ def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2299
+ """
2300
+ Generate a standardized error message based on HTTP status code.
2301
+
2302
+ Args:
2303
+ status_code (int): The HTTP status code from the response
2304
+ action (str): Description of the action that was being performed
2305
+ error_message (str): The error message from the API response
2306
+ error_details (str): Additional error details from the API response
2307
+
2308
+ Returns:
2309
+ str: A formatted error message
2310
+ """
2311
+ if status_code == 402:
2312
+ return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
2313
+ elif status_code == 403:
2314
+ message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
2315
+ elif status_code == 408:
2316
+ return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
2317
+ elif status_code == 409:
2318
+ return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
2319
+ elif status_code == 500:
2320
+ return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
2321
+ else:
2322
+ return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
2323
+
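# Usage sketch: non-2xx responses are surfaced to callers as
# requests.exceptions.HTTPError (raised in _handle_error above with the message
# built by _get_error_message), so SDK calls can be wrapped like this:
#
# import requests
#
# try:
#     result = app.batch_scrape_urls(["https://example.com"])
# except requests.exceptions.HTTPError as err:
#     print("Firecrawl API error:", err)
#     if err.response is not None:
#         print("HTTP status:", err.response.status_code)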
2324
+ def deep_research(
2325
+ self,
2326
+ query: str,
2327
+ *,
2328
+ max_depth: Optional[int] = None,
2329
+ time_limit: Optional[int] = None,
2330
+ max_urls: Optional[int] = None,
2331
+ analysis_prompt: Optional[str] = None,
2332
+ system_prompt: Optional[str] = None,
2333
+ __experimental_stream_steps: Optional[bool] = None,
2334
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
2335
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
2336
+ """
2337
+ Initiates a deep research operation on a given query and polls until completion.
2338
+
2339
+ Args:
2340
+ query (str): Research query or topic to investigate
2341
+ max_depth (Optional[int]): Maximum depth of research exploration
2342
+ time_limit (Optional[int]): Time limit in seconds for research
2343
+ max_urls (Optional[int]): Maximum number of URLs to process
2344
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2345
+ system_prompt (Optional[str]): Custom system prompt
2346
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2347
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
2348
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
2349
+
2350
+ Returns:
2351
+ DeepResearchStatusResponse containing:
2352
+ * success (bool): Whether research completed successfully
2353
+ * status (str): Current state (processing/completed/failed)
2354
+ * error (Optional[str]): Error message if failed
2355
+ * id (str): Unique identifier for the research job
2356
+ * data (Any): Research findings and analysis
2357
+ * sources (List[Dict]): List of discovered sources
2358
+ * activities (List[Dict]): Research progress log
2359
+ * summaries (List[str]): Generated research summaries
2360
+
2361
+ Raises:
2362
+ Exception: If research fails
2363
+ """
2364
+ research_params = {}
2365
+ if max_depth is not None:
2366
+ research_params['maxDepth'] = max_depth
2367
+ if time_limit is not None:
2368
+ research_params['timeLimit'] = time_limit
2369
+ if max_urls is not None:
2370
+ research_params['maxUrls'] = max_urls
2371
+ if analysis_prompt is not None:
2372
+ research_params['analysisPrompt'] = analysis_prompt
2373
+ if system_prompt is not None:
2374
+ research_params['systemPrompt'] = system_prompt
2375
+ if __experimental_stream_steps is not None:
2376
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2377
+ research_params = DeepResearchParams(**research_params)
2378
+
2379
+ response = self.async_deep_research(
2380
+ query,
2381
+ max_depth=max_depth,
2382
+ time_limit=time_limit,
2383
+ max_urls=max_urls,
2384
+ analysis_prompt=analysis_prompt,
2385
+ system_prompt=system_prompt
2386
+ )
2387
+ if not response.get('success') or 'id' not in response:
2388
+ return response
2389
+
2390
+ job_id = response['id']
2391
+ last_activity_count = 0
2392
+ last_source_count = 0
2393
+
2394
+ while True:
2395
+ status = self.check_deep_research_status(job_id)
2396
+
2397
+ if on_activity and 'activities' in status:
2398
+ new_activities = status['activities'][last_activity_count:]
2399
+ for activity in new_activities:
2400
+ on_activity(activity)
2401
+ last_activity_count = len(status['activities'])
2402
+
2403
+ if on_source and 'sources' in status:
2404
+ new_sources = status['sources'][last_source_count:]
2405
+ for source in new_sources:
2406
+ on_source(source)
2407
+ last_source_count = len(status['sources'])
2408
+
2409
+ if status['status'] == 'completed':
2410
+ return status
2411
+ elif status['status'] == 'failed':
2412
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
2413
+ elif status['status'] != 'processing':
2414
+ break
2415
+
2416
+ time.sleep(2) # Polling interval
2417
+
2418
+ return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
2419
+
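# Usage sketch: deep_research polls until the job finishes and can stream
# progress through the optional callbacks. The query is a placeholder; `app` is
# an assumed FirecrawlApp instance, and the result is the raw status dict.
#
# def log_activity(activity):
#     print(f"[{activity.get('type')}] {activity.get('message')}")
#
# research = app.deep_research(
#     "What are the main approaches to retrieval-augmented generation?",
#     max_depth=3,
#     time_limit=120,
#     on_activity=log_activity,
# )
# if research.get("success"):
#     print(research.get("data"))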
2420
+ def async_deep_research(
2421
+ self,
2422
+ query: str,
2423
+ *,
2424
+ max_depth: Optional[int] = None,
2425
+ time_limit: Optional[int] = None,
2426
+ max_urls: Optional[int] = None,
2427
+ analysis_prompt: Optional[str] = None,
2428
+ system_prompt: Optional[str] = None,
2429
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
2430
+ """
2431
+ Initiates an asynchronous deep research operation.
2432
+
2433
+ Args:
2434
+ query (str): Research query or topic to investigate
2435
+ max_depth (Optional[int]): Maximum depth of research exploration
2436
+ time_limit (Optional[int]): Time limit in seconds for research
2437
+ max_urls (Optional[int]): Maximum number of URLs to process
2438
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2439
+ system_prompt (Optional[str]): Custom system prompt
2440
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2441
+
2442
+ Returns:
2443
+ Dict[str, Any]: A response containing:
2444
+ * success (bool): Whether the research initiation was successful
2445
+ * id (str): The unique identifier for the research job
2446
+ * error (str, optional): Error message if initiation failed
2447
+
2448
+ Raises:
2449
+ Exception: If the research initiation fails.
2450
+ """
2451
+ research_params = {}
2452
+ if max_depth is not None:
2453
+ research_params['maxDepth'] = max_depth
2454
+ if time_limit is not None:
2455
+ research_params['timeLimit'] = time_limit
2456
+ if max_urls is not None:
2457
+ research_params['maxUrls'] = max_urls
2458
+ if analysis_prompt is not None:
2459
+ research_params['analysisPrompt'] = analysis_prompt
2460
+ if system_prompt is not None:
2461
+ research_params['systemPrompt'] = system_prompt
2462
+ if __experimental_stream_steps is not None:
2463
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2464
+ research_params = DeepResearchParams(**research_params)
2465
+
2466
+ headers = self._prepare_headers()
2467
+
2468
+ json_data = {'query': query, **research_params.dict(exclude_none=True)}
2469
+ json_data['origin'] = f"python-sdk@{version}"
2470
+
2471
+ # Handle json options schema if present
2472
+ if 'jsonOptions' in json_data:
2473
+ json_opts = json_data['jsonOptions']
2474
+ if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
2475
+ json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
2476
+
2477
+ try:
2478
+ response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
2479
+ if response.status_code == 200:
2480
+ try:
2481
+ return response.json()
2482
+ except:
2483
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2484
+ else:
2485
+ self._handle_error(response, 'start deep research')
2486
+ except Exception as e:
2487
+ raise ValueError(str(e))
2488
+
2489
+ return {'success': False, 'error': 'Internal server error'}
2490
+
2491
+ def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
2492
+ """
2493
+ Check the status of a deep research operation.
2494
+
2495
+ Args:
2496
+ id (str): The ID of the deep research operation.
2497
+
2498
+ Returns:
2499
+ DeepResearchStatusResponse containing:
2500
+
2501
+ Status:
2502
+ * success - Whether research completed successfully
2503
+ * status - Current state (processing/completed/failed)
2504
+ * error - Error message if failed
2505
+
2506
+ Results:
2507
+ * id - Unique identifier for the research job
2508
+ * data - Research findings and analysis
2509
+ * sources - List of discovered sources
2510
+ * activities - Research progress log
2511
+ * summaries - Generated research summaries
2512
+
2513
+ Raises:
2514
+ Exception: If the status check fails.
2515
+ """
2516
+ headers = self._prepare_headers()
2517
+ try:
2518
+ response = self._get_request(f'{self.api_url}/v1/deep-research/{id}', headers)
2519
+ if response.status_code == 200:
2520
+ try:
2521
+ return response.json()
2522
+ except:
2523
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2524
+ elif response.status_code == 404:
2525
+ raise Exception('Deep research job not found')
2526
+ else:
2527
+ self._handle_error(response, 'check deep research status')
2528
+ except Exception as e:
2529
+ raise ValueError(str(e))
2530
+
2531
+ return {'success': False, 'error': 'Internal server error'}
2532
+
2533
+ def _validate_kwargs(self, kwargs: Dict[str, Any], method_name: str) -> None:
2534
+ """
2535
+ Validate additional keyword arguments before they are passed to the API.
2536
+ This provides early validation before the Pydantic model validation.
2537
+
2538
+ Args:
2539
+ kwargs (Dict[str, Any]): Additional keyword arguments to validate
2540
+ method_name (str): Name of the method these kwargs are for
2541
+
2542
+ Raises:
2543
+ ValueError: If kwargs contain invalid or unsupported parameters
2544
+ """
2545
+ if not kwargs:
2546
+ return
2547
+
2548
+ # Known parameter mappings for each method
2549
+ method_params = {
2550
+ "scrape_url": {"formats", "include_tags", "exclude_tags", "only_main_content", "wait_for",
2551
+ "timeout", "location", "mobile", "skip_tls_verification", "remove_base64_images",
2552
+ "block_ads", "proxy", "extract", "json_options", "actions", "change_tracking_options"},
2553
+ "search": {"limit", "tbs", "filter", "lang", "country", "location", "timeout", "scrape_options"},
2554
+ "crawl_url": {"include_paths", "exclude_paths", "max_depth", "max_discovery_depth", "limit",
2555
+ "allow_backward_links", "allow_external_links", "ignore_sitemap", "scrape_options",
2556
+ "webhook", "deduplicate_similar_urls", "ignore_query_parameters", "regex_on_full_url"},
2557
+ "map_url": {"search", "ignore_sitemap", "include_subdomains", "sitemap_only", "limit", "timeout"},
2558
+ "batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2559
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2560
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2561
+ "actions", "agent", "webhook"},
2562
+ "async_batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2563
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2564
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2565
+ "actions", "agent", "webhook"},
2566
+ "batch_scrape_urls_and_watch": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2567
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2568
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2569
+ "actions", "agent", "webhook"}
2570
+ }
2571
+
2572
+ # Get allowed parameters for this method
2573
+ allowed_params = method_params.get(method_name, set())
2574
+
2575
+ # Check for unknown parameters
2576
+ unknown_params = set(kwargs.keys()) - allowed_params
2577
+ if unknown_params:
2578
+ raise ValueError(f"Unsupported parameter(s) for {method_name}: {', '.join(unknown_params)}. Please refer to the API documentation for the correct parameters.")
2579
+
2580
+ # Additional type validation can be added here if needed
2581
+ # For now, we rely on Pydantic models for detailed type validation
2582
+
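# Note: _validate_kwargs rejects unknown keyword arguments before any request is
# made, so a typo in an optional parameter fails fast with a ValueError, e.g.:
#
# try:
#     app.async_batch_scrape_urls(["https://example.com"], formmats=["markdown"])
# except ValueError as err:
#     print(err)  # Unsupported parameter(s) for async_batch_scrape_urls: formmats. ...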
2583
+ def _ensure_schema_dict(self, schema):
2584
+ """
2585
+ Utility to ensure a schema is a dict, not a Pydantic model class. Recursively checks dicts and lists.
2586
+ """
2587
+ if schema is None:
2588
+ return schema
2589
+ if isinstance(schema, type):
2590
+ # Pydantic v1/v2 model class
2591
+ if hasattr(schema, 'model_json_schema'):
2592
+ return schema.model_json_schema()
2593
+ elif hasattr(schema, 'schema'):
2594
+ return schema.schema()
2595
+ if isinstance(schema, dict):
2596
+ return {k: self._ensure_schema_dict(v) for k, v in schema.items()}
2597
+ if isinstance(schema, (list, tuple)):
2598
+ return [self._ensure_schema_dict(v) for v in schema]
2599
+ return schema
2600
+
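# Note: _ensure_schema_dict normalizes Pydantic model classes (v1 or v2) into
# plain JSON-schema dicts and recurses into nested dicts/lists, so a model class
# can be passed anywhere a schema is expected. Rough illustration:
#
# from pydantic import BaseModel
#
# class Item(BaseModel):
#     name: str
#
# Item is converted to roughly
# {'title': 'Item', 'type': 'object', 'properties': {'name': {...}}, 'required': ['name']}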
2601
+ class CrawlWatcher:
2602
+ """
2603
+ A class to watch and handle crawl job events via WebSocket connection.
2604
+
2605
+ Attributes:
2606
+ id (str): The ID of the crawl job to watch
2607
+ app (FirecrawlApp): The FirecrawlApp instance
2608
+ data (List[Dict[str, Any]]): List of crawled documents/data
2609
+ status (str): Current status of the crawl job
2610
+ ws_url (str): WebSocket URL for the crawl job
2611
+ event_handlers (dict): Dictionary of event type to list of handler functions
2612
+ """
2613
+ def __init__(self, id: str, app: FirecrawlApp):
2614
+ self.id = id
2615
+ self.app = app
2616
+ self.data: List[Dict[str, Any]] = []
2617
+ self.status = "scraping"
2618
+ self.ws_url = f"{app.api_url.replace('http', 'ws')}/v1/crawl/{id}"
2619
+ self.event_handlers = {
2620
+ 'done': [],
2621
+ 'error': [],
2622
+ 'document': []
2623
+ }
2624
+
2625
+ async def connect(self) -> None:
2626
+ """
2627
+ Establishes WebSocket connection and starts listening for messages.
2628
+ """
2629
+ async with websockets.connect(
2630
+ self.ws_url,
2631
+ max_size=None,
2632
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
2633
+ ) as websocket:
2634
+ await self._listen(websocket)
2635
+
2636
+ async def _listen(self, websocket) -> None:
2637
+ """
2638
+ Listens for incoming WebSocket messages and handles them.
2639
+
2640
+ Args:
2641
+ websocket: The WebSocket connection object
2642
+ """
2643
+ async for message in websocket:
2644
+ msg = json.loads(message)
2645
+ await self._handle_message(msg)
2646
+
2647
+ def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
2648
+ """
2649
+ Adds an event handler function for a specific event type.
2650
+
2651
+ Args:
2652
+ event_type (str): Type of event to listen for ('done', 'error', or 'document')
2653
+ handler (Callable): Function to handle the event
2654
+ """
2655
+ if event_type in self.event_handlers:
2656
+ self.event_handlers[event_type].append(handler)
2657
+
2658
+ def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
2659
+ """
2660
+ Dispatches an event to all registered handlers for that event type.
2661
+
2662
+ Args:
2663
+ event_type (str): Type of event to dispatch
2664
+ detail (Dict[str, Any]): Event details/data to pass to handlers
2665
+ """
2666
+ if event_type in self.event_handlers:
2667
+ for handler in self.event_handlers[event_type]:
2668
+ handler(detail)
2669
+
2670
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
2671
+ """
2672
+ Handles incoming WebSocket messages based on their type.
2673
+
2674
+ Args:
2675
+ msg (Dict[str, Any]): The message to handle
2676
+ """
2677
+ if msg['type'] == 'done':
2678
+ self.status = 'completed'
2679
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
2680
+ elif msg['type'] == 'error':
2681
+ self.status = 'failed'
2682
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
2683
+ elif msg['type'] == 'catchup':
2684
+ self.status = msg['data']['status']
2685
+ self.data.extend(msg['data'].get('data', []))
2686
+ for doc in self.data:
2687
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
2688
+ elif msg['type'] == 'document':
2689
+ self.data.append(msg['data'])
2690
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
2691
+
2692
+ class AsyncFirecrawlApp(FirecrawlApp):
2693
+ """
2694
+ Asynchronous version of FirecrawlApp that implements async methods using aiohttp.
2695
+ Provides non-blocking alternatives to all FirecrawlApp operations.
2696
+ """
2697
+
2698
+ async def _async_request(
2699
+ self,
2700
+ method: str,
2701
+ url: str,
2702
+ headers: Dict[str, str],
2703
+ data: Optional[Dict[str, Any]] = None,
2704
+ retries: int = 3,
2705
+ backoff_factor: float = 0.5) -> Dict[str, Any]:
2706
+ """
2707
+ Generic async request method with exponential backoff retry logic.
2708
+
2709
+ Args:
2710
+ method (str): The HTTP method to use (e.g., "GET" or "POST").
2711
+ url (str): The URL to send the request to.
2712
+ headers (Dict[str, str]): Headers to include in the request.
2713
+ data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
2714
+ retries (int): Maximum number of retry attempts (default: 3).
2715
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2716
+ Delay will be backoff_factor * (2 ** retry_count).
2717
+
2718
+ Returns:
2719
+ Dict[str, Any]: The parsed JSON response from the server.
2720
+
2721
+ Raises:
2722
+ aiohttp.ClientError: If the request fails after all retries.
2723
+ Exception: If max retries are exceeded or other errors occur.
2724
+ """
2725
+ async with aiohttp.ClientSession() as session:
2726
+ for attempt in range(retries):
2727
+ try:
2728
+ async with session.request(
2729
+ method=method, url=url, headers=headers, json=data
2730
+ ) as response:
2731
+ if response.status == 502:
2732
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2733
+ continue
2734
+ if response.status >= 300:
2735
+ await self._handle_error(response, f"make {method} request")
2736
+ return await response.json()
2737
+ except aiohttp.ClientError as e:
2738
+ if attempt == retries - 1:
2739
+ raise e
2740
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2741
+ raise Exception("Max retries exceeded")
2742
+
2743
+ async def _async_post_request(
2744
+ self, url: str, data: Dict[str, Any], headers: Dict[str, str],
2745
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2746
+ """
2747
+ Make an async POST request with exponential backoff retry logic.
2748
+
2749
+ Args:
2750
+ url (str): The URL to send the POST request to.
2751
+ data (Dict[str, Any]): The JSON data to include in the request body.
2752
+ headers (Dict[str, str]): Headers to include in the request.
2753
+ retries (int): Maximum number of retry attempts (default: 3).
2754
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2755
+ Delay will be backoff_factor * (2 ** retry_count).
2756
+
2757
+ Returns:
2758
+ Dict[str, Any]: The parsed JSON response from the server.
2759
+
2760
+ Raises:
2761
+ aiohttp.ClientError: If the request fails after all retries.
2762
+ Exception: If max retries are exceeded or other errors occur.
2763
+ """
2764
+ return await self._async_request("POST", url, headers, data, retries, backoff_factor)
2765
+
2766
+ async def _async_get_request(
2767
+ self, url: str, headers: Dict[str, str],
2768
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2769
+ """
2770
+ Make an async GET request with exponential backoff retry logic.
2771
+
2772
+ Args:
2773
+ url (str): The URL to send the GET request to.
2774
+ headers (Dict[str, str]): Headers to include in the request.
2775
+ retries (int): Maximum number of retry attempts (default: 3).
2776
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2777
+ Delay will be backoff_factor * (2 ** retry_count).
2778
+
2779
+ Returns:
2780
+ Dict[str, Any]: The parsed JSON response from the server.
2781
+
2782
+ Raises:
2783
+ aiohttp.ClientError: If the request fails after all retries.
2784
+ Exception: If max retries are exceeded or other errors occur.
2785
+ """
2786
+ return await self._async_request("GET", url, headers, None, retries, backoff_factor)
2787
+
2788
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
2789
+ """
2790
+ Handle errors from async API responses with detailed error messages.
2791
+
2792
+ Args:
2793
+ response (aiohttp.ClientResponse): The response object from the failed request
2794
+ action (str): Description of the action that was being attempted
2795
+
2796
+ Raises:
2797
+ aiohttp.ClientError: With a detailed error message based on the response status:
2798
+ - 402: Payment Required
2799
+ - 408: Request Timeout
2800
+ - 409: Conflict
2801
+ - 500: Internal Server Error
2802
+ - Other: Unexpected error with status code
2803
+ """
2804
+ try:
2805
+ error_data = await response.json()
2806
+ error_message = error_data.get('error', 'No error message provided.')
2807
+ error_details = error_data.get('details', 'No additional error details provided.')
2808
+ except:
2809
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
2810
+
2811
+ message = await self._get_async_error_message(response.status, action, error_message, error_details)
2812
+
2813
+ raise aiohttp.ClientError(message)
2814
+
2815
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2816
+ """
2817
+ Generate a standardized error message based on HTTP status code for async operations.
2818
+
2819
+ Args:
2820
+ status_code (int): The HTTP status code from the response
2821
+ action (str): Description of the action that was being performed
2822
+ error_message (str): The error message from the API response
2823
+ error_details (str): Additional error details from the API response
2824
+
2825
+ Returns:
2826
+ str: A formatted error message
2827
+ """
2828
+ return self._get_error_message(status_code, action, error_message, error_details)
2829
+
2830
+ async def crawl_url_and_watch(
2831
+ self,
2832
+ url: str,
2833
+ params: Optional[CrawlParams] = None,
2834
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2835
+ """
2836
+ Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket.
2837
+
2838
+ Args:
2839
+ url (str): Target URL to start crawling from
2840
+ params (Optional[CrawlParams]): See CrawlParams model for configuration:
2841
+ URL Discovery:
2842
+ * includePaths - Patterns of URLs to include
2843
+ * excludePaths - Patterns of URLs to exclude
2844
+ * maxDepth - Maximum crawl depth
2845
+ * maxDiscoveryDepth - Maximum depth for finding new URLs
2846
+ * limit - Maximum pages to crawl
2847
+
2848
+ Link Following:
2849
+ * allowBackwardLinks - DEPRECATED: Use crawlEntireDomain instead
2850
+ * crawlEntireDomain - Follow parent directory links
2851
+ * allowExternalLinks - Follow external domain links
2852
+ * ignoreSitemap - Skip sitemap.xml processing
2853
+
2854
+ Advanced:
2855
+ * scrapeOptions - Page scraping configuration
2856
+ * webhook - Notification webhook settings
2857
+ * deduplicateSimilarURLs - Remove similar URLs
2858
+ * ignoreQueryParameters - Ignore URL parameters
2859
+ * regexOnFullURL - Apply regex to full URLs
2860
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2861
+
2862
+ Returns:
2863
+ AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket
2864
+
2865
+ Raises:
2866
+ Exception: If crawl job fails to start
2867
+ """
2868
+ crawl_response = await self.async_crawl_url(url, **(params.dict(exclude_none=True) if params else {}), idempotency_key=idempotency_key)
2869
+ if crawl_response.get('success') and 'id' in crawl_response:
2870
+ return AsyncCrawlWatcher(crawl_response['id'], self)
2871
+ else:
2872
+ raise Exception("Crawl job failed to start")
2873
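A hedged usage sketch for the watcher flow above. It assumes the async client is exported as AsyncFirecrawlApp and that AsyncCrawlWatcher exposes add_event_listener() and an awaitable connect(), mirroring the synchronous watcher; treat these names as illustrative rather than definitive.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name; adjust to your install

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
    watcher = await app.crawl_url_and_watch("https://example.com")

    # Assumed event API, mirroring the synchronous CrawlWatcher.
    watcher.add_event_listener("document", lambda doc: print("got page:", doc))
    watcher.add_event_listener("done", lambda state: print("crawl finished"))

    await watcher.connect()  # stream crawl events over WebSocket until the job completes

asyncio.run(main())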
+
2874
+ async def batch_scrape_urls_and_watch(
2875
+ self,
2876
+ urls: List[str],
2877
+ params: Optional[ScrapeParams] = None,
2878
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2879
+ """
2880
+ Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress.
2881
+
2882
+ Args:
2883
+ urls (List[str]): List of URLs to scrape
2884
+ params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
2885
+
2886
+ Content Options:
2887
+ * formats - Content formats to retrieve
2888
+ * includeTags - HTML tags to include
2889
+ * excludeTags - HTML tags to exclude
2890
+ * onlyMainContent - Extract main content only
2891
+
2892
+ Request Options:
2893
+ * headers - Custom HTTP headers
2894
+ * timeout - Request timeout (ms)
2895
+ * mobile - Use mobile user agent
2896
+ * proxy - Proxy type
2897
+
2898
+ Extraction Options:
2899
+ * extract - Content extraction config
2900
+ * jsonOptions - JSON extraction config
2901
+ * actions - Actions to perform
2902
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2903
+
2904
+ Returns:
2905
+ AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket
2906
+
2907
+ Raises:
2908
+ Exception: If batch scrape job fails to start
2909
+ """
2910
+ batch_response = await self.async_batch_scrape_urls(urls, **(params.dict(exclude_none=True) if params else {}), idempotency_key=idempotency_key)
2911
+ if batch_response.get('success') and 'id' in batch_response:
2912
+ return AsyncCrawlWatcher(batch_response['id'], self)
2913
+ else:
2914
+ raise Exception("Batch scrape job failed to start")
2915
+
2916
+ async def scrape_url(
2917
+ self,
2918
+ url: str,
2919
+ *,
2920
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
2921
+ include_tags: Optional[List[str]] = None,
2922
+ exclude_tags: Optional[List[str]] = None,
2923
+ only_main_content: Optional[bool] = None,
2924
+ wait_for: Optional[int] = None,
2925
+ timeout: Optional[int] = None,
2926
+ location: Optional[LocationConfig] = None,
2927
+ mobile: Optional[bool] = None,
2928
+ skip_tls_verification: Optional[bool] = None,
2929
+ remove_base64_images: Optional[bool] = None,
2930
+ block_ads: Optional[bool] = None,
2931
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
2932
+ parse_pdf: Optional[bool] = None,
2933
+ extract: Optional[JsonConfig] = None,
2934
+ json_options: Optional[JsonConfig] = None,
2935
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
2936
+ **kwargs) -> ScrapeResponse[Any]:
2937
+ """
2938
+ Scrape a single URL asynchronously.
2939
+
2940
+ Args:
2941
+ url (str): Target URL to scrape
2942
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]]): Content types to retrieve (markdown/html/etc)
2943
+ include_tags (Optional[List[str]]): HTML tags to include
2944
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
2945
+ only_main_content (Optional[bool]): Extract main content only
2946
+ wait_for (Optional[int]): Milliseconds to wait for dynamic content to load before scraping
2947
+ timeout (Optional[int]): Request timeout (ms)
2948
+ location (Optional[LocationConfig]): Location configuration
2949
+ mobile (Optional[bool]): Use mobile user agent
2950
+ skip_tls_verification (Optional[bool]): Skip TLS verification
2951
+ remove_base64_images (Optional[bool]): Remove base64 images
2952
+ block_ads (Optional[bool]): Block ads
2953
+ proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth/auto)
+ parse_pdf (Optional[bool]): Whether to parse PDF content
2954
+ extract (Optional[JsonConfig]): Content extraction settings
2955
+ json_options (Optional[JsonConfig]): JSON extraction settings
2956
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
2957
+ **kwargs: Additional parameters to pass to the API
2958
+
2959
+ Returns:
2960
+ ScrapeResponse with:
2961
+ * success - Whether scrape was successful
2962
+ * markdown - Markdown content if requested
2963
+ * html - HTML content if requested
2964
+ * rawHtml - Raw HTML content if requested
2965
+ * links - Extracted links if requested
2966
+ * screenshot - Screenshot if requested
2967
+ * extract - Extracted data if requested
2968
+ * json - JSON data if requested
2969
+ * error - Error message if scrape failed
2970
+
2971
+ Raises:
2972
+ Exception: If scraping fails
2973
+ """
2974
+ # Validate any additional kwargs
2975
+ self._validate_kwargs(kwargs, "scrape_url")
2976
+
2977
+ headers = self._prepare_headers()
2978
+
2979
+ # Build scrape parameters
2980
+ scrape_params = {
2981
+ 'url': url,
2982
+ 'origin': f"python-sdk@{version}"
2983
+ }
2984
+
2985
+ # Add optional parameters if provided and not None
2986
+ if formats:
2987
+ scrape_params['formats'] = formats
2988
+ if include_tags:
2989
+ scrape_params['includeTags'] = include_tags
2990
+ if exclude_tags:
2991
+ scrape_params['excludeTags'] = exclude_tags
2992
+ if only_main_content is not None:
2993
+ scrape_params['onlyMainContent'] = only_main_content
2994
+ if wait_for:
2995
+ scrape_params['waitFor'] = wait_for
2996
+ if timeout:
2997
+ scrape_params['timeout'] = timeout
2998
+ if location:
2999
+ scrape_params['location'] = location.dict(exclude_none=True)
3000
+ if mobile is not None:
3001
+ scrape_params['mobile'] = mobile
3002
+ if skip_tls_verification is not None:
3003
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3004
+ if remove_base64_images is not None:
3005
+ scrape_params['removeBase64Images'] = remove_base64_images
3006
+ if block_ads is not None:
3007
+ scrape_params['blockAds'] = block_ads
3008
+ if proxy:
3009
+ scrape_params['proxy'] = proxy
3010
+ if parse_pdf is not None:
3011
+ scrape_params['parsePDF'] = parse_pdf
3012
+ if extract is not None:
3013
+ extract = self._ensure_schema_dict(extract)
3014
+ if isinstance(extract, dict) and "schema" in extract:
3015
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
3016
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3017
+ if json_options is not None:
3018
+ json_options = self._ensure_schema_dict(json_options)
3019
+ if isinstance(json_options, dict) and "schema" in json_options:
3020
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3021
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3022
+ if actions:
3023
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
3024
+
3025
+ if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
3026
+ scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
3027
+ if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
3028
+ scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
3029
+
3030
+ # Make async request
3031
+ endpoint = f'/v1/scrape'
3032
+ response = await self._async_post_request(
3033
+ f'{self.api_url}{endpoint}',
3034
+ scrape_params,
3035
+ headers
3036
+ )
3037
+
3038
+ if response.get('success') and 'data' in response:
3039
+ return ScrapeResponse(**response['data'])
3040
+ elif "error" in response:
3041
+ raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
3042
+ else:
3043
+ # Use the response content directly if possible, otherwise a generic message
3044
+ error_content = response.get('error', str(response))
3045
+ raise Exception(f'Failed to scrape URL. Error: {error_content}')
3046
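A minimal usage sketch for the async scrape_url above. The client export name (AsyncFirecrawlApp), API key, and URL are placeholders/assumptions.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    # Request markdown plus the page's outgoing links, main content only.
    doc = await app.scrape_url(
        "https://example.com",
        formats=["markdown", "links"],
        only_main_content=True,
    )
    print(doc.markdown)
    print(doc.links)

asyncio.run(main())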
+
3047
+ async def batch_scrape_urls(
3048
+ self,
3049
+ urls: List[str],
3050
+ *,
3051
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3052
+ headers: Optional[Dict[str, str]] = None,
3053
+ include_tags: Optional[List[str]] = None,
3054
+ exclude_tags: Optional[List[str]] = None,
3055
+ only_main_content: Optional[bool] = None,
3056
+ wait_for: Optional[int] = None,
3057
+ timeout: Optional[int] = None,
3058
+ location: Optional[LocationConfig] = None,
3059
+ mobile: Optional[bool] = None,
3060
+ skip_tls_verification: Optional[bool] = None,
3061
+ remove_base64_images: Optional[bool] = None,
3062
+ block_ads: Optional[bool] = None,
3063
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
3064
+ extract: Optional[JsonConfig] = None,
3065
+ json_options: Optional[JsonConfig] = None,
3066
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
3067
+ agent: Optional[AgentOptions] = None,
3068
+ poll_interval: Optional[int] = 2,
3069
+ idempotency_key: Optional[str] = None,
3070
+ **kwargs
3071
+ ) -> BatchScrapeStatusResponse:
3072
+ """
3073
+ Asynchronously scrape multiple URLs and monitor until completion.
3074
+
3075
+ Args:
3076
+ urls (List[str]): URLs to scrape
3077
+ formats (Optional[List[Literal]]): Content formats to retrieve
3078
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
3079
+ include_tags (Optional[List[str]]): HTML tags to include
3080
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
3081
+ only_main_content (Optional[bool]): Extract main content only
3082
+ wait_for (Optional[int]): Wait time in milliseconds
3083
+ timeout (Optional[int]): Request timeout in milliseconds
3084
+ location (Optional[LocationConfig]): Location configuration
3085
+ mobile (Optional[bool]): Use mobile user agent
3086
+ skip_tls_verification (Optional[bool]): Skip TLS verification
3087
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
3088
+ block_ads (Optional[bool]): Block advertisements
3089
+ proxy (Optional[Literal]): Proxy type to use
3090
+ extract (Optional[JsonConfig]): Content extraction config
3091
+ json_options (Optional[JsonConfig]): JSON extraction config
3092
+ actions (Optional[List[Union]]): Actions to perform
3093
+ agent (Optional[AgentOptions]): Agent configuration
3094
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
3095
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3096
+ **kwargs: Additional parameters to pass to the API
3097
+
3098
+ Returns:
3099
+ BatchScrapeStatusResponse with:
3100
+ * Scraping status and progress
3101
+ * Scraped content for each URL
3102
+ * Success/error information
3103
+
3104
+ Raises:
3105
+ Exception: If batch scrape fails
3106
+ """
3107
+ # Validate any additional kwargs
3108
+ self._validate_kwargs(kwargs, "batch_scrape_urls")
3109
+
3110
+ scrape_params = {}
3111
+
3112
+ # Add individual parameters
3113
+ if formats is not None:
3114
+ scrape_params['formats'] = formats
3115
+ if headers is not None:
3116
+ scrape_params['headers'] = headers
3117
+ if include_tags is not None:
3118
+ scrape_params['includeTags'] = include_tags
3119
+ if exclude_tags is not None:
3120
+ scrape_params['excludeTags'] = exclude_tags
3121
+ if only_main_content is not None:
3122
+ scrape_params['onlyMainContent'] = only_main_content
3123
+ if wait_for is not None:
3124
+ scrape_params['waitFor'] = wait_for
3125
+ if timeout is not None:
3126
+ scrape_params['timeout'] = timeout
3127
+ if location is not None:
3128
+ scrape_params['location'] = location.dict(exclude_none=True)
3129
+ if mobile is not None:
3130
+ scrape_params['mobile'] = mobile
3131
+ if skip_tls_verification is not None:
3132
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3133
+ if remove_base64_images is not None:
3134
+ scrape_params['removeBase64Images'] = remove_base64_images
3135
+ if block_ads is not None:
3136
+ scrape_params['blockAds'] = block_ads
3137
+ if proxy is not None:
3138
+ scrape_params['proxy'] = proxy
3139
+ if extract is not None:
3140
+ extract = self._ensure_schema_dict(extract)
3141
+ if isinstance(extract, dict) and "schema" in extract:
3142
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
3143
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3144
+ if json_options is not None:
3145
+ json_options = self._ensure_schema_dict(json_options)
3146
+ if isinstance(json_options, dict) and "schema" in json_options:
3147
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3148
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3149
+ if actions is not None:
3150
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3151
+ if agent is not None:
3152
+ scrape_params['agent'] = agent.dict(exclude_none=True)
3153
+
3154
+ # Add any additional kwargs
3155
+ scrape_params.update(kwargs)
3156
+
3157
+ # Create final params object
3158
+ final_params = ScrapeParams(**scrape_params)
3159
+ params_dict = final_params.dict(exclude_none=True)
3160
+ params_dict['urls'] = urls
3161
+ params_dict['origin'] = f"python-sdk@{version}"
3162
+
3163
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3164
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3165
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3166
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3167
+
3168
+ # Make request
3169
+ headers = self._prepare_headers(idempotency_key)
3170
+ response = await self._async_post_request(
3171
+ f'{self.api_url}/v1/batch/scrape',
3172
+ params_dict,
3173
+ headers
3174
+ )
3175
+
3176
+ if response.get('success') and 'id' in response:
+ return await self._async_monitor_job_status(response['id'], headers, poll_interval)
+ else:
+ raise Exception(response.get('error', 'Failed to start batch scrape job'))
3184
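A sketch of the blocking batch flow above: submit the URLs, let the SDK poll every poll_interval seconds, and receive the finished job. Client name, key, and URLs are assumptions.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    job = await app.batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
        poll_interval=2,  # seconds between status checks
    )
    print(job.status, f"{job.completed}/{job.total} pages scraped")
    for doc in job.data or []:
        print((doc.markdown or "")[:80])  # first characters of each scraped page

asyncio.run(main())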
+
3185
+
3186
+ async def async_batch_scrape_urls(
3187
+ self,
3188
+ urls: List[str],
3189
+ *,
3190
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3191
+ headers: Optional[Dict[str, str]] = None,
3192
+ include_tags: Optional[List[str]] = None,
3193
+ exclude_tags: Optional[List[str]] = None,
3194
+ only_main_content: Optional[bool] = None,
3195
+ wait_for: Optional[int] = None,
3196
+ timeout: Optional[int] = None,
3197
+ location: Optional[LocationConfig] = None,
3198
+ mobile: Optional[bool] = None,
3199
+ skip_tls_verification: Optional[bool] = None,
3200
+ remove_base64_images: Optional[bool] = None,
3201
+ block_ads: Optional[bool] = None,
3202
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
3203
+ extract: Optional[JsonConfig] = None,
3204
+ json_options: Optional[JsonConfig] = None,
3205
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
3206
+ agent: Optional[AgentOptions] = None,
3207
+ idempotency_key: Optional[str] = None,
3208
+ **kwargs
3209
+ ) -> BatchScrapeResponse:
3210
+ """
3211
+ Initiate a batch scrape job asynchronously.
3212
+
3213
+ Args:
3214
+ urls (List[str]): URLs to scrape
3215
+ formats (Optional[List[Literal]]): Content formats to retrieve
3216
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
3217
+ include_tags (Optional[List[str]]): HTML tags to include
3218
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
3219
+ only_main_content (Optional[bool]): Extract main content only
3220
+ wait_for (Optional[int]): Wait time in milliseconds
3221
+ timeout (Optional[int]): Request timeout in milliseconds
3222
+ location (Optional[LocationConfig]): Location configuration
3223
+ mobile (Optional[bool]): Use mobile user agent
3224
+ skip_tls_verification (Optional[bool]): Skip TLS verification
3225
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
3226
+ block_ads (Optional[bool]): Block advertisements
3227
+ proxy (Optional[Literal]): Proxy type to use
3228
+ extract (Optional[JsonConfig]): Content extraction config
3229
+ json_options (Optional[JsonConfig]): JSON extraction config
3230
+ actions (Optional[List[Union]]): Actions to perform
3231
+ agent (Optional[AgentOptions]): Agent configuration
3232
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3233
+ **kwargs: Additional parameters to pass to the API
3234
+
3235
+ Returns:
3236
+ BatchScrapeResponse with:
3237
+ * success - Whether job started successfully
3238
+ * id - Unique identifier for the job
3239
+ * url - Status check URL
3240
+ * error - Error message if start failed
3241
+
3242
+ Raises:
3243
+ Exception: If job initiation fails
3244
+ """
3245
+ # Validate any additional kwargs
3246
+ self._validate_kwargs(kwargs, "async_batch_scrape_urls")
3247
+
3248
+ scrape_params = {}
3249
+
3250
+ # Add individual parameters
3251
+ if formats is not None:
3252
+ scrape_params['formats'] = formats
3253
+ if headers is not None:
3254
+ scrape_params['headers'] = headers
3255
+ if include_tags is not None:
3256
+ scrape_params['includeTags'] = include_tags
3257
+ if exclude_tags is not None:
3258
+ scrape_params['excludeTags'] = exclude_tags
3259
+ if only_main_content is not None:
3260
+ scrape_params['onlyMainContent'] = only_main_content
3261
+ if wait_for is not None:
3262
+ scrape_params['waitFor'] = wait_for
3263
+ if timeout is not None:
3264
+ scrape_params['timeout'] = timeout
3265
+ if location is not None:
3266
+ scrape_params['location'] = location.dict(exclude_none=True)
3267
+ if mobile is not None:
3268
+ scrape_params['mobile'] = mobile
3269
+ if skip_tls_verification is not None:
3270
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3271
+ if remove_base64_images is not None:
3272
+ scrape_params['removeBase64Images'] = remove_base64_images
3273
+ if block_ads is not None:
3274
+ scrape_params['blockAds'] = block_ads
3275
+ if proxy is not None:
3276
+ scrape_params['proxy'] = proxy
3277
+ if extract is not None:
3278
+ extract = self._ensure_schema_dict(extract)
3279
+ if isinstance(extract, dict) and "schema" in extract:
3280
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
3281
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3282
+ if json_options is not None:
3283
+ json_options = self._ensure_schema_dict(json_options)
3284
+ if isinstance(json_options, dict) and "schema" in json_options:
3285
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3286
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3287
+ if actions is not None:
3288
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3289
+ if agent is not None:
3290
+ scrape_params['agent'] = agent.dict(exclude_none=True)
3291
+
3292
+ # Add any additional kwargs
3293
+ scrape_params.update(kwargs)
3294
+
3295
+ # Create final params object
3296
+ final_params = ScrapeParams(**scrape_params)
3297
+ params_dict = final_params.dict(exclude_none=True)
3298
+ params_dict['urls'] = urls
3299
+ params_dict['origin'] = f"python-sdk@{version}"
3300
+
3301
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3302
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3303
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3304
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3305
+
3306
+ # Make request
3307
+ headers = self._prepare_headers(idempotency_key)
3308
+ response = await self._async_post_request(
3309
+ f'{self.api_url}/v1/batch/scrape',
3310
+ params_dict,
3311
+ headers
3312
+ )
3313
+
3314
+ if response.get('success'):
3315
+ try:
3316
+ return BatchScrapeResponse(**response)
3317
+ except Exception:
3318
+ raise Exception('Failed to parse Firecrawl response as JSON.')
3319
+ else:
3320
+ raise Exception(response.get('error', 'Failed to start batch scrape job'))
3321
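A non-blocking variant, sketched under the same assumptions: start the job with async_batch_scrape_urls, then poll check_batch_scrape_status yourself.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    started = await app.async_batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
    )
    print("job id:", started.id)

    # Poll until the batch reaches a terminal state.
    while True:
        status = await app.check_batch_scrape_status(started.id)
        print(status.status, f"{status.completed}/{status.total}")
        if status.status in ("completed", "failed", "cancelled"):
            break
        await asyncio.sleep(2)

asyncio.run(main())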
+
3322
+ async def crawl_url(
3323
+ self,
3324
+ url: str,
3325
+ *,
3326
+ include_paths: Optional[List[str]] = None,
3327
+ exclude_paths: Optional[List[str]] = None,
3328
+ max_depth: Optional[int] = None,
3329
+ max_discovery_depth: Optional[int] = None,
3330
+ limit: Optional[int] = None,
3331
+ allow_backward_links: Optional[bool] = None,
3332
+ crawl_entire_domain: Optional[bool] = None,
3333
+ allow_external_links: Optional[bool] = None,
3334
+ ignore_sitemap: Optional[bool] = None,
3335
+ scrape_options: Optional[ScrapeOptions] = None,
3336
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3337
+ deduplicate_similar_urls: Optional[bool] = None,
3338
+ ignore_query_parameters: Optional[bool] = None,
3339
+ regex_on_full_url: Optional[bool] = None,
3340
+ delay: Optional[int] = None,
3341
+ allow_subdomains: Optional[bool] = None,
3342
+ poll_interval: Optional[int] = 2,
3343
+ idempotency_key: Optional[str] = None,
3344
+ **kwargs
3345
+ ) -> CrawlStatusResponse:
3346
+ """
3347
+ Crawl a website starting from a URL.
3348
+
3349
+ Args:
3350
+ url (str): Target URL to start crawling from
3351
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3352
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3353
+ max_depth (Optional[int]): Maximum crawl depth
3354
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3355
+ limit (Optional[int]): Maximum pages to crawl
3356
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
3357
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
3358
+ allow_external_links (Optional[bool]): Follow external domain links
3359
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3360
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3361
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3362
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3363
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3364
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3365
+ delay (Optional[int]): Delay in seconds between scrapes
3366
+ allow_subdomains (Optional[bool]): Follow subdomains
3367
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
3368
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3369
+ **kwargs: Additional parameters to pass to the API
3370
+
3371
+ Returns:
3372
+ CrawlStatusResponse with:
3373
+ * Crawling status and progress
3374
+ * Crawled page contents
3375
+ * Success/error information
3376
+
3377
+ Raises:
3378
+ Exception: If crawl fails
3379
+ """
3380
+ # Validate any additional kwargs
3381
+ self._validate_kwargs(kwargs, "crawl_url")
3382
+
3383
+ crawl_params = {}
3384
+
3385
+ # Add individual parameters
3386
+ if include_paths is not None:
3387
+ crawl_params['includePaths'] = include_paths
3388
+ if exclude_paths is not None:
3389
+ crawl_params['excludePaths'] = exclude_paths
3390
+ if max_depth is not None:
3391
+ crawl_params['maxDepth'] = max_depth
3392
+ if max_discovery_depth is not None:
3393
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3394
+ if limit is not None:
3395
+ crawl_params['limit'] = limit
3396
+ if crawl_entire_domain is not None:
3397
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
3398
+ elif allow_backward_links is not None:
3399
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3400
+ if allow_external_links is not None:
3401
+ crawl_params['allowExternalLinks'] = allow_external_links
3402
+ if ignore_sitemap is not None:
3403
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3404
+ if scrape_options is not None:
3405
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3406
+ if webhook is not None:
3407
+ crawl_params['webhook'] = webhook
3408
+ if deduplicate_similar_urls is not None:
3409
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3410
+ if ignore_query_parameters is not None:
3411
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3412
+ if regex_on_full_url is not None:
3413
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3414
+ if delay is not None:
3415
+ crawl_params['delay'] = delay
3416
+ if allow_subdomains is not None:
3417
+ crawl_params['allowSubdomains'] = allow_subdomains
3418
+
3419
+ # Add any additional kwargs
3420
+ crawl_params.update(kwargs)
3421
+
3422
+ # Create final params object
3423
+ final_params = CrawlParams(**crawl_params)
3424
+ params_dict = final_params.dict(exclude_none=True)
3425
+ params_dict['url'] = url
3426
+ params_dict['origin'] = f"python-sdk@{version}"
3427
+ # Make request
3428
+ headers = self._prepare_headers(idempotency_key)
3429
+ response = await self._async_post_request(
3430
+ f'{self.api_url}/v1/crawl', params_dict, headers)
3431
+
3432
+ if response.get('success') and 'id' in response:
+ return await self._async_monitor_job_status(response['id'], headers, poll_interval)
+ else:
+ raise Exception(response.get('error', 'Failed to start crawl job'))
3440
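A usage sketch for the waiting crawl above; the client name, key, site, and path pattern are placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    crawl = await app.crawl_url(
        "https://docs.example.com",
        include_paths=["/guides/.*"],  # URL patterns to keep
        limit=25,                      # stop after 25 pages
        poll_interval=2,
    )
    print(crawl.status, f"{crawl.completed}/{crawl.total} pages")
    for doc in crawl.data or []:
        print((doc.markdown or "")[:80])

asyncio.run(main())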
+
3441
+
3442
+ async def async_crawl_url(
3443
+ self,
3444
+ url: str,
3445
+ *,
3446
+ include_paths: Optional[List[str]] = None,
3447
+ exclude_paths: Optional[List[str]] = None,
3448
+ max_depth: Optional[int] = None,
3449
+ max_discovery_depth: Optional[int] = None,
3450
+ limit: Optional[int] = None,
3451
+ allow_backward_links: Optional[bool] = None,
3452
+ crawl_entire_domain: Optional[bool] = None,
3453
+ allow_external_links: Optional[bool] = None,
3454
+ ignore_sitemap: Optional[bool] = None,
3455
+ scrape_options: Optional[ScrapeOptions] = None,
3456
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3457
+ deduplicate_similar_urls: Optional[bool] = None,
3458
+ ignore_query_parameters: Optional[bool] = None,
3459
+ regex_on_full_url: Optional[bool] = None,
3460
+ delay: Optional[int] = None,
3461
+ allow_subdomains: Optional[bool] = None,
3462
+ poll_interval: Optional[int] = 2,
3463
+ idempotency_key: Optional[str] = None,
3464
+ **kwargs
3465
+ ) -> CrawlResponse:
3466
+ """
3467
+ Start an asynchronous crawl job.
3468
+
3469
+ Args:
3470
+ url (str): Target URL to start crawling from
3471
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3472
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3473
+ max_depth (Optional[int]): Maximum crawl depth
3474
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3475
+ limit (Optional[int]): Maximum pages to crawl
3476
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
3477
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
3478
+ allow_external_links (Optional[bool]): Follow external domain links
3479
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3480
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3481
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3482
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3483
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3484
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
+ delay (Optional[int]): Delay in seconds between scrapes
+ allow_subdomains (Optional[bool]): Follow subdomains
3485
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3486
+ **kwargs: Additional parameters to pass to the API
3487
+
3488
+ Returns:
3489
+ CrawlResponse with:
3490
+ * success - Whether crawl started successfully
3491
+ * id - Unique identifier for the crawl job
3492
+ * url - Status check URL for the crawl
3493
+ * error - Error message if start failed
3494
+
3495
+ Raises:
3496
+ Exception: If crawl initiation fails
3497
+ """
3498
+ crawl_params = {}
3499
+
3500
+ # Add individual parameters
3501
+ if include_paths is not None:
3502
+ crawl_params['includePaths'] = include_paths
3503
+ if exclude_paths is not None:
3504
+ crawl_params['excludePaths'] = exclude_paths
3505
+ if max_depth is not None:
3506
+ crawl_params['maxDepth'] = max_depth
3507
+ if max_discovery_depth is not None:
3508
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3509
+ if limit is not None:
3510
+ crawl_params['limit'] = limit
3511
+ if crawl_entire_domain is not None:
3512
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
3513
+ elif allow_backward_links is not None:
3514
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3515
+ if allow_external_links is not None:
3516
+ crawl_params['allowExternalLinks'] = allow_external_links
3517
+ if ignore_sitemap is not None:
3518
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3519
+ if scrape_options is not None:
3520
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3521
+ if webhook is not None:
3522
+ crawl_params['webhook'] = webhook
3523
+ if deduplicate_similar_urls is not None:
3524
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3525
+ if ignore_query_parameters is not None:
3526
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3527
+ if regex_on_full_url is not None:
3528
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3529
+ if delay is not None:
3530
+ crawl_params['delay'] = delay
3531
+ if allow_subdomains is not None:
3532
+ crawl_params['allowSubdomains'] = allow_subdomains
3533
+
3534
+ # Add any additional kwargs
3535
+ crawl_params.update(kwargs)
3536
+
3537
+ # Create final params object
3538
+ final_params = CrawlParams(**crawl_params)
3539
+ params_dict = final_params.dict(exclude_none=True)
3540
+ params_dict['url'] = url
3541
+ params_dict['origin'] = f"python-sdk@{version}"
3542
+
3543
+ # Make request
3544
+ headers = self._prepare_headers(idempotency_key)
3545
+ response = await self._async_post_request(
3546
+ f'{self.api_url}/v1/crawl',
3547
+ params_dict,
3548
+ headers
3549
+ )
3550
+
3551
+ if response.get('success'):
3552
+ try:
3553
+ return CrawlResponse(**response)
3554
+ except Exception:
3555
+ raise Exception('Failed to parse Firecrawl response as JSON.')
3556
+ else:
3557
+ raise Exception(response.get('error', 'Failed to start crawl job'))
3558
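The fire-and-poll counterpart, again a sketch with an assumed client name and key: async_crawl_url returns the job id, and check_crawl_status reports progress until completion.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    started = await app.async_crawl_url("https://docs.example.com", limit=10)
    print("crawl id:", started.id)

    while True:
        status = await app.check_crawl_status(started.id)
        print(status.status, f"{status.completed}/{status.total}")
        if status.status in ("completed", "failed", "cancelled"):
            break
        await asyncio.sleep(2)

asyncio.run(main())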
+
3559
+ async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
3560
+ """
3561
+ Check the status and results of an asynchronous crawl job.
3562
+
3563
+ Args:
3564
+ id (str): Unique identifier for the crawl job
3565
+
3566
+ Returns:
3567
+ CrawlStatusResponse containing:
3568
+ Status Information:
3569
+ * status - Current state (scraping/completed/failed/cancelled)
3570
+ * completed - Number of pages crawled
3571
+ * total - Total pages to crawl
3572
+ * creditsUsed - API credits consumed
3573
+ * expiresAt - Data expiration timestamp
3574
+
3575
+ Results:
3576
+ * data - List of crawled documents
3577
+ * next - URL for next page of results (if paginated)
3578
+ * success - Whether status check succeeded
3579
+ * error - Error message if failed
3580
+
3581
+ Raises:
3582
+ Exception: If status check fails
3583
+ """
3584
+ headers = self._prepare_headers()
3585
+ endpoint = f'/v1/crawl/{id}'
3586
+
3587
+ status_data = await self._async_get_request(
3588
+ f'{self.api_url}{endpoint}',
3589
+ headers
3590
+ )
3591
+
3592
+ if status_data.get('status') == 'completed':
3593
+ if 'data' in status_data:
3594
+ data = status_data['data']
3595
+ while 'next' in status_data:
3596
+ if len(status_data['data']) == 0:
3597
+ break
3598
+ next_url = status_data.get('next')
3599
+ if not next_url:
3600
+ logger.warning("Expected 'next' URL is missing.")
3601
+ break
3602
+ next_data = await self._async_get_request(next_url, headers)
3603
+ data.extend(next_data.get('data', []))
3604
+ status_data = next_data
3605
+ status_data['data'] = data
3606
+ # Create CrawlStatusResponse object from status data
3607
+ response = CrawlStatusResponse(
3608
+ status=status_data.get('status'),
3609
+ total=status_data.get('total'),
3610
+ completed=status_data.get('completed'),
3611
+ creditsUsed=status_data.get('creditsUsed'),
3612
+ expiresAt=status_data.get('expiresAt'),
3613
+ data=status_data.get('data'),
3614
+ success=False if 'error' in status_data else True
3615
+ )
3616
+
3617
+ if 'error' in status_data:
3618
+ response.error = status_data.get('error')
3619
+
3620
+ if 'next' in status_data:
3621
+ response.next = status_data.get('next')
3622
+
3623
+ return response
3624
+
3625
+ async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
3626
+ """
3627
+ Monitor the status of an asynchronous job until completion.
3628
+
3629
+ Args:
3630
+ id (str): The ID of the job to monitor
3631
+ headers (Dict[str, str]): Headers to include in status check requests
3632
+ poll_interval (int): Seconds between status checks (default: 2)
3633
+
3634
+ Returns:
3635
+ CrawlStatusResponse: The job results if completed successfully
3636
+
3637
+ Raises:
3638
+ Exception: If the job fails or an error occurs during status checks
3639
+ """
3640
+ while True:
3641
+ status_data = await self._async_get_request(
3642
+ f'{self.api_url}/v1/crawl/{id}',
3643
+ headers
3644
+ )
3645
+
3646
+ if status_data.get('status') == 'completed':
3647
+ if 'data' in status_data:
3648
+ data = status_data['data']
3649
+ while 'next' in status_data:
3650
+ if len(status_data['data']) == 0:
3651
+ break
3652
+ next_url = status_data.get('next')
3653
+ if not next_url:
3654
+ logger.warning("Expected 'next' URL is missing.")
3655
+ break
3656
+ next_data = await self._async_get_request(next_url, headers)
3657
+ data.extend(next_data.get('data', []))
3658
+ status_data = next_data
3659
+ status_data['data'] = data
3660
+ return CrawlStatusResponse(**status_data)
3661
+ else:
3662
+ raise Exception('Job completed but no data was returned')
3663
+ elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
3664
+ await asyncio.sleep(max(poll_interval, 2))
3665
+ else:
3666
+ raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
3667
+
3668
+ async def map_url(
3669
+ self,
3670
+ url: str,
3671
+ *,
3672
+ search: Optional[str] = None,
3673
+ ignore_sitemap: Optional[bool] = None,
3674
+ include_subdomains: Optional[bool] = None,
3675
+ sitemap_only: Optional[bool] = None,
3676
+ limit: Optional[int] = None,
3677
+ timeout: Optional[int] = None,
3678
+ params: Optional[MapParams] = None) -> MapResponse:
3679
+ """
3680
+ Asynchronously map and discover links from a URL.
3681
+
3682
+ Args:
3683
+ url (str): Target URL to map
3684
+ params (Optional[MapParams]): See MapParams model:
3685
+ Discovery Options:
3686
+ * search - Filter pattern for URLs
3687
+ * ignoreSitemap - Skip sitemap.xml
3688
+ * includeSubdomains - Include subdomain links
3689
+ * sitemapOnly - Only use sitemap.xml
3690
+
3691
+ Limits:
3692
+ * limit - Max URLs to return
3693
+ * timeout - Request timeout (ms)
3694
+
3695
+ Returns:
3696
+ MapResponse with:
3697
+ * Discovered URLs
3698
+ * Success/error status
3699
+
3700
+ Raises:
3701
+ Exception: If mapping fails
3702
+ """
3703
+ map_params = {}
3704
+ if params:
3705
+ map_params.update(params.dict(exclude_none=True))
3706
+
3707
+ # Add individual parameters
3708
+ if search is not None:
3709
+ map_params['search'] = search
3710
+ if ignore_sitemap is not None:
3711
+ map_params['ignoreSitemap'] = ignore_sitemap
3712
+ if include_subdomains is not None:
3713
+ map_params['includeSubdomains'] = include_subdomains
3714
+ if sitemap_only is not None:
3715
+ map_params['sitemapOnly'] = sitemap_only
3716
+ if limit is not None:
3717
+ map_params['limit'] = limit
3718
+ if timeout is not None:
3719
+ map_params['timeout'] = timeout
3720
+
3721
+ # Create final params object
3722
+ final_params = MapParams(**map_params)
3723
+ params_dict = final_params.dict(exclude_none=True)
3724
+ params_dict['url'] = url
3725
+ params_dict['origin'] = f"python-sdk@{version}"
3726
+
3727
+ # Make request
3728
+ endpoint = f'/v1/map'
3729
+ response = await self._async_post_request(
3730
+ f'{self.api_url}{endpoint}',
3731
+ params_dict,
3732
+ headers={"Authorization": f"Bearer {self.api_key}"}
3733
+ )
3734
+
3735
+ if response.get('success') and 'links' in response:
3736
+ return MapResponse(**response)
3737
+ elif 'error' in response:
3738
+ raise Exception(f'Failed to map URL. Error: {response["error"]}')
3739
+ else:
3740
+ raise Exception(f'Failed to map URL. Error: {response}')
3741
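A short sketch of the mapper above, with the usual assumed client name and key; the search term simply narrows the discovered links.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    result = await app.map_url("https://example.com", search="blog", limit=50)
    for link in result.links or []:
        print(link)

asyncio.run(main())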
+
3742
+ async def extract(
3743
+ self,
3744
+ urls: Optional[List[str]] = None,
3745
+ *,
3746
+ prompt: Optional[str] = None,
3747
+ schema: Optional[Any] = None,
3748
+ system_prompt: Optional[str] = None,
3749
+ allow_external_links: Optional[bool] = False,
3750
+ enable_web_search: Optional[bool] = False,
3751
+ show_sources: Optional[bool] = False,
3752
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3753
+
3754
+ """
3755
+ Asynchronously extract structured information from URLs.
3756
+
3757
+ Args:
3758
+ urls (Optional[List[str]]): URLs to extract from
3759
+ prompt (Optional[str]): Custom extraction prompt
3760
+ schema (Optional[Any]): JSON schema/Pydantic model
3761
+ system_prompt (Optional[str]): System context
3762
+ allow_external_links (Optional[bool]): Follow external links
3763
+ enable_web_search (Optional[bool]): Enable web search
3764
+ show_sources (Optional[bool]): Include source URLs
3765
+ agent (Optional[Dict[str, Any]]): Agent configuration
3766
+
3767
+ Returns:
3768
+ ExtractResponse with:
3769
+ * Structured data matching schema
3770
+ * Source information if requested
3771
+ * Success/error status
3772
+
3773
+ Raises:
3774
+ ValueError: If prompt/schema missing or extraction fails
3775
+ """
3776
+ headers = self._prepare_headers()
3777
+
3778
+ if not prompt and not schema:
3779
+ raise ValueError("Either prompt or schema is required")
3780
+
3781
+ if not urls and not prompt:
3782
+ raise ValueError("Either urls or prompt is required")
3783
+
3784
+ if schema:
3785
+ schema = self._ensure_schema_dict(schema)
3786
+
3787
+ request_data = {
3788
+ 'urls': urls or [],
3789
+ 'allowExternalLinks': allow_external_links,
3790
+ 'enableWebSearch': enable_web_search,
3791
+ 'showSources': show_sources,
3792
+ 'schema': schema,
3793
+ 'origin': f'python-sdk@{get_version()}'
3794
+ }
3795
+
3796
+ # Only add prompt and systemPrompt if they exist
3797
+ if prompt:
3798
+ request_data['prompt'] = prompt
3799
+ if system_prompt:
3800
+ request_data['systemPrompt'] = system_prompt
3801
+
3802
+ if agent:
3803
+ request_data['agent'] = agent
3804
+
3805
+ response = await self._async_post_request(
3806
+ f'{self.api_url}/v1/extract',
3807
+ request_data,
3808
+ headers
3809
+ )
3810
+
3811
+ if response.get('success'):
3812
+ job_id = response.get('id')
3813
+ if not job_id:
3814
+ raise Exception('Job ID not returned from extract request.')
3815
+
3816
+ while True:
3817
+ status_data = await self._async_get_request(
3818
+ f'{self.api_url}/v1/extract/{job_id}',
3819
+ headers
3820
+ )
3821
+
3822
+ if status_data['status'] == 'completed':
3823
+ return ExtractResponse(**status_data)
3824
+ elif status_data['status'] in ['failed', 'cancelled']:
3825
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
3826
+
3827
+ await asyncio.sleep(2)
3828
+ else:
3829
+ raise Exception(f'Failed to extract. Error: {response.get("error")}')
3830
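A sketch of structured extraction with a Pydantic schema, under the same assumptions about the client export and key; the schema fields and article URL are invented for illustration.

import asyncio
from pydantic import BaseModel
from firecrawl import AsyncFirecrawlApp  # assumed export name

class ArticleInfo(BaseModel):
    title: str
    author: str

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    result = await app.extract(
        urls=["https://example.com/some-article"],
        prompt="Extract the article title and author.",
        schema=ArticleInfo,  # converted to a JSON schema by _ensure_schema_dict
    )
    print(result.data)

asyncio.run(main())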
+
3831
+ async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
3832
+ """
3833
+ Check the status of an asynchronous batch scrape job.
3834
+
3835
+ Args:
3836
+ id (str): The ID of the batch scrape job
3837
+
3838
+ Returns:
3839
+ BatchScrapeStatusResponse containing:
3840
+ Status Information:
3841
+ * status - Current state (scraping/completed/failed/cancelled)
3842
+ * completed - Number of URLs scraped
3843
+ * total - Total URLs to scrape
3844
+ * creditsUsed - API credits consumed
3845
+ * expiresAt - Data expiration timestamp
3846
+
3847
+ Results:
3848
+ * data - List of scraped documents
3849
+ * next - URL for next page of results (if paginated)
3850
+ * success - Whether status check succeeded
3851
+ * error - Error message if failed
3852
+
3853
+ Raises:
3854
+ Exception: If status check fails
3855
+ """
3856
+ headers = self._prepare_headers()
3857
+ endpoint = f'/v1/batch/scrape/{id}'
3858
+
3859
+ status_data = await self._async_get_request(
3860
+ f'{self.api_url}{endpoint}',
3861
+ headers
3862
+ )
3863
+
3864
+ if status_data['status'] == 'completed':
3865
+ if 'data' in status_data:
3866
+ data = status_data['data']
3867
+ while 'next' in status_data:
3868
+ if len(status_data['data']) == 0:
3869
+ break
3870
+ next_url = status_data.get('next')
3871
+ if not next_url:
3872
+ logger.warning("Expected 'next' URL is missing.")
3873
+ break
3874
+ next_data = await self._async_get_request(next_url, headers)
3875
+ data.extend(next_data.get('data', []))
3876
+ status_data = next_data
3877
+ status_data['data'] = data
3878
+
3879
+ response = BatchScrapeStatusResponse(
3880
+ status=status_data.get('status'),
3881
+ total=status_data.get('total'),
3882
+ completed=status_data.get('completed'),
3883
+ creditsUsed=status_data.get('creditsUsed'),
3884
+ expiresAt=status_data.get('expiresAt'),
3885
+ data=status_data.get('data'),
+ success=False if 'error' in status_data else True
3886
+ )
3887
+
3888
+ if 'error' in status_data:
3889
+ response.error = status_data.get('error')
3890
+
3891
+ if 'next' in status_data:
3892
+ response.next = status_data.get('next')
3893
+
3894
+ return response
3898
+
3899
+ async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
3900
+ """
3901
+ Get information about errors from an asynchronous batch scrape job.
3902
+
3903
+ Args:
3904
+ id (str): The ID of the batch scrape job
3905
+
3906
+ Returns:
3907
+ CrawlErrorsResponse containing:
3908
+ errors (List[Dict[str, str]]): List of errors with fields:
3909
+ * id (str): Error ID
3910
+ * timestamp (str): When the error occurred
3911
+ * url (str): URL that caused the error
3912
+ * error (str): Error message
3913
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3914
+
3915
+ Raises:
3916
+ Exception: If error check fails
3917
+ """
3918
+ headers = self._prepare_headers()
3919
+ return await self._async_get_request(
3920
+ f'{self.api_url}/v1/batch/scrape/{id}/errors',
3921
+ headers
3922
+ )
3923
+
3924
+ async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
3925
+ """
3926
+ Get information about errors from an asynchronous crawl job.
3927
+
3928
+ Args:
3929
+ id (str): The ID of the crawl job
3930
+
3931
+ Returns:
3932
+ CrawlErrorsResponse containing:
3933
+ * errors (List[Dict[str, str]]): List of errors with fields:
3934
+ - id (str): Error ID
3935
+ - timestamp (str): When the error occurred
3936
+ - url (str): URL that caused the error
3937
+ - error (str): Error message
3938
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3939
+
3940
+ Raises:
3941
+ Exception: If error check fails
3942
+ """
3943
+ headers = self._prepare_headers()
3944
+ return await self._async_get_request(
3945
+ f'{self.api_url}/v1/crawl/{id}/errors',
3946
+ headers
3947
+ )
3948
+
3949
+ async def cancel_crawl(self, id: str) -> Dict[str, Any]:
3950
+ """
3951
+ Cancel an asynchronous crawl job.
3952
+
3953
+ Args:
3954
+ id (str): The ID of the crawl job to cancel
3955
+
3956
+ Returns:
3957
+ Dict[str, Any] containing:
3958
+ * success (bool): Whether cancellation was successful
3959
+ * error (str, optional): Error message if cancellation failed
3960
+
3961
+ Raises:
3962
+ Exception: If cancellation fails
3963
+ """
3964
+ headers = self._prepare_headers()
3965
+ async with aiohttp.ClientSession() as session:
3966
+ async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
3967
+ return await response.json()
3968
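A sketch of cancelling a running crawl, again with an assumed client name and key.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    started = await app.async_crawl_url("https://example.com", limit=100)
    cancelled = await app.cancel_crawl(started.id)
    # Per the docstring above, the payload carries 'success' and an optional 'error'.
    print(cancelled)

asyncio.run(main())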
+
3969
+ async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
3970
+ """
3971
+ Check the status of an asynchronous extraction job.
3972
+
3973
+ Args:
3974
+ job_id (str): The ID of the extraction job
3975
+
3976
+ Returns:
3977
+ ExtractResponse[Any] with:
3978
+ * success (bool): Whether request succeeded
3979
+ * data (Optional[Any]): Extracted data matching schema
3980
+ * error (Optional[str]): Error message if any
3981
+ * warning (Optional[str]): Warning message if any
3982
+ * sources (Optional[List[str]]): Source URLs if requested
3983
+
3984
+ Raises:
3985
+ ValueError: If status check fails
3986
+ """
3987
+ headers = self._prepare_headers()
3988
+ try:
3989
+ return await self._async_get_request(
3990
+ f'{self.api_url}/v1/extract/{job_id}',
3991
+ headers
3992
+ )
3993
+ except Exception as e:
3994
+ raise ValueError(str(e))
3995
+
3996
+ async def async_extract(
3997
+ self,
3998
+ urls: Optional[List[str]] = None,
3999
+ *,
4000
+ prompt: Optional[str] = None,
4001
+ schema: Optional[Any] = None,
4002
+ system_prompt: Optional[str] = None,
4003
+ allow_external_links: Optional[bool] = False,
4004
+ enable_web_search: Optional[bool] = False,
4005
+ show_sources: Optional[bool] = False,
4006
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
4007
+ """
4008
+ Initiate an asynchronous extraction job without waiting for completion.
4009
+
4010
+ Args:
4011
+ urls (Optional[List[str]]): URLs to extract from
4012
+ prompt (Optional[str]): Custom extraction prompt
4013
+ schema (Optional[Any]): JSON schema/Pydantic model
4014
+ system_prompt (Optional[str]): System context
4015
+ allow_external_links (Optional[bool]): Follow external links
4016
+ enable_web_search (Optional[bool]): Enable web search
4017
+ show_sources (Optional[bool]): Include source URLs
4018
+ agent (Optional[Dict[str, Any]]): Agent configuration
4019
4020
+
4021
+ Returns:
4022
+ ExtractResponse[Any] with:
4023
+ * success (bool): Whether request succeeded
4024
+ * data (Optional[Any]): Extracted data matching schema
4025
+ * error (Optional[str]): Error message if any
4026
+
4027
+ Raises:
4028
+ ValueError: If job initiation fails
4029
+ """
4030
+ headers = self._prepare_headers()
4031
+
4032
+ if not prompt and not schema:
4033
+ raise ValueError("Either prompt or schema is required")
4034
+
4035
+ if not urls and not prompt:
4036
+ raise ValueError("Either urls or prompt is required")
4037
+
4038
+ if schema:
4039
+ schema = self._ensure_schema_dict(schema)
4040
+
4041
+ request_data = {
4042
+ 'urls': urls or [],
4043
+ 'allowExternalLinks': allow_external_links,
4044
+ 'enableWebSearch': enable_web_search,
4045
+ 'showSources': show_sources,
4046
+ 'schema': schema,
4047
+ 'origin': f'python-sdk@{version}'
4048
+ }
4049
+
4050
+ if prompt:
4051
+ request_data['prompt'] = prompt
4052
+ if system_prompt:
4053
+ request_data['systemPrompt'] = system_prompt
4054
+ if agent:
4055
+ request_data['agent'] = agent
4056
+
4057
+ try:
4058
+ return await self._async_post_request(
4059
+ f'{self.api_url}/v1/extract',
4060
+ request_data,
4061
+ headers
4062
+ )
4063
+ except Exception as e:
4064
+ raise ValueError(str(e))
4065
+
4066
+ async def generate_llms_text(
4067
+ self,
4068
+ url: str,
4069
+ *,
4070
+ max_urls: Optional[int] = None,
4071
+ show_full_text: Optional[bool] = None,
+ cache: Optional[bool] = None,
4072
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
4073
+ """
4074
+ Generate LLMs.txt for a given URL and monitor until completion.
4075
+
4076
+ Args:
4077
+ url (str): Target URL to generate LLMs.txt from
4078
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
4079
+ show_full_text (Optional[bool]): Include full text in output (default: False)
+ cache (Optional[bool]): Whether to use cached content if available (default: True)
4080
+ experimental_stream (Optional[bool]): Enable experimental streaming
4081
+
4082
+ Returns:
4083
+ GenerateLLMsTextStatusResponse containing:
4084
+ * success (bool): Whether generation completed successfully
4085
+ * status (str): Status of generation (processing/completed/failed)
4086
+ * data (Dict[str, str], optional): Generated text with fields:
4087
+ - llmstxt (str): Generated LLMs.txt content
4088
+ - llmsfulltxt (str, optional): Full version if requested
4089
+ * error (str, optional): Error message if generation failed
4090
+ * expiresAt (str): When the generated data expires
4091
+
4092
+ Raises:
4093
+ Exception: If generation fails
4094
+ """
4095
+ params = {}
4096
+ if max_urls is not None:
4097
+ params['maxUrls'] = max_urls
4098
+ if show_full_text is not None:
4099
+ params['showFullText'] = show_full_text
4100
+ if experimental_stream is not None:
4101
+ params['__experimental_stream'] = experimental_stream
4102
+
4103
+ response = await self.async_generate_llms_text(
4104
+ url,
4105
+ max_urls=max_urls,
4106
+ show_full_text=show_full_text,
4107
+ cache=cache,
4108
+ experimental_stream=experimental_stream
4109
+ )
4110
+ if not response.get('success') or 'id' not in response:
4111
+ return response
4112
+
4113
+ job_id = response['id']
4114
+ while True:
4115
+ status = await self.check_generate_llms_text_status(job_id)
4116
+
4117
+ if status['status'] == 'completed':
4118
+ return status
4119
+ elif status['status'] == 'failed':
4120
+ raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
4121
+ elif status['status'] != 'processing':
4122
+ break
4123
+
4124
+ await asyncio.sleep(2)
4125
+
4126
+ return GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly')
4127
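A usage sketch for LLMs.txt generation, with the usual naming assumptions; the helper polls internally and returns the final status payload.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    result = await app.generate_llms_text(
        "https://example.com",
        max_urls=10,
        show_full_text=False,
    )
    # Completed jobs carry the generated text under data['llmstxt'] per the docstring above.
    print(result)

asyncio.run(main())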
+
4128
+ async def async_generate_llms_text(
4129
+ self,
4130
+ url: str,
4131
+ *,
4132
+ max_urls: Optional[int] = None,
4133
+ show_full_text: Optional[bool] = None,
4134
+ cache: Optional[bool] = None,
4135
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
4136
+ """
4137
+ Initiate an asynchronous LLMs.txt generation job without waiting for completion.
4138
+
4139
+ Args:
4140
+ url (str): Target URL to generate LLMs.txt from
4141
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
4142
+ show_full_text (Optional[bool]): Include full text in output (default: False)
4143
+ cache (Optional[bool]): Whether to use cached content if available (default: True)
4144
+ experimental_stream (Optional[bool]): Enable experimental streaming
4145
+
4146
+ Returns:
4147
+ GenerateLLMsTextResponse containing:
4148
+ * success (bool): Whether job started successfully
4149
+ * id (str): Unique identifier for the job
4150
+ * error (str, optional): Error message if start failed
4151
+
4152
+ Raises:
4153
+ ValueError: If job initiation fails
4154
+ """
4155
+ params = {}
4156
+ if max_urls is not None:
4157
+ params['maxUrls'] = max_urls
4158
+ if show_full_text is not None:
4159
+ params['showFullText'] = show_full_text
4160
+ if experimental_stream is not None:
4161
+ params['__experimental_stream'] = experimental_stream
4162
+
4163
+ params = GenerateLLMsTextParams(
4164
+ maxUrls=max_urls,
4165
+ showFullText=show_full_text,
4166
+ cache=cache,
4167
+ __experimental_stream=experimental_stream
4168
+ )
4169
+
4170
+ headers = self._prepare_headers()
4171
+ json_data = {'url': url, **params.dict(exclude_none=True)}
4172
+ json_data['origin'] = f"python-sdk@{version}"
4173
+
4174
+ try:
4175
+ return await self._async_post_request(
4176
+ f'{self.api_url}/v1/llmstxt',
4177
+ json_data,
4178
+ headers
4179
+ )
4180
+ except Exception as e:
4181
+ raise ValueError(str(e))
4182
+
4183
+ async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
4184
+ """
4185
+ Check the status of an asynchronous LLMs.txt generation job.
4186
+
4187
+ Args:
4188
+ id (str): The ID of the generation job
4189
+
4190
+ Returns:
4191
+ GenerateLLMsTextStatusResponse containing:
4192
+ * success (bool): Whether generation completed successfully
4193
+ * status (str): Status of generation (processing/completed/failed)
4194
+ * data (Dict[str, str], optional): Generated text with fields:
4195
+ - llmstxt (str): Generated LLMs.txt content
4196
+ - llmsfulltxt (str, optional): Full version if requested
4197
+ * error (str, optional): Error message if generation failed
4198
+ * expiresAt (str): When the generated data expires
4199
+
4200
+ Raises:
4201
+ ValueError: If status check fails
4202
+ """
4203
+ headers = self._prepare_headers()
4204
+ try:
4205
+ return await self._async_get_request(
4206
+ f'{self.api_url}/v1/llmstxt/{id}',
4207
+ headers
4208
+ )
4209
+ except Exception as e:
4210
+ raise ValueError(str(e))
4211
+
4212
+ async def deep_research(
4213
+ self,
4214
+ query: str,
4215
+ *,
4216
+ max_depth: Optional[int] = None,
4217
+ time_limit: Optional[int] = None,
4218
+ max_urls: Optional[int] = None,
4219
+ analysis_prompt: Optional[str] = None,
4220
+ system_prompt: Optional[str] = None,
4221
+ __experimental_stream_steps: Optional[bool] = None,
4222
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
4223
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
4224
+ """
4225
+ Initiates a deep research operation on a given query and polls until completion.
4226
+
4227
+ Args:
4228
+ query (str): Research query or topic to investigate
4229
+ max_depth (Optional[int]): Maximum depth of research exploration
4230
+ time_limit (Optional[int]): Time limit in seconds for research
4231
+ max_urls (Optional[int]): Maximum number of URLs to process
4232
+ analysis_prompt (Optional[str]): Custom prompt for analysis
4233
+ system_prompt (Optional[str]): Custom system prompt
4234
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4235
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
4236
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
4237
+
4238
+ Returns:
4239
+ DeepResearchStatusResponse containing:
4240
+ * success (bool): Whether research completed successfully
4241
+ * status (str): Current state (processing/completed/failed)
4242
+ * error (Optional[str]): Error message if failed
4243
+ * id (str): Unique identifier for the research job
4244
+ * data (Any): Research findings and analysis
4245
+ * sources (List[Dict]): List of discovered sources
4246
+ * activities (List[Dict]): Research progress log
4247
+ * summaries (List[str]): Generated research summaries
4248
+
4249
+ Raises:
4250
+ Exception: If research fails
4251
+ """
4252
+ research_params = {}
4253
+ if max_depth is not None:
4254
+ research_params['maxDepth'] = max_depth
4255
+ if time_limit is not None:
4256
+ research_params['timeLimit'] = time_limit
4257
+ if max_urls is not None:
4258
+ research_params['maxUrls'] = max_urls
4259
+ if analysis_prompt is not None:
4260
+ research_params['analysisPrompt'] = analysis_prompt
4261
+ if system_prompt is not None:
4262
+ research_params['systemPrompt'] = system_prompt
4263
+ if __experimental_stream_steps is not None:
4264
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
4265
+ research_params = DeepResearchParams(**research_params)
4266
+
4267
+ response = await self.async_deep_research(
4268
+ query,
4269
+ max_depth=max_depth,
4270
+ time_limit=time_limit,
4271
+ max_urls=max_urls,
4272
+ analysis_prompt=analysis_prompt,
4273
+ system_prompt=system_prompt
4274
+ )
4275
+ if not response.get('success') or 'id' not in response:
4276
+ return response
4277
+
4278
+ job_id = response['id']
4279
+ last_activity_count = 0
4280
+ last_source_count = 0
4281
+
4282
+ while True:
4283
+ status = await self.check_deep_research_status(job_id)
4284
+
4285
+ if on_activity and 'activities' in status:
4286
+ new_activities = status['activities'][last_activity_count:]
4287
+ for activity in new_activities:
4288
+ on_activity(activity)
4289
+ last_activity_count = len(status['activities'])
4290
+
4291
+ if on_source and 'sources' in status:
4292
+ new_sources = status['sources'][last_source_count:]
4293
+ for source in new_sources:
4294
+ on_source(source)
4295
+ last_source_count = len(status['sources'])
4296
+
4297
+ if status['status'] == 'completed':
4298
+ return status
4299
+ elif status['status'] == 'failed':
4300
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
4301
+ elif status['status'] != 'processing':
4302
+ break
4303
+
4304
+ await asyncio.sleep(2)
4305
+
4306
+ return DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')
4307
+
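A usage sketch for the blocking deep_research helper above, wiring up both optional callbacks. The API key and query are placeholders, and the callback payload keys follow the docstring.

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed package export

    def log_activity(activity):
        # Receives {type, status, message, timestamp, depth} per the docstring.
        print(f"[{activity.get('type')}] {activity.get('message')}")

    def log_source(source):
        # Receives {url, title, description} per the docstring.
        print("source:", source.get("url"))

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
        result = await app.deep_research(
            "How do large-scale web crawlers schedule URLs?",  # placeholder query
            max_depth=3,
            time_limit=120,
            on_activity=log_activity,
            on_source=log_source,
        )
        print(result.get("status"), len(result.get("sources", [])))

    asyncio.run(main())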
4308
+ async def async_deep_research(
4309
+ self,
4310
+ query: str,
4311
+ *,
4312
+ max_depth: Optional[int] = None,
4313
+ time_limit: Optional[int] = None,
4314
+ max_urls: Optional[int] = None,
4315
+ analysis_prompt: Optional[str] = None,
4316
+ system_prompt: Optional[str] = None,
4317
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
4318
+ """
4319
+ Initiates an asynchronous deep research operation.
4320
+
4321
+ Args:
4322
+ query (str): Research query or topic to investigate
4323
+ max_depth (Optional[int]): Maximum depth of research exploration
4324
+ time_limit (Optional[int]): Time limit in seconds for research
4325
+ max_urls (Optional[int]): Maximum number of URLs to process
4326
+ analysis_prompt (Optional[str]): Custom prompt for analysis
4327
+ system_prompt (Optional[str]): Custom system prompt
4328
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4329
+
4330
+ Returns:
4331
+ Dict[str, Any]: A response containing:
4332
+ * success (bool): Whether the research initiation was successful
4333
+ * id (str): The unique identifier for the research job
4334
+ * error (str, optional): Error message if initiation failed
4335
+
4336
+ Raises:
4337
+ ValueError: If the research initiation fails.
4338
+ """
4339
+ research_params = {}
4340
+ if max_depth is not None:
4341
+ research_params['maxDepth'] = max_depth
4342
+ if time_limit is not None:
4343
+ research_params['timeLimit'] = time_limit
4344
+ if max_urls is not None:
4345
+ research_params['maxUrls'] = max_urls
4346
+ if analysis_prompt is not None:
4347
+ research_params['analysisPrompt'] = analysis_prompt
4348
+ if system_prompt is not None:
4349
+ research_params['systemPrompt'] = system_prompt
4350
+ if __experimental_stream_steps is not None:
4351
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
4352
+ research_params = DeepResearchParams(**research_params)
4353
+
4354
+ headers = self._prepare_headers()
4355
+
4356
+ json_data = {'query': query, **research_params.dict(exclude_none=True)}
4357
+ json_data['origin'] = f"python-sdk@{version}"
4358
+
4359
+ try:
4360
+ return await self._async_post_request(
4361
+ f'{self.api_url}/v1/deep-research',
4362
+ json_data,
4363
+ headers
4364
+ )
4365
+ except Exception as e:
4366
+ raise ValueError(str(e))
4367
+
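A fire-and-forget sketch for async_deep_research: start the job, keep its id, and poll it later with check_deep_research_status. Key and query are placeholders; the response keys match how deep_research itself reads the start response.

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed package export

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
        started = await app.async_deep_research("History of the HTTP protocol", max_urls=10)
        if started.get("success"):
            print("job id:", started["id"])  # poll this id with check_deep_research_status
        else:
            print("failed to start:", started.get("error"))

    asyncio.run(main())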
4368
+ async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
4369
+ """
4370
+ Check the status of a deep research operation.
4371
+
4372
+ Args:
4373
+ id (str): The ID of the deep research operation.
4374
+
4375
+ Returns:
4376
+ DeepResearchStatusResponse containing:
4377
+
4378
+ Status:
4379
+ * success - Whether research completed successfully
4380
+ * status - Current state (processing/completed/failed)
4381
+ * error - Error message if failed
4382
+
4383
+ Results:
4384
+ * id - Unique identifier for the research job
4385
+ * data - Research findings and analysis
4386
+ * sources - List of discovered sources
4387
+ * activities - Research progress log
4388
+ * summaries - Generated research summaries
4389
+
4390
+ Raises:
4391
+ ValueError: If the status check fails.
4392
+ """
4393
+ headers = self._prepare_headers()
4394
+ try:
4395
+ return await self._async_get_request(
4396
+ f'{self.api_url}/v1/deep-research/{id}',
4397
+ headers
4398
+ )
4399
+ except Exception as e:
4400
+ raise ValueError(str(e))
4401
+
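A small helper sketch around check_deep_research_status that reproduces the polling pattern deep_research uses internally, for callers who started a job with async_deep_research.

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed package export

    async def wait_for_research(app: AsyncFirecrawlApp, job_id: str, interval: float = 2.0):
        # Poll until the research job completes or fails.
        while True:
            status = await app.check_deep_research_status(job_id)
            if status.get("status") == "completed":
                return status
            if status.get("status") == "failed":
                raise RuntimeError(status.get("error"))
            await asyncio.sleep(interval)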
4402
+ async def search(
4403
+ self,
4404
+ query: str,
4405
+ *,
4406
+ limit: Optional[int] = None,
4407
+ tbs: Optional[str] = None,
4408
+ filter: Optional[str] = None,
4409
+ lang: Optional[str] = None,
4410
+ country: Optional[str] = None,
4411
+ location: Optional[str] = None,
4412
+ timeout: Optional[int] = None,
4413
+ scrape_options: Optional[ScrapeOptions] = None,
4414
+ params: Optional[Union[Dict[str, Any], SearchParams]] = None,
4415
+ **kwargs) -> SearchResponse:
4416
+ """
4417
+ Asynchronously search for content using Firecrawl.
4418
+
4419
+ Args:
4420
+ query (str): Search query string
4421
+ limit (Optional[int]): Max results (default: 5)
4422
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
4423
+ filter (Optional[str]): Custom result filter
4424
+ lang (Optional[str]): Language code (default: "en")
4425
+ country (Optional[str]): Country code (default: "us")
4426
+ location (Optional[str]): Geo-targeting
4427
+ timeout (Optional[int]): Request timeout in milliseconds
4428
+ scrape_options (Optional[ScrapeOptions]): Result scraping configuration
4429
+ params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
4430
+ **kwargs: Additional keyword arguments for future compatibility
4431
+
4432
+ Returns:
4433
+ SearchResponse: Response containing:
4434
+ * success (bool): Whether request succeeded
4435
+ * data (List[FirecrawlDocument]): Search results
4436
+ * warning (Optional[str]): Warning message if any
4437
+ * error (Optional[str]): Error message if any
4438
+
4439
+ Raises:
4440
+ Exception: If search fails or response cannot be parsed
4441
+ """
4442
+ # Build search parameters
4443
+ search_params = {}
4444
+ if params:
4445
+ if isinstance(params, dict):
4446
+ search_params.update(params)
4447
+ else:
4448
+ search_params.update(params.dict(exclude_none=True))
4449
+
4450
+ # Add individual parameters
4451
+ if limit is not None:
4452
+ search_params['limit'] = limit
4453
+ if tbs is not None:
4454
+ search_params['tbs'] = tbs
4455
+ if filter is not None:
4456
+ search_params['filter'] = filter
4457
+ if lang is not None:
4458
+ search_params['lang'] = lang
4459
+ if country is not None:
4460
+ search_params['country'] = country
4461
+ if location is not None:
4462
+ search_params['location'] = location
4463
+ if timeout is not None:
4464
+ search_params['timeout'] = timeout
4465
+ if scrape_options is not None:
4466
+ search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
4467
+
4468
+ # Add any additional kwargs
4469
+ search_params.update(kwargs)
4470
+
4471
+ # Create final params object
4472
+ final_params = SearchParams(query=query, **search_params)
4473
+ params_dict = final_params.dict(exclude_none=True)
4474
+ params_dict['origin'] = f"python-sdk@{version}"
4475
+
4476
+ return await self._async_post_request(
4477
+ f"{self.api_url}/v1/search",
4478
+ params_dict,
4479
+ {"Authorization": f"Bearer {self.api_key}"}
4480
+ )
4481
+
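A usage sketch for the async search method above. Iterating over results with dict access, and reading a url field from each entry, are assumptions about the returned document shape rather than guarantees of this file.

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed package export

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
        results = await app.search("firecrawl python sdk", limit=3, lang="en", country="us")
        for doc in results.get("data", []):
            print(doc.get("url"))  # 'url' is assumed to exist on each result

    asyncio.run(main())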
4482
+ class AsyncCrawlWatcher(CrawlWatcher):
4483
+ """
4484
+ Async version of CrawlWatcher that manages the WebSocket connection and event dispatch with asyncio.
4485
+ """
4486
+ def __init__(self, id: str, app: AsyncFirecrawlApp):
4487
+ super().__init__(id, app)
4488
+
4489
+ async def connect(self) -> None:
4490
+ """
4491
+ Establishes async WebSocket connection and starts listening for messages.
4492
+ """
4493
+ async with websockets.connect(
4494
+ self.ws_url,
4495
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
4496
+ ) as websocket:
4497
+ await self._listen(websocket)
4498
+
4499
+ async def _listen(self, websocket) -> None:
4500
+ """
4501
+ Listens for incoming WebSocket messages and handles them asynchronously.
4502
+
4503
+ Args:
4504
+ websocket: The WebSocket connection object
4505
+ """
4506
+ async for message in websocket:
4507
+ msg = json.loads(message)
4508
+ await self._handle_message(msg)
4509
+
4510
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
4511
+ """
4512
+ Handles incoming WebSocket messages based on their type asynchronously.
4513
+
4514
+ Args:
4515
+ msg (Dict[str, Any]): The message to handle
4516
+ """
4517
+ if msg['type'] == 'done':
4518
+ self.status = 'completed'
4519
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
4520
+ elif msg['type'] == 'error':
4521
+ self.status = 'failed'
4522
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
4523
+ elif msg['type'] == 'catchup':
4524
+ self.status = msg['data']['status']
4525
+ self.data.extend(msg['data'].get('data', []))
4526
+ for doc in self.data:
4527
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
4528
+ elif msg['type'] == 'document':
4529
+ self.data.append(msg['data'])
4530
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
4531
+
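A sketch of driving AsyncCrawlWatcher directly. The import path, the async_crawl_url helper used to obtain a crawl id, and the add_event_listener API inherited from CrawlWatcher are assumptions inferred from the surrounding code, not guarantees of this file; the event payload keys match the dispatch_event calls in _handle_message above.

    import asyncio
    from firecrawl import AsyncFirecrawlApp            # assumed package export
    from firecrawl.firecrawl import AsyncCrawlWatcher  # assumed import path

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")           # placeholder key
        started = await app.async_crawl_url("https://example.com")   # assumed helper returning a crawl id
        watcher = AsyncCrawlWatcher(started["id"], app)

        # add_event_listener is assumed to come from the CrawlWatcher base class.
        watcher.add_event_listener("document", lambda detail: print("document:", detail["data"]))
        watcher.add_event_listener("done", lambda detail: print("finished:", detail["status"]))

        await watcher.connect()  # runs until the server closes the WebSocket

    asyncio.run(main())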
4532
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
4533
+ """
4534
+ Handle errors from async API responses.
4535
+ """
4536
+ try:
4537
+ error_data = await response.json()
4538
+ error_message = error_data.get('error', 'No error message provided.')
4539
+ error_details = error_data.get('details', 'No additional error details provided.')
4540
+ except Exception:
4541
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
4542
+
4543
+ # Use the app's method to get the error message
4544
+ message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
4545
+
4546
+ raise aiohttp.ClientError(message)
4547
+
4548
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
4549
+ """
4550
+ Generate a standardized error message based on HTTP status code for async operations.
4551
+
4552
+ Args:
4553
+ status_code (int): The HTTP status code from the response
4554
+ action (str): Description of the action that was being performed
4555
+ error_message (str): The error message from the API response
4556
+ error_details (str): Additional error details from the API response
4557
+
4558
+ Returns:
4559
+ str: A formatted error message
4560
+ """
4561
+ return self._get_error_message(status_code, action, error_message, error_details)
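A sketch of handling failures from the async helpers. _handle_error raises aiohttp.ClientError with a status-specific message, and the public methods above generally re-wrap unexpected failures as ValueError, so catching both is a reasonable default.

    import asyncio
    import aiohttp
    from firecrawl import AsyncFirecrawlApp  # assumed package export

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
        try:
            await app.check_deep_research_status("nonexistent-job-id")  # placeholder id
        except (aiohttp.ClientError, ValueError) as err:
            print("request failed:", err)

    asyncio.run(main())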