firecrawl-py 2.16.1 → 2.16.2 (py3-none-any.whl)

This diff shows the content of publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

This release of firecrawl-py has been flagged as potentially problematic.

@@ -0,0 +1,4616 @@
1
+ """
2
+ FirecrawlApp Module
3
+
4
+ This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
5
+ It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
6
+ and check the status of these jobs. The module uses requests for HTTP communication
7
+ and handles retries for certain HTTP status codes.
8
+
9
+ Classes:
10
+ - FirecrawlApp: Main class for interacting with the Firecrawl API.
11
+ """
12
+ import logging
13
+ import os
14
+ import time
15
+ from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
16
+ import json
17
+ from datetime import datetime
18
+ import re
19
+ import warnings
20
+ import requests
21
+ import pydantic
22
+ import websockets
23
+ import aiohttp
24
+ import asyncio
25
+ from pydantic import Field
26
+
27
+ # Suppress Pydantic warnings about attribute shadowing
28
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
29
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
30
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
31
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
32
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingOptions\" shadows an attribute in parent \"BaseModel\"")
33
+
34
+ def get_version():
35
+ try:
36
+ from pathlib import Path
37
+ package_path = os.path.dirname(__file__)
38
+ version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
39
+ version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
40
+ if version_match:
41
+ return version_match.group(1).strip()
42
+ except Exception:
43
+ print("Failed to get version from __init__.py")
44
+ return None
45
+
46
+ version = get_version()
47
+
48
+ logger : logging.Logger = logging.getLogger("firecrawl")
49
+
50
+ T = TypeVar('T')
51
+
52
+ # class FirecrawlDocumentMetadata(pydantic.BaseModel):
53
+ # """Metadata for a Firecrawl document."""
54
+ # title: Optional[str] = None
55
+ # description: Optional[str] = None
56
+ # language: Optional[str] = None
57
+ # keywords: Optional[str] = None
58
+ # robots: Optional[str] = None
59
+ # ogTitle: Optional[str] = None
60
+ # ogDescription: Optional[str] = None
61
+ # ogUrl: Optional[str] = None
62
+ # ogImage: Optional[str] = None
63
+ # ogAudio: Optional[str] = None
64
+ # ogDeterminer: Optional[str] = None
65
+ # ogLocale: Optional[str] = None
66
+ # ogLocaleAlternate: Optional[List[str]] = None
67
+ # ogSiteName: Optional[str] = None
68
+ # ogVideo: Optional[str] = None
69
+ # dctermsCreated: Optional[str] = None
70
+ # dcDateCreated: Optional[str] = None
71
+ # dcDate: Optional[str] = None
72
+ # dctermsType: Optional[str] = None
73
+ # dcType: Optional[str] = None
74
+ # dctermsAudience: Optional[str] = None
75
+ # dctermsSubject: Optional[str] = None
76
+ # dcSubject: Optional[str] = None
77
+ # dcDescription: Optional[str] = None
78
+ # dctermsKeywords: Optional[str] = None
79
+ # modifiedTime: Optional[str] = None
80
+ # publishedTime: Optional[str] = None
81
+ # articleTag: Optional[str] = None
82
+ # articleSection: Optional[str] = None
83
+ # sourceURL: Optional[str] = None
84
+ # statusCode: Optional[int] = None
85
+ # error: Optional[str] = None
86
+
87
+ class AgentOptions(pydantic.BaseModel):
88
+ """Configuration for the agent."""
89
+ model: Literal["FIRE-1"] = "FIRE-1"
90
+ prompt: Optional[str] = None
91
+
92
+ class AgentOptionsExtract(pydantic.BaseModel):
93
+ """Configuration for the agent in extract operations."""
94
+ model: Literal["FIRE-1"] = "FIRE-1"
95
+
96
+ class ActionsResult(pydantic.BaseModel):
97
+ """Result of actions performed during scraping."""
98
+ screenshots: List[str]
99
+ pdfs: List[str]
100
+
101
+ class ChangeTrackingData(pydantic.BaseModel):
102
+ """
103
+ Data for the change tracking format.
104
+ """
105
+ previousScrapeAt: Optional[str] = None
106
+ changeStatus: str # "new" | "same" | "changed" | "removed"
107
+ visibility: str # "visible" | "hidden"
108
+ diff: Optional[Dict[str, Any]] = None
109
+ json: Optional[Any] = None
110
+
111
+ class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
112
+ """Document retrieved or processed by Firecrawl."""
113
+ url: Optional[str] = None
114
+ markdown: Optional[str] = None
115
+ html: Optional[str] = None
116
+ rawHtml: Optional[str] = None
117
+ links: Optional[List[str]] = None
118
+ extract: Optional[T] = None
119
+ json: Optional[T] = None
120
+ screenshot: Optional[str] = None
121
+ metadata: Optional[Any] = None
122
+ actions: Optional[ActionsResult] = None
123
+ title: Optional[str] = None # v1 search only
124
+ description: Optional[str] = None # v1 search only
125
+ changeTracking: Optional[ChangeTrackingData] = None
126
+
127
+ class LocationConfig(pydantic.BaseModel):
128
+ """Location configuration for scraping."""
129
+ country: Optional[str] = None
130
+ languages: Optional[List[str]] = None
131
+
132
+ class WebhookConfig(pydantic.BaseModel):
133
+ """Configuration for webhooks."""
134
+ url: str
135
+ headers: Optional[Dict[str, str]] = None
136
+ metadata: Optional[Dict[str, str]] = None
137
+ events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
138
+
139
+ class ChangeTrackingOptions(pydantic.BaseModel):
140
+ """Configuration for change tracking."""
141
+ modes: Optional[List[Literal["git-diff", "json"]]] = None
142
+ schema: Optional[Any] = None
143
+ prompt: Optional[str] = None
144
+ tag: Optional[str] = None
145
+
146
+ class ScrapeOptions(pydantic.BaseModel):
147
+ """Parameters for scraping operations."""
148
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
149
+ headers: Optional[Dict[str, str]] = None
150
+ includeTags: Optional[List[str]] = None
151
+ excludeTags: Optional[List[str]] = None
152
+ onlyMainContent: Optional[bool] = None
153
+ waitFor: Optional[int] = None
154
+ timeout: Optional[int] = None
155
+ location: Optional[LocationConfig] = None
156
+ mobile: Optional[bool] = None
157
+ skipTlsVerification: Optional[bool] = None
158
+ removeBase64Images: Optional[bool] = None
159
+ blockAds: Optional[bool] = None
160
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None
161
+ changeTrackingOptions: Optional[ChangeTrackingOptions] = None
162
+ maxAge: Optional[int] = None
163
+ storeInCache: Optional[bool] = None
164
+ parsePDF: Optional[bool] = None
165
+
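For illustration, a minimal sketch of constructing the `ScrapeOptions` model defined above (all values are placeholders; the import assumes the model is re-exported from the package root, which this excerpt does not show). Such an object is what the crawl and search methods later in this file accept via their `scrape_options` parameter.

```python
# Sketch only: values are placeholders and the import path is an assumption.
from firecrawl import ScrapeOptions

options = ScrapeOptions(
    formats=["markdown", "links"],
    onlyMainContent=True,
    waitFor=1000,  # milliseconds
    mobile=False,
)
print(options.dict(exclude_none=True))  # camelCase keys, as sent to the API
```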
166
+ class WaitAction(pydantic.BaseModel):
167
+ """Wait action to perform during scraping."""
168
+ type: Literal["wait"]
169
+ milliseconds: Optional[int] = None
170
+ selector: Optional[str] = None
171
+
172
+ class ScreenshotAction(pydantic.BaseModel):
173
+ """Screenshot action to perform during scraping."""
174
+ type: Literal["screenshot"]
175
+ fullPage: Optional[bool] = None
176
+ quality: Optional[int] = None
177
+
178
+ class ClickAction(pydantic.BaseModel):
179
+ """Click action to perform during scraping."""
180
+ type: Literal["click"]
181
+ selector: str
182
+
183
+ class WriteAction(pydantic.BaseModel):
184
+ """Write action to perform during scraping."""
185
+ type: Literal["write"]
186
+ text: str
187
+
188
+ class PressAction(pydantic.BaseModel):
189
+ """Press action to perform during scraping."""
190
+ type: Literal["press"]
191
+ key: str
192
+
193
+ class ScrollAction(pydantic.BaseModel):
194
+ """Scroll action to perform during scraping."""
195
+ type: Literal["scroll"]
196
+ direction: Literal["up", "down"]
197
+ selector: Optional[str] = None
198
+
199
+ class ScrapeAction(pydantic.BaseModel):
200
+ """Scrape action to perform during scraping."""
201
+ type: Literal["scrape"]
202
+
203
+ class ExecuteJavascriptAction(pydantic.BaseModel):
204
+ """Execute javascript action to perform during scraping."""
205
+ type: Literal["executeJavascript"]
206
+ script: str
207
+
208
+ class PDFAction(pydantic.BaseModel):
209
+ """PDF action to perform during scraping."""
210
+ type: Literal["pdf"]
211
+ format: Optional[Literal["A0", "A1", "A2", "A3", "A4", "A5", "A6", "Letter", "Legal", "Tabloid", "Ledger"]] = None
212
+ landscape: Optional[bool] = None
213
+ scale: Optional[float] = None
214
+
215
+ class ExtractAgent(pydantic.BaseModel):
216
+ """Configuration for the agent in extract operations."""
217
+ model: Literal["FIRE-1"] = "FIRE-1"
218
+
219
+ class JsonConfig(pydantic.BaseModel):
220
+ """Configuration for extraction."""
221
+ prompt: Optional[str] = None
222
+ schema: Optional[Any] = None
223
+ systemPrompt: Optional[str] = None
224
+ agent: Optional[ExtractAgent] = None
225
+
226
+ class ScrapeParams(ScrapeOptions):
227
+ """Parameters for scraping operations."""
228
+ extract: Optional[JsonConfig] = None
229
+ jsonOptions: Optional[JsonConfig] = None
230
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None
231
+ agent: Optional[AgentOptions] = None
232
+ webhook: Optional[WebhookConfig] = None
233
+
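As a sketch of how the action models above combine into the `actions` list accepted by `ScrapeParams` (and by `scrape_url` further down), assuming these models are importable from the package root, which this excerpt does not confirm; the selector, text, and timing values are placeholders.

```python
# Sketch only: import path and values are assumptions, not taken from this diff.
from firecrawl import WaitAction, ClickAction, WriteAction, PressAction

actions = [
    WaitAction(type="wait", milliseconds=2000),
    ClickAction(type="click", selector="#load-more"),
    WriteAction(type="write", text="firecrawl"),
    PressAction(type="press", key="Enter"),
]
# Later passed as FirecrawlApp.scrape_url(..., actions=actions)
```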
234
+ class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
235
+ """Response from scraping operations."""
236
+ success: bool = True
237
+ warning: Optional[str] = None
238
+ error: Optional[str] = None
239
+
240
+ class BatchScrapeResponse(pydantic.BaseModel):
241
+ """Response from batch scrape operations."""
242
+ id: Optional[str] = None
243
+ url: Optional[str] = None
244
+ success: bool = True
245
+ error: Optional[str] = None
246
+ invalidURLs: Optional[List[str]] = None
247
+
248
+ class BatchScrapeStatusResponse(pydantic.BaseModel):
249
+ """Response from batch scrape status checks."""
250
+ success: bool = True
251
+ status: Literal["scraping", "completed", "failed", "cancelled"]
252
+ completed: int
253
+ total: int
254
+ creditsUsed: int
255
+ expiresAt: datetime
256
+ next: Optional[str] = None
257
+ data: List[FirecrawlDocument]
258
+
259
+ class CrawlParams(pydantic.BaseModel):
260
+ """Parameters for crawling operations."""
261
+ includePaths: Optional[List[str]] = None
262
+ excludePaths: Optional[List[str]] = None
263
+ maxDepth: Optional[int] = None
264
+ maxDiscoveryDepth: Optional[int] = None
265
+ limit: Optional[int] = None
266
+ allowBackwardLinks: Optional[bool] = None
267
+ allowExternalLinks: Optional[bool] = None
268
+ ignoreSitemap: Optional[bool] = None
269
+ scrapeOptions: Optional[ScrapeOptions] = None
270
+ webhook: Optional[Union[str, WebhookConfig]] = None
271
+ deduplicateSimilarURLs: Optional[bool] = None
272
+ ignoreQueryParameters: Optional[bool] = None
273
+ regexOnFullURL: Optional[bool] = None
274
+ delay: Optional[int] = None # Delay in seconds between scrapes
275
+ maxConcurrency: Optional[int] = None
276
+ allowSubdomains: Optional[bool] = None
277
+
278
+ class CrawlResponse(pydantic.BaseModel):
279
+ """Response from crawling operations."""
280
+ id: Optional[str] = None
281
+ url: Optional[str] = None
282
+ success: bool = True
283
+ error: Optional[str] = None
284
+
285
+ class CrawlStatusResponse(pydantic.BaseModel):
286
+ """Response from crawl status checks."""
287
+ success: bool = True
288
+ status: Literal["scraping", "completed", "failed", "cancelled"]
289
+ completed: int
290
+ total: int
291
+ creditsUsed: int
292
+ expiresAt: datetime
293
+ next: Optional[str] = None
294
+ data: List[FirecrawlDocument]
295
+
296
+ class CrawlErrorsResponse(pydantic.BaseModel):
297
+ """Response from crawl/batch scrape error monitoring."""
298
+ errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str}
299
+ robotsBlocked: List[str]
300
+
301
+ class MapParams(pydantic.BaseModel):
302
+ """Parameters for mapping operations."""
303
+ search: Optional[str] = None
304
+ ignoreSitemap: Optional[bool] = None
305
+ includeSubdomains: Optional[bool] = None
306
+ sitemapOnly: Optional[bool] = None
307
+ limit: Optional[int] = None
308
+ timeout: Optional[int] = None
309
+ useIndex: Optional[bool] = None
310
+
311
+ class MapResponse(pydantic.BaseModel):
312
+ """Response from mapping operations."""
313
+ success: bool = True
314
+ links: Optional[List[str]] = None
315
+ error: Optional[str] = None
316
+
317
+ class ExtractParams(pydantic.BaseModel):
318
+ """Parameters for extracting information from URLs."""
319
+ prompt: Optional[str] = None
320
+ schema: Optional[Any] = None
321
+ systemPrompt: Optional[str] = None
322
+ allowExternalLinks: Optional[bool] = None
323
+ enableWebSearch: Optional[bool] = None
324
+ includeSubdomains: Optional[bool] = None
325
+ origin: Optional[str] = None
326
+ showSources: Optional[bool] = None
327
+ scrapeOptions: Optional[ScrapeOptions] = None
328
+
329
+ class ExtractResponse(pydantic.BaseModel, Generic[T]):
330
+ """Response from extract operations."""
331
+ id: Optional[str] = None
332
+ status: Optional[Literal["processing", "completed", "failed"]] = None
333
+ expiresAt: Optional[datetime] = None
334
+ success: bool = True
335
+ data: Optional[T] = None
336
+ error: Optional[str] = None
337
+ warning: Optional[str] = None
338
+ sources: Optional[Dict[Any, Any]] = None
339
+
340
+ class SearchParams(pydantic.BaseModel):
341
+ query: str
342
+ limit: Optional[int] = 5
343
+ tbs: Optional[str] = None
344
+ filter: Optional[str] = None
345
+ lang: Optional[str] = "en"
346
+ country: Optional[str] = "us"
347
+ location: Optional[str] = None
348
+ origin: Optional[str] = "api"
349
+ timeout: Optional[int] = 60000
350
+ scrapeOptions: Optional[ScrapeOptions] = None
351
+
352
+ class SearchResponse(pydantic.BaseModel):
353
+ """Response from search operations."""
354
+ success: bool = True
355
+ data: List[FirecrawlDocument]
356
+ warning: Optional[str] = None
357
+ error: Optional[str] = None
358
+
359
+ class GenerateLLMsTextParams(pydantic.BaseModel):
360
+ """
361
+ Parameters for the LLMs.txt generation operation.
362
+ """
363
+ maxUrls: Optional[int] = 10
364
+ showFullText: Optional[bool] = False
365
+ cache: Optional[bool] = True
366
+ __experimental_stream: Optional[bool] = None
367
+
368
+ class DeepResearchParams(pydantic.BaseModel):
369
+ """
370
+ Parameters for the deep research operation.
371
+ """
372
+ maxDepth: Optional[int] = 7
373
+ timeLimit: Optional[int] = 270
374
+ maxUrls: Optional[int] = 20
375
+ analysisPrompt: Optional[str] = None
376
+ systemPrompt: Optional[str] = None
377
+ __experimental_streamSteps: Optional[bool] = None
378
+
379
+ class DeepResearchResponse(pydantic.BaseModel):
380
+ """
381
+ Response from the deep research operation.
382
+ """
383
+ success: bool
384
+ id: str
385
+ error: Optional[str] = None
386
+
387
+ class DeepResearchStatusResponse(pydantic.BaseModel):
388
+ """
389
+ Status response from the deep research operation.
390
+ """
391
+ success: bool
392
+ data: Optional[Dict[str, Any]] = None
393
+ status: str
394
+ error: Optional[str] = None
395
+ expiresAt: str
396
+ currentDepth: int
397
+ maxDepth: int
398
+ activities: List[Dict[str, Any]]
399
+ sources: List[Dict[str, Any]]
400
+ summaries: List[str]
401
+
402
+ class GenerateLLMsTextResponse(pydantic.BaseModel):
403
+ """Response from LLMs.txt generation operations."""
404
+ success: bool = True
405
+ id: str
406
+ error: Optional[str] = None
407
+
408
+ class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
409
+ llmstxt: str
410
+ llmsfulltxt: Optional[str] = None
411
+
412
+ class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
413
+ """Status response from LLMs.txt generation operations."""
414
+ success: bool = True
415
+ data: Optional[GenerateLLMsTextStatusResponseData] = None
416
+ status: Literal["processing", "completed", "failed"]
417
+ error: Optional[str] = None
418
+ expiresAt: str
419
+
420
+ class SearchResponse(pydantic.BaseModel):
421
+ """
422
+ Response from the search operation.
423
+ """
424
+ success: bool
425
+ data: List[Dict[str, Any]]
426
+ warning: Optional[str] = None
427
+ error: Optional[str] = None
428
+
429
+ class ExtractParams(pydantic.BaseModel):
430
+ """
431
+ Parameters for the extract operation.
432
+ """
433
+ prompt: Optional[str] = None
434
+ schema: Optional[Any] = pydantic.Field(None, alias='schema')
435
+ system_prompt: Optional[str] = None
436
+ allow_external_links: Optional[bool] = False
437
+ enable_web_search: Optional[bool] = False
438
+ # Just for backwards compatibility
439
+ enableWebSearch: Optional[bool] = False
440
+ show_sources: Optional[bool] = False
441
+ agent: Optional[Dict[str, Any]] = None
442
+
443
+ class FirecrawlApp:
444
+ def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
445
+ """
446
+ Initialize the FirecrawlApp instance with API key, API URL.
447
+
448
+ Args:
449
+ api_key (Optional[str]): API key for authenticating with the Firecrawl API.
450
+ api_url (Optional[str]): Base URL for the Firecrawl API.
451
+ """
452
+ self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
453
+ self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
454
+
455
+ # Only require API key when using cloud service
456
+ if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
457
+ logger.warning("No API key provided for cloud service")
458
+ raise ValueError('No API key provided')
459
+
460
+ logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
461
+
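A minimal initialization sketch based on the constructor above, assuming the class is exposed as `from firecrawl import FirecrawlApp`; the API key is a placeholder. When the arguments are omitted, the `FIRECRAWL_API_KEY` and `FIRECRAWL_API_URL` environment variables are used.

```python
from firecrawl import FirecrawlApp

# Cloud usage: an API key is required.
app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

# Self-hosted usage: point at your own instance; a key may not be required.
# app = FirecrawlApp(api_url="http://localhost:3002")
```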
462
+ def scrape_url(
463
+ self,
464
+ url: str,
465
+ *,
466
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
467
+ headers: Optional[Dict[str, str]] = None,
468
+ include_tags: Optional[List[str]] = None,
469
+ exclude_tags: Optional[List[str]] = None,
470
+ only_main_content: Optional[bool] = None,
471
+ wait_for: Optional[int] = None,
472
+ timeout: Optional[int] = None,
473
+ location: Optional[LocationConfig] = None,
474
+ mobile: Optional[bool] = None,
475
+ skip_tls_verification: Optional[bool] = None,
476
+ remove_base64_images: Optional[bool] = None,
477
+ block_ads: Optional[bool] = None,
478
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
479
+ parse_pdf: Optional[bool] = None,
480
+ extract: Optional[JsonConfig] = None,
481
+ json_options: Optional[JsonConfig] = None,
482
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
483
+ change_tracking_options: Optional[ChangeTrackingOptions] = None,
484
+ max_age: Optional[int] = None,
485
+ store_in_cache: Optional[bool] = None,
486
+ zero_data_retention: Optional[bool] = None,
487
+ **kwargs) -> ScrapeResponse[Any]:
488
+ """
489
+ Scrape and extract content from a URL.
490
+
491
+ Args:
492
+ url (str): Target URL to scrape
493
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
494
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
495
+ include_tags (Optional[List[str]]): HTML tags to include
496
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
497
+ only_main_content (Optional[bool]): Extract main content only
498
+ wait_for (Optional[int]): Wait for a specific element to appear
499
+ timeout (Optional[int]): Request timeout (ms)
500
+ location (Optional[LocationConfig]): Location configuration
501
+ mobile (Optional[bool]): Use mobile user agent
502
+ skip_tls_verification (Optional[bool]): Skip TLS verification
503
+ remove_base64_images (Optional[bool]): Remove base64 images
504
+ block_ads (Optional[bool]): Block ads
505
+ proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
506
+ extract (Optional[JsonConfig]): Content extraction settings
507
+ json_options (Optional[JsonConfig]): JSON extraction settings
508
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
509
+ change_tracking_options (Optional[ChangeTrackingOptions]): Change tracking settings
510
+ zero_data_retention (Optional[bool]): Whether to delete data after scrape is done
511
+
512
+
513
+ Returns:
514
+ ScrapeResponse with:
515
+ * Requested content formats
516
+ * Page metadata
517
+ * Extraction results
518
+ * Success/error status
519
+
520
+ Raises:
521
+ Exception: If scraping fails
522
+ """
523
+ _headers = self._prepare_headers()
524
+
525
+ # Build scrape parameters
526
+ scrape_params = {
527
+ 'url': url,
528
+ 'origin': f"python-sdk@{version}"
529
+ }
530
+
531
+ # Add optional parameters if provided
532
+ if formats:
533
+ scrape_params['formats'] = formats
534
+ if headers:
535
+ scrape_params['headers'] = headers
536
+ if include_tags:
537
+ scrape_params['includeTags'] = include_tags
538
+ if exclude_tags:
539
+ scrape_params['excludeTags'] = exclude_tags
540
+ if only_main_content is not None:
541
+ scrape_params['onlyMainContent'] = only_main_content
542
+ if wait_for:
543
+ scrape_params['waitFor'] = wait_for
544
+ if timeout:
545
+ scrape_params['timeout'] = timeout
546
+ if location:
547
+ scrape_params['location'] = location.dict(exclude_none=True)
548
+ if mobile is not None:
549
+ scrape_params['mobile'] = mobile
550
+ if skip_tls_verification is not None:
551
+ scrape_params['skipTlsVerification'] = skip_tls_verification
552
+ if remove_base64_images is not None:
553
+ scrape_params['removeBase64Images'] = remove_base64_images
554
+ if block_ads is not None:
555
+ scrape_params['blockAds'] = block_ads
556
+ if proxy:
557
+ scrape_params['proxy'] = proxy
558
+ if parse_pdf is not None:
559
+ scrape_params['parsePDF'] = parse_pdf
560
+ if extract is not None:
561
+ extract = self._ensure_schema_dict(extract)
562
+ if isinstance(extract, dict) and "schema" in extract:
563
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
564
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
565
+ if json_options is not None:
566
+ json_options = self._ensure_schema_dict(json_options)
567
+ if isinstance(json_options, dict) and "schema" in json_options:
568
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
569
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
570
+ if actions:
571
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
572
+ if change_tracking_options:
573
+ scrape_params['changeTrackingOptions'] = change_tracking_options if isinstance(change_tracking_options, dict) else change_tracking_options.dict(exclude_none=True)
574
+ if max_age is not None:
575
+ scrape_params['maxAge'] = max_age
576
+ if store_in_cache is not None:
577
+ scrape_params['storeInCache'] = store_in_cache
578
+ if zero_data_retention is not None:
579
+ scrape_params['zeroDataRetention'] = zero_data_retention
580
+
581
+ scrape_params.update(kwargs)
582
+
583
+ if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
584
+ scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
585
+ if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
586
+ scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
587
+
588
+ # Make request
589
+ response = requests.post(
590
+ f'{self.api_url}/v1/scrape',
591
+ headers=_headers,
592
+ json=scrape_params,
593
+ timeout=(timeout + 5000 if timeout else None)
594
+ )
595
+
596
+ if response.status_code == 200:
597
+ try:
598
+ response_json = response.json()
599
+ if response_json.get('success') and 'data' in response_json:
600
+ return ScrapeResponse(**response_json['data'])
601
+ elif "error" in response_json:
602
+ raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
603
+ else:
604
+ raise Exception(f'Failed to scrape URL. Error: {response_json}')
605
+ except ValueError:
606
+ raise Exception('Failed to parse Firecrawl response as JSON.')
607
+ else:
608
+ self._handle_error(response, 'scrape URL')
609
+
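A usage sketch for `scrape_url` as defined above; the URL, key, and option values are placeholders.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
doc = app.scrape_url(
    "https://example.com",
    formats=["markdown", "links"],
    only_main_content=True,
    timeout=30000,  # milliseconds
)
print(doc.markdown)
print(doc.metadata)
```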
610
+ def search(
611
+ self,
612
+ query: str,
613
+ *,
614
+ limit: Optional[int] = None,
615
+ tbs: Optional[str] = None,
616
+ filter: Optional[str] = None,
617
+ lang: Optional[str] = None,
618
+ country: Optional[str] = None,
619
+ location: Optional[str] = None,
620
+ timeout: Optional[int] = None,
621
+ scrape_options: Optional[ScrapeOptions] = None,
622
+ **kwargs) -> SearchResponse:
623
+ """
624
+ Search for content using Firecrawl.
625
+
626
+ Args:
627
+ query (str): Search query string
628
+ limit (Optional[int]): Max results (default: 5)
629
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
630
+ filter (Optional[str]): Custom result filter
631
+ lang (Optional[str]): Language code (default: "en")
632
+ country (Optional[str]): Country code (default: "us")
633
+ location (Optional[str]): Geo-targeting
634
+ timeout (Optional[int]): Request timeout in milliseconds
635
+ scrape_options (Optional[ScrapeOptions]): Result scraping configuration
636
+ **kwargs: Additional keyword arguments for future compatibility
637
+
638
+ Returns:
639
+ SearchResponse: Response containing:
640
+ * success (bool): Whether request succeeded
641
+ * data (List[FirecrawlDocument]): Search results
642
+ * warning (Optional[str]): Warning message if any
643
+ * error (Optional[str]): Error message if any
644
+
645
+ Raises:
646
+ Exception: If search fails or response cannot be parsed
647
+ """
648
+ # Validate any additional kwargs
649
+ self._validate_kwargs(kwargs, "search")
650
+
651
+ # Build search parameters
652
+ search_params = {}
653
+
654
+ # Add individual parameters
655
+ if limit is not None:
656
+ search_params['limit'] = limit
657
+ if tbs is not None:
658
+ search_params['tbs'] = tbs
659
+ if filter is not None:
660
+ search_params['filter'] = filter
661
+ if lang is not None:
662
+ search_params['lang'] = lang
663
+ if country is not None:
664
+ search_params['country'] = country
665
+ if location is not None:
666
+ search_params['location'] = location
667
+ if timeout is not None:
668
+ search_params['timeout'] = timeout
669
+ if scrape_options is not None:
670
+ search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
671
+
672
+ # Add any additional kwargs
673
+ search_params.update(kwargs)
674
+ _integration = search_params.get('integration')
675
+
676
+ # Create final params object
677
+ final_params = SearchParams(query=query, **search_params)
678
+ params_dict = final_params.dict(exclude_none=True)
679
+ params_dict['origin'] = f"python-sdk@{version}"
680
+
681
+ if _integration:
682
+ params_dict['integration'] = _integration
683
+
684
+ # Make request
685
+ response = requests.post(
686
+ f"{self.api_url}/v1/search",
687
+ headers={"Authorization": f"Bearer {self.api_key}"},
688
+ json=params_dict
689
+ )
690
+
691
+ if response.status_code == 200:
692
+ try:
693
+ response_json = response.json()
694
+ if response_json.get('success') and 'data' in response_json:
695
+ return SearchResponse(**response_json)
696
+ elif "error" in response_json:
697
+ raise Exception(f'Search failed. Error: {response_json["error"]}')
698
+ else:
699
+ raise Exception(f'Search failed. Error: {response_json}')
700
+ except ValueError:
701
+ raise Exception('Failed to parse Firecrawl response as JSON.')
702
+ else:
703
+ self._handle_error(response, 'search')
704
+
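A usage sketch for `search`; the query and key are placeholders. Note that result entries follow the `SearchResponse` model defined earlier in this file.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
results = app.search("firecrawl web scraping", limit=3, lang="en", country="us")
for item in results.data:
    print(item)
```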
705
+ def crawl_url(
706
+ self,
707
+ url: str,
708
+ *,
709
+ include_paths: Optional[List[str]] = None,
710
+ exclude_paths: Optional[List[str]] = None,
711
+ max_depth: Optional[int] = None,
712
+ max_discovery_depth: Optional[int] = None,
713
+ limit: Optional[int] = None,
714
+ allow_backward_links: Optional[bool] = None,
715
+ crawl_entire_domain: Optional[bool] = None,
716
+ allow_external_links: Optional[bool] = None,
717
+ ignore_sitemap: Optional[bool] = None,
718
+ scrape_options: Optional[ScrapeOptions] = None,
719
+ webhook: Optional[Union[str, WebhookConfig]] = None,
720
+ deduplicate_similar_urls: Optional[bool] = None,
721
+ ignore_query_parameters: Optional[bool] = None,
722
+ regex_on_full_url: Optional[bool] = None,
723
+ delay: Optional[int] = None,
724
+ allow_subdomains: Optional[bool] = None,
725
+ max_concurrency: Optional[int] = None,
726
+ zero_data_retention: Optional[bool] = None,
727
+ poll_interval: Optional[int] = 2,
728
+ idempotency_key: Optional[str] = None,
729
+ **kwargs
730
+ ) -> CrawlStatusResponse:
731
+ """
732
+ Crawl a website starting from a URL.
733
+
734
+ Args:
735
+ url (str): Target URL to start crawling from
736
+ include_paths (Optional[List[str]]): Patterns of URLs to include
737
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
738
+ max_depth (Optional[int]): Maximum crawl depth
739
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
740
+ limit (Optional[int]): Maximum pages to crawl
741
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
742
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
743
+ allow_external_links (Optional[bool]): Follow external domain links
744
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
745
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
746
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
747
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
748
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
749
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
750
+ delay (Optional[int]): Delay in seconds between scrapes
751
+ allow_subdomains (Optional[bool]): Follow subdomains
752
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
753
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
754
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
755
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
756
+ **kwargs: Additional parameters to pass to the API
757
+
758
+ Returns:
759
+ CrawlStatusResponse with:
760
+ * Crawling status and progress
761
+ * Crawled page contents
762
+ * Success/error information
763
+
764
+ Raises:
765
+ Exception: If crawl fails
766
+ """
767
+ # Validate any additional kwargs
768
+ self._validate_kwargs(kwargs, "crawl_url")
769
+
770
+ crawl_params = {}
771
+
772
+ # Add individual parameters
773
+ if include_paths is not None:
774
+ crawl_params['includePaths'] = include_paths
775
+ if exclude_paths is not None:
776
+ crawl_params['excludePaths'] = exclude_paths
777
+ if max_depth is not None:
778
+ crawl_params['maxDepth'] = max_depth
779
+ if max_discovery_depth is not None:
780
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
781
+ if limit is not None:
782
+ crawl_params['limit'] = limit
783
+ if crawl_entire_domain is not None:
784
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
785
+ elif allow_backward_links is not None:
786
+ crawl_params['allowBackwardLinks'] = allow_backward_links
787
+ if allow_external_links is not None:
788
+ crawl_params['allowExternalLinks'] = allow_external_links
789
+ if ignore_sitemap is not None:
790
+ crawl_params['ignoreSitemap'] = ignore_sitemap
791
+ if scrape_options is not None:
792
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
793
+ if webhook is not None:
794
+ crawl_params['webhook'] = webhook
795
+ if deduplicate_similar_urls is not None:
796
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
797
+ if ignore_query_parameters is not None:
798
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
799
+ if regex_on_full_url is not None:
800
+ crawl_params['regexOnFullURL'] = regex_on_full_url
801
+ if delay is not None:
802
+ crawl_params['delay'] = delay
803
+ if allow_subdomains is not None:
804
+ crawl_params['allowSubdomains'] = allow_subdomains
805
+ if max_concurrency is not None:
806
+ crawl_params['maxConcurrency'] = max_concurrency
807
+ if zero_data_retention is not None:
808
+ crawl_params['zeroDataRetention'] = zero_data_retention
809
+ # Add any additional kwargs
810
+ crawl_params.update(kwargs)
811
+ _integration = crawl_params.get('integration')
812
+
813
+ # Create final params object
814
+ final_params = CrawlParams(**crawl_params)
815
+ params_dict = final_params.dict(exclude_none=True)
816
+ params_dict['url'] = url
817
+ params_dict['origin'] = f"python-sdk@{version}"
818
+
819
+ if _integration:
820
+ params_dict['integration'] = _integration
821
+
822
+ # Make request
823
+ headers = self._prepare_headers(idempotency_key)
824
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
825
+
826
+ if response.status_code == 200:
827
+ try:
828
+ id = response.json().get('id')
829
+ except:
830
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
831
+ return self._monitor_job_status(id, headers, poll_interval)
832
+ else:
833
+ self._handle_error(response, 'start crawl job')
834
+
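A usage sketch for the blocking `crawl_url`, which starts a crawl and polls until completion; the URL, key, and limits are placeholders.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
status = app.crawl_url(
    "https://example.com",
    limit=10,
    max_depth=2,
    poll_interval=5,  # seconds between status checks
)
print(status.status, f"{status.completed}/{status.total} pages")
for page in status.data:
    print(page.metadata)
```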
835
+ def async_crawl_url(
836
+ self,
837
+ url: str,
838
+ *,
839
+ include_paths: Optional[List[str]] = None,
840
+ exclude_paths: Optional[List[str]] = None,
841
+ max_depth: Optional[int] = None,
842
+ max_discovery_depth: Optional[int] = None,
843
+ limit: Optional[int] = None,
844
+ allow_backward_links: Optional[bool] = None,
845
+ crawl_entire_domain: Optional[bool] = None,
846
+ allow_external_links: Optional[bool] = None,
847
+ ignore_sitemap: Optional[bool] = None,
848
+ scrape_options: Optional[ScrapeOptions] = None,
849
+ webhook: Optional[Union[str, WebhookConfig]] = None,
850
+ deduplicate_similar_urls: Optional[bool] = None,
851
+ ignore_query_parameters: Optional[bool] = None,
852
+ regex_on_full_url: Optional[bool] = None,
853
+ delay: Optional[int] = None,
854
+ allow_subdomains: Optional[bool] = None,
855
+ max_concurrency: Optional[int] = None,
856
+ zero_data_retention: Optional[bool] = None,
857
+ idempotency_key: Optional[str] = None,
858
+ **kwargs
859
+ ) -> CrawlResponse:
860
+ """
861
+ Start an asynchronous crawl job.
862
+
863
+ Args:
864
+ url (str): Target URL to start crawling from
865
+ include_paths (Optional[List[str]]): Patterns of URLs to include
866
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
867
+ max_depth (Optional[int]): Maximum crawl depth
868
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
869
+ limit (Optional[int]): Maximum pages to crawl
870
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
871
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
872
+ allow_external_links (Optional[bool]): Follow external domain links
873
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
874
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
875
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
876
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
877
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
878
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
879
+ delay (Optional[int]): Delay in seconds between scrapes
880
+ allow_subdomains (Optional[bool]): Follow subdomains
881
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
882
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
883
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
884
+ **kwargs: Additional parameters to pass to the API
885
+
886
+ Returns:
887
+ CrawlResponse with:
888
+ * success - Whether crawl started successfully
889
+ * id - Unique identifier for the crawl job
890
+ * url - Status check URL for the crawl
891
+ * error - Error message if start failed
892
+
893
+ Raises:
894
+ Exception: If crawl initiation fails
895
+ """
896
+ # Validate any additional kwargs
897
+ self._validate_kwargs(kwargs, "async_crawl_url")
898
+
899
+ crawl_params = {}
900
+
901
+ # Add individual parameters
902
+ if include_paths is not None:
903
+ crawl_params['includePaths'] = include_paths
904
+ if exclude_paths is not None:
905
+ crawl_params['excludePaths'] = exclude_paths
906
+ if max_depth is not None:
907
+ crawl_params['maxDepth'] = max_depth
908
+ if max_discovery_depth is not None:
909
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
910
+ if limit is not None:
911
+ crawl_params['limit'] = limit
912
+ if crawl_entire_domain is not None:
913
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
914
+ elif allow_backward_links is not None:
915
+ crawl_params['allowBackwardLinks'] = allow_backward_links
916
+ if allow_external_links is not None:
917
+ crawl_params['allowExternalLinks'] = allow_external_links
918
+ if ignore_sitemap is not None:
919
+ crawl_params['ignoreSitemap'] = ignore_sitemap
920
+ if scrape_options is not None:
921
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
922
+ if webhook is not None:
923
+ crawl_params['webhook'] = webhook
924
+ if deduplicate_similar_urls is not None:
925
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
926
+ if ignore_query_parameters is not None:
927
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
928
+ if regex_on_full_url is not None:
929
+ crawl_params['regexOnFullURL'] = regex_on_full_url
930
+ if delay is not None:
931
+ crawl_params['delay'] = delay
932
+ if allow_subdomains is not None:
933
+ crawl_params['allowSubdomains'] = allow_subdomains
934
+ if max_concurrency is not None:
935
+ crawl_params['maxConcurrency'] = max_concurrency
936
+ if zero_data_retention is not None:
937
+ crawl_params['zeroDataRetention'] = zero_data_retention
938
+ # Add any additional kwargs
939
+ crawl_params.update(kwargs)
940
+
941
+ # Create final params object
942
+ final_params = CrawlParams(**crawl_params)
943
+ params_dict = final_params.dict(exclude_none=True)
944
+ params_dict['url'] = url
945
+ params_dict['origin'] = f"python-sdk@{version}"
946
+
947
+ # Make request
948
+ headers = self._prepare_headers(idempotency_key)
949
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
950
+
951
+ if response.status_code == 200:
952
+ try:
953
+ return CrawlResponse(**response.json())
954
+ except:
955
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
956
+ else:
957
+ self._handle_error(response, 'start crawl job')
958
+
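A usage sketch for `async_crawl_url`, which only starts the job and returns its id; the values are placeholders.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
job = app.async_crawl_url("https://example.com", limit=25, idempotency_key="my-crawl-1")
print(job.id, job.url)  # job id and status-check URL
# Later: app.check_crawl_status(job.id)
```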
959
+ def check_crawl_status(self, id: str) -> CrawlStatusResponse:
960
+ """
961
+ Check the status and results of a crawl job.
962
+
963
+ Args:
964
+ id: Unique identifier for the crawl job
965
+
966
+ Returns:
967
+ CrawlStatusResponse containing:
968
+
969
+ Status Information:
970
+ * status - Current state (scraping/completed/failed/cancelled)
971
+ * completed - Number of pages crawled
972
+ * total - Total pages to crawl
973
+ * creditsUsed - API credits consumed
974
+ * expiresAt - Data expiration timestamp
975
+
976
+ Results:
977
+ * data - List of crawled documents
978
+ * next - URL for next page of results (if paginated)
979
+ * success - Whether status check succeeded
980
+ * error - Error message if failed
981
+
982
+ Raises:
983
+ Exception: If status check fails
984
+ """
985
+ endpoint = f'/v1/crawl/{id}'
986
+
987
+ headers = self._prepare_headers()
988
+ response = self._get_request(f'{self.api_url}{endpoint}', headers)
989
+ if response.status_code == 200:
990
+ try:
991
+ status_data = response.json()
992
+ except:
993
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
994
+ if status_data['status'] == 'completed':
995
+ if 'data' in status_data:
996
+ data = status_data['data']
997
+ while 'next' in status_data:
998
+ if len(status_data['data']) == 0:
999
+ break
1000
+ next_url = status_data.get('next')
1001
+ if not next_url:
1002
+ logger.warning("Expected 'next' URL is missing.")
1003
+ break
1004
+ try:
1005
+ status_response = self._get_request(next_url, headers)
1006
+ if status_response.status_code != 200:
1007
+ logger.error(f"Failed to fetch next page: {status_response.status_code}")
1008
+ break
1009
+ try:
1010
+ next_data = status_response.json()
1011
+ except:
1012
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1013
+ data.extend(next_data.get('data', []))
1014
+ status_data = next_data
1015
+ except Exception as e:
1016
+ logger.error(f"Error during pagination request: {e}")
1017
+ break
1018
+ status_data['data'] = data
1019
+
1020
+ response = {
1021
+ 'status': status_data.get('status'),
1022
+ 'total': status_data.get('total'),
1023
+ 'completed': status_data.get('completed'),
1024
+ 'creditsUsed': status_data.get('creditsUsed'),
1025
+ 'expiresAt': status_data.get('expiresAt'),
1026
+ 'data': status_data.get('data')
1027
+ }
1028
+
1029
+ if 'error' in status_data:
1030
+ response['error'] = status_data['error']
1031
+
1032
+ if 'next' in status_data:
1033
+ response['next'] = status_data['next']
1034
+
1035
+ return CrawlStatusResponse(
1036
+ success=False if 'error' in status_data else True,
1037
+ **response
1038
+ )
1039
+ else:
1040
+ self._handle_error(response, 'check crawl status')
1041
+
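A usage sketch for `check_crawl_status`; the job id is a placeholder.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
status = app.check_crawl_status("crawl-job-id")  # placeholder id
if status.status == "completed":
    print(f"{status.completed}/{status.total} pages, credits used: {status.creditsUsed}")
```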
1042
+ def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
1043
+ """
1044
+ Returns information about crawl errors.
1045
+
1046
+ Args:
1047
+ id (str): The ID of the crawl job
1048
+
1049
+ Returns:
1050
+ CrawlErrorsResponse containing:
1051
+ * errors (List[Dict[str, str]]): List of errors with fields:
1052
+ - id (str): Error ID
1053
+ - timestamp (str): When the error occurred
1054
+ - url (str): URL that caused the error
1055
+ - error (str): Error message
1056
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1057
+
1058
+ Raises:
1059
+ Exception: If error check fails
1060
+ """
1061
+ headers = self._prepare_headers()
1062
+ response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
1063
+ if response.status_code == 200:
1064
+ try:
1065
+ return CrawlErrorsResponse(**response.json())
1066
+ except:
1067
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1068
+ else:
1069
+ self._handle_error(response, "check crawl errors")
1070
+
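A usage sketch for `check_crawl_errors`, based on the error fields documented above; the job id is a placeholder.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
report = app.check_crawl_errors("crawl-job-id")  # placeholder id
for err in report.errors:
    print(err["url"], err["error"])
print("Blocked by robots.txt:", report.robotsBlocked)
```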
1071
+ def cancel_crawl(self, id: str) -> Dict[str, Any]:
1072
+ """
1073
+ Cancel an asynchronous crawl job.
1074
+
1075
+ Args:
1076
+ id (str): The ID of the crawl job to cancel
1077
+
1078
+ Returns:
1079
+ Dict[str, Any] containing:
1080
+ * success (bool): Whether cancellation was successful
1081
+ * error (str, optional): Error message if cancellation failed
1082
+
1083
+ Raises:
1084
+ Exception: If cancellation fails
1085
+ """
1086
+ headers = self._prepare_headers()
1087
+ response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
1088
+ if response.status_code == 200:
1089
+ try:
1090
+ return response.json()
1091
+ except:
1092
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1093
+ else:
1094
+ self._handle_error(response, "cancel crawl job")
1095
+
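A usage sketch for `cancel_crawl`; the job id is a placeholder.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
result = app.cancel_crawl("crawl-job-id")  # placeholder id
print(result)  # plain dict with a success flag, per the docstring above
```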
1096
+ def crawl_url_and_watch(
1097
+ self,
1098
+ url: str,
1099
+ *,
1100
+ include_paths: Optional[List[str]] = None,
1101
+ exclude_paths: Optional[List[str]] = None,
1102
+ max_depth: Optional[int] = None,
1103
+ max_discovery_depth: Optional[int] = None,
1104
+ limit: Optional[int] = None,
1105
+ allow_backward_links: Optional[bool] = None,
1106
+ crawl_entire_domain: Optional[bool] = None,
1107
+ allow_external_links: Optional[bool] = None,
1108
+ ignore_sitemap: Optional[bool] = None,
1109
+ scrape_options: Optional[ScrapeOptions] = None,
1110
+ webhook: Optional[Union[str, WebhookConfig]] = None,
1111
+ deduplicate_similar_urls: Optional[bool] = None,
1112
+ ignore_query_parameters: Optional[bool] = None,
1113
+ regex_on_full_url: Optional[bool] = None,
1114
+ delay: Optional[int] = None,
1115
+ allow_subdomains: Optional[bool] = None,
1116
+ max_concurrency: Optional[int] = None,
1117
+ zero_data_retention: Optional[bool] = None,
1118
+ idempotency_key: Optional[str] = None,
1119
+ **kwargs
1120
+ ) -> 'CrawlWatcher':
1121
+ """
1122
+ Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
1123
+
1124
+ Args:
1125
+ url (str): Target URL to start crawling from
1126
+ include_paths (Optional[List[str]]): Patterns of URLs to include
1127
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
1128
+ max_depth (Optional[int]): Maximum crawl depth
1129
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
1130
+ limit (Optional[int]): Maximum pages to crawl
1131
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
1132
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
1133
+ allow_external_links (Optional[bool]): Follow external domain links
1134
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1135
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
1136
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
1137
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
1138
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
1139
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
1140
+ delay (Optional[int]): Delay in seconds between scrapes
1141
+ allow_subdomains (Optional[bool]): Follow subdomains
1142
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1143
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
1144
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1145
+ **kwargs: Additional parameters to pass to the API
1146
+
1147
+ Returns:
1148
+ CrawlWatcher: An instance to monitor the crawl job via WebSocket
1149
+
1150
+ Raises:
1151
+ Exception: If crawl job fails to start
1152
+ """
1153
+ crawl_response = self.async_crawl_url(
1154
+ url,
1155
+ include_paths=include_paths,
1156
+ exclude_paths=exclude_paths,
1157
+ max_depth=max_depth,
1158
+ max_discovery_depth=max_discovery_depth,
1159
+ limit=limit,
1160
+ allow_backward_links=allow_backward_links,
1161
+ allow_external_links=allow_external_links,
1162
+ ignore_sitemap=ignore_sitemap,
1163
+ scrape_options=scrape_options,
1164
+ webhook=webhook,
1165
+ deduplicate_similar_urls=deduplicate_similar_urls,
1166
+ ignore_query_parameters=ignore_query_parameters,
1167
+ regex_on_full_url=regex_on_full_url,
1168
+ delay=delay,
1169
+ allow_subdomains=allow_subdomains,
1170
+ max_concurrency=max_concurrency,
1171
+ zero_data_retention=zero_data_retention,
1172
+ idempotency_key=idempotency_key,
1173
+ **kwargs
1174
+ )
1175
+ if crawl_response.success and crawl_response.id:
1176
+ return CrawlWatcher(crawl_response.id, self)
1177
+ else:
1178
+ raise Exception("Crawl job failed to start")
1179
+
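A minimal sketch for `crawl_url_and_watch`; the URL and key are placeholders. The returned `CrawlWatcher` class is defined outside this excerpt, so only obtaining the watcher is shown here.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
watcher = app.crawl_url_and_watch("https://example.com", limit=5)
# The watcher monitors the crawl over WebSocket; its event API is not part of
# this excerpt and is therefore not sketched here.
```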
1180
+ def map_url(
1181
+ self,
1182
+ url: str,
1183
+ *,
1184
+ search: Optional[str] = None,
1185
+ ignore_sitemap: Optional[bool] = None,
1186
+ include_subdomains: Optional[bool] = None,
1187
+ sitemap_only: Optional[bool] = None,
1188
+ limit: Optional[int] = None,
1189
+ timeout: Optional[int] = None,
1190
+ use_index: Optional[bool] = None,
1191
+ **kwargs) -> MapResponse:
1192
+ """
1193
+ Map and discover links from a URL.
1194
+
1195
+ Args:
1196
+ url (str): Target URL to map
1197
+ search (Optional[str]): Filter pattern for URLs
1198
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1199
+ include_subdomains (Optional[bool]): Include subdomain links
1200
+ sitemap_only (Optional[bool]): Only use sitemap.xml
1201
+ limit (Optional[int]): Maximum URLs to return
1202
+ timeout (Optional[int]): Request timeout in milliseconds
1203
+ **kwargs: Additional parameters to pass to the API
1204
+
1205
+ Returns:
1206
+ MapResponse: Response containing:
1207
+ * success (bool): Whether request succeeded
1208
+ * links (List[str]): Discovered URLs
1209
+ * error (Optional[str]): Error message if any
1210
+
1211
+ Raises:
1212
+ Exception: If mapping fails or response cannot be parsed
1213
+ """
1214
+ # Validate any additional kwargs
1215
+ self._validate_kwargs(kwargs, "map_url")
1216
+
1217
+ # Build map parameters
1218
+ map_params = {}
1219
+
1220
+ # Add individual parameters
1221
+ if search is not None:
1222
+ map_params['search'] = search
1223
+ if ignore_sitemap is not None:
1224
+ map_params['ignoreSitemap'] = ignore_sitemap
1225
+ if include_subdomains is not None:
1226
+ map_params['includeSubdomains'] = include_subdomains
1227
+ if sitemap_only is not None:
1228
+ map_params['sitemapOnly'] = sitemap_only
1229
+ if limit is not None:
1230
+ map_params['limit'] = limit
1231
+ if timeout is not None:
1232
+ map_params['timeout'] = timeout
1233
+ if use_index is not None:
1234
+ map_params['useIndex'] = use_index
1235
+
1236
+ # Add any additional kwargs
1237
+ map_params.update(kwargs)
1238
+ _integration = map_params.get('integration')
1239
+
1240
+ # Create final params object
1241
+ final_params = MapParams(**map_params)
1242
+ params_dict = final_params.dict(exclude_none=True)
1243
+ params_dict['url'] = url
1244
+ params_dict['origin'] = f"python-sdk@{version}"
1245
+
1246
+ if _integration:
1247
+ params_dict['integration'] = _integration
1248
+
1249
+ # Make request
1250
+ response = requests.post(
1251
+ f"{self.api_url}/v1/map",
1252
+ headers={"Authorization": f"Bearer {self.api_key}"},
1253
+ json=params_dict
1254
+ )
1255
+
1256
+ if response.status_code == 200:
1257
+ try:
1258
+ response_json = response.json()
1259
+ if response_json.get('success') and 'links' in response_json:
1260
+ return MapResponse(**response_json)
1261
+ elif "error" in response_json:
1262
+ raise Exception(f'Map failed. Error: {response_json["error"]}')
1263
+ else:
1264
+ raise Exception(f'Map failed. Error: {response_json}')
1265
+ except ValueError:
1266
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1267
+ else:
1268
+ self._handle_error(response, 'map')
1269
+
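A usage sketch for `map_url`; the URL, key, and filter are placeholders.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
mapped = app.map_url("https://example.com", search="blog", limit=100)
for link in mapped.links or []:
    print(link)
```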
1270
+ def batch_scrape_urls(
1271
+ self,
1272
+ urls: List[str],
1273
+ *,
1274
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1275
+ headers: Optional[Dict[str, str]] = None,
1276
+ include_tags: Optional[List[str]] = None,
1277
+ exclude_tags: Optional[List[str]] = None,
1278
+ only_main_content: Optional[bool] = None,
1279
+ wait_for: Optional[int] = None,
1280
+ timeout: Optional[int] = None,
1281
+ location: Optional[LocationConfig] = None,
1282
+ mobile: Optional[bool] = None,
1283
+ skip_tls_verification: Optional[bool] = None,
1284
+ remove_base64_images: Optional[bool] = None,
1285
+ block_ads: Optional[bool] = None,
1286
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1287
+ extract: Optional[JsonConfig] = None,
1288
+ json_options: Optional[JsonConfig] = None,
1289
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1290
+ agent: Optional[AgentOptions] = None,
1291
+ poll_interval: Optional[int] = 2,
1292
+ max_concurrency: Optional[int] = None,
1293
+ zero_data_retention: Optional[bool] = None,
1294
+ idempotency_key: Optional[str] = None,
1295
+ **kwargs
1296
+ ) -> BatchScrapeStatusResponse:
1297
+ """
1298
+ Batch scrape multiple URLs and monitor until completion.
1299
+
1300
+ Args:
1301
+ urls (List[str]): URLs to scrape
1302
+ formats (Optional[List[Literal]]): Content formats to retrieve
1303
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1304
+ include_tags (Optional[List[str]]): HTML tags to include
1305
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1306
+ only_main_content (Optional[bool]): Extract main content only
1307
+ wait_for (Optional[int]): Wait time in milliseconds
1308
+ timeout (Optional[int]): Request timeout in milliseconds
1309
+ location (Optional[LocationConfig]): Location configuration
1310
+ mobile (Optional[bool]): Use mobile user agent
1311
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1312
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1313
+ block_ads (Optional[bool]): Block advertisements
1314
+ proxy (Optional[Literal]): Proxy type to use
1315
+ extract (Optional[JsonConfig]): Content extraction config
1316
+ json_options (Optional[JsonConfig]): JSON extraction config
1317
+ actions (Optional[List[Union]]): Actions to perform
1318
+ agent (Optional[AgentOptions]): Agent configuration
1319
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1320
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
1321
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1322
+ **kwargs: Additional parameters to pass to the API
1323
+
1324
+ Returns:
1325
+ BatchScrapeStatusResponse with:
1326
+ * Scraping status and progress
1327
+ * Scraped content for each URL
1328
+ * Success/error information
1329
+
1330
+ Raises:
1331
+ Exception: If batch scrape fails
1332
+ """
1333
+ # Validate any additional kwargs
1334
+ self._validate_kwargs(kwargs, "batch_scrape_urls")
1335
+
1336
+ scrape_params = {}
1337
+
1338
+ # Add individual parameters
1339
+ if formats is not None:
1340
+ scrape_params['formats'] = formats
1341
+ if headers is not None:
1342
+ scrape_params['headers'] = headers
1343
+ if include_tags is not None:
1344
+ scrape_params['includeTags'] = include_tags
1345
+ if exclude_tags is not None:
1346
+ scrape_params['excludeTags'] = exclude_tags
1347
+ if only_main_content is not None:
1348
+ scrape_params['onlyMainContent'] = only_main_content
1349
+ if wait_for is not None:
1350
+ scrape_params['waitFor'] = wait_for
1351
+ if timeout is not None:
1352
+ scrape_params['timeout'] = timeout
1353
+ if location is not None:
1354
+ scrape_params['location'] = location.dict(exclude_none=True)
1355
+ if mobile is not None:
1356
+ scrape_params['mobile'] = mobile
1357
+ if skip_tls_verification is not None:
1358
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1359
+ if remove_base64_images is not None:
1360
+ scrape_params['removeBase64Images'] = remove_base64_images
1361
+ if block_ads is not None:
1362
+ scrape_params['blockAds'] = block_ads
1363
+ if proxy is not None:
1364
+ scrape_params['proxy'] = proxy
1365
+ if extract is not None:
1366
+ extract = self._ensure_schema_dict(extract)
1367
+ if isinstance(extract, dict) and "schema" in extract:
1368
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1369
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1370
+ if json_options is not None:
1371
+ json_options = self._ensure_schema_dict(json_options)
1372
+ if isinstance(json_options, dict) and "schema" in json_options:
1373
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1374
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1375
+ if actions:
1376
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
1377
+ if agent is not None:
1378
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1379
+ if max_concurrency is not None:
1380
+ scrape_params['maxConcurrency'] = max_concurrency
1381
+ if zero_data_retention is not None:
1382
+ scrape_params['zeroDataRetention'] = zero_data_retention
1383
+
1384
+ # Add any additional kwargs
1385
+ scrape_params.update(kwargs)
1386
+
1387
+ # Create final params object
1388
+ final_params = ScrapeParams(**scrape_params)
1389
+ params_dict = final_params.dict(exclude_none=True)
1390
+ params_dict['urls'] = urls
1391
+ params_dict['origin'] = f"python-sdk@{version}"
1392
+
1393
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1394
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1395
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1396
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1397
+
1398
+ # Make request
1399
+ headers = self._prepare_headers(idempotency_key)
1400
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1401
+
1402
+ if response.status_code == 200:
1403
+ try:
1404
+ id = response.json().get('id')
1405
+ except:
1406
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1407
+ return self._monitor_job_status(id, headers, poll_interval)
1408
+ else:
1409
+ self._handle_error(response, 'start batch scrape job')
1410
+
1411
+ def async_batch_scrape_urls(
1412
+ self,
1413
+ urls: List[str],
1414
+ *,
1415
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1416
+ headers: Optional[Dict[str, str]] = None,
1417
+ include_tags: Optional[List[str]] = None,
1418
+ exclude_tags: Optional[List[str]] = None,
1419
+ only_main_content: Optional[bool] = None,
1420
+ wait_for: Optional[int] = None,
1421
+ timeout: Optional[int] = None,
1422
+ location: Optional[LocationConfig] = None,
1423
+ mobile: Optional[bool] = None,
1424
+ skip_tls_verification: Optional[bool] = None,
1425
+ remove_base64_images: Optional[bool] = None,
1426
+ block_ads: Optional[bool] = None,
1427
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1428
+ extract: Optional[JsonConfig] = None,
1429
+ json_options: Optional[JsonConfig] = None,
1430
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1431
+ agent: Optional[AgentOptions] = None,
1432
+ max_concurrency: Optional[int] = None,
1433
+ idempotency_key: Optional[str] = None,
1434
+ zero_data_retention: Optional[bool] = None,
1435
+ **kwargs
1436
+ ) -> BatchScrapeResponse:
1437
+ """
1438
+ Initiate a batch scrape job asynchronously.
1439
+
1440
+ Args:
1441
+ urls (List[str]): URLs to scrape
1442
+ formats (Optional[List[Literal]]): Content formats to retrieve
1443
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1444
+ include_tags (Optional[List[str]]): HTML tags to include
1445
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1446
+ only_main_content (Optional[bool]): Extract main content only
1447
+ wait_for (Optional[int]): Wait time in milliseconds
1448
+ timeout (Optional[int]): Request timeout in milliseconds
1449
+ location (Optional[LocationConfig]): Location configuration
1450
+ mobile (Optional[bool]): Use mobile user agent
1451
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1452
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1453
+ block_ads (Optional[bool]): Block advertisements
1454
+ proxy (Optional[Literal]): Proxy type to use
1455
+ extract (Optional[JsonConfig]): Content extraction config
1456
+ json_options (Optional[JsonConfig]): JSON extraction config
1457
+ actions (Optional[List[Union]]): Actions to perform
1458
+ agent (Optional[AgentOptions]): Agent configuration
1459
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1460
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
1461
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1462
+ **kwargs: Additional parameters to pass to the API
1463
+
1464
+ Returns:
1465
+ BatchScrapeResponse with:
1466
+ * success - Whether job started successfully
1467
+ * id - Unique identifier for the job
1468
+ * url - Status check URL
1469
+ * error - Error message if start failed
1470
+
1471
+ Raises:
1472
+ Exception: If job initiation fails
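+
+ Example (illustrative sketch; starts the job without waiting for completion, the key is a placeholder):
+ >>> app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
+ >>> started = app.async_batch_scrape_urls(["https://example.com"], formats=["markdown"])
+ >>> if started.success:
+ ... status = app.check_batch_scrape_status(started.id)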
1473
+ """
1474
+ # Validate any additional kwargs
1475
+ self._validate_kwargs(kwargs, "async_batch_scrape_urls")
1476
+
1477
+ scrape_params = {}
1478
+
1479
+ # Add individual parameters
1480
+ if formats is not None:
1481
+ scrape_params['formats'] = formats
1482
+ if headers is not None:
1483
+ scrape_params['headers'] = headers
1484
+ if include_tags is not None:
1485
+ scrape_params['includeTags'] = include_tags
1486
+ if exclude_tags is not None:
1487
+ scrape_params['excludeTags'] = exclude_tags
1488
+ if only_main_content is not None:
1489
+ scrape_params['onlyMainContent'] = only_main_content
1490
+ if wait_for is not None:
1491
+ scrape_params['waitFor'] = wait_for
1492
+ if timeout is not None:
1493
+ scrape_params['timeout'] = timeout
1494
+ if location is not None:
1495
+ scrape_params['location'] = location.dict(exclude_none=True)
1496
+ if mobile is not None:
1497
+ scrape_params['mobile'] = mobile
1498
+ if skip_tls_verification is not None:
1499
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1500
+ if remove_base64_images is not None:
1501
+ scrape_params['removeBase64Images'] = remove_base64_images
1502
+ if block_ads is not None:
1503
+ scrape_params['blockAds'] = block_ads
1504
+ if proxy is not None:
1505
+ scrape_params['proxy'] = proxy
1506
+ if extract is not None:
1507
+ extract = self._ensure_schema_dict(extract)
1508
+ if isinstance(extract, dict) and "schema" in extract:
1509
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1510
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1511
+ if json_options is not None:
1512
+ json_options = self._ensure_schema_dict(json_options)
1513
+ if isinstance(json_options, dict) and "schema" in json_options:
1514
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1515
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1516
+ if actions:
1517
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
1518
+ if agent is not None:
1519
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1520
+ if max_concurrency is not None:
1521
+ scrape_params['maxConcurrency'] = max_concurrency
1522
+ if zero_data_retention is not None:
1523
+ scrape_params['zeroDataRetention'] = zero_data_retention
1524
+
1525
+ # Add any additional kwargs
1526
+ scrape_params.update(kwargs)
1527
+
1528
+ # Create final params object
1529
+ final_params = ScrapeParams(**scrape_params)
1530
+ params_dict = final_params.dict(exclude_none=True)
1531
+ params_dict['urls'] = urls
1532
+ params_dict['origin'] = f"python-sdk@{version}"
1533
+
1534
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1535
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1536
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1537
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1538
+
1539
+ # Make request
1540
+ headers = self._prepare_headers(idempotency_key)
1541
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1542
+
1543
+ if response.status_code == 200:
1544
+ try:
1545
+ return BatchScrapeResponse(**response.json())
1546
+ except:
1547
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1548
+ else:
1549
+ self._handle_error(response, 'start batch scrape job')
1550
+
1551
+ def batch_scrape_urls_and_watch(
1552
+ self,
1553
+ urls: List[str],
1554
+ *,
1555
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1556
+ headers: Optional[Dict[str, str]] = None,
1557
+ include_tags: Optional[List[str]] = None,
1558
+ exclude_tags: Optional[List[str]] = None,
1559
+ only_main_content: Optional[bool] = None,
1560
+ wait_for: Optional[int] = None,
1561
+ timeout: Optional[int] = None,
1562
+ location: Optional[LocationConfig] = None,
1563
+ mobile: Optional[bool] = None,
1564
+ skip_tls_verification: Optional[bool] = None,
1565
+ remove_base64_images: Optional[bool] = None,
1566
+ block_ads: Optional[bool] = None,
1567
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1568
+ extract: Optional[JsonConfig] = None,
1569
+ json_options: Optional[JsonConfig] = None,
1570
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1571
+ agent: Optional[AgentOptions] = None,
1572
+ max_concurrency: Optional[int] = None,
1573
+ zero_data_retention: Optional[bool] = None,
1574
+ idempotency_key: Optional[str] = None,
1575
+ **kwargs
1576
+ ) -> 'CrawlWatcher':
1577
+ """
1578
+ Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
1579
+
1580
+ Args:
1581
+ urls (List[str]): URLs to scrape
1582
+ formats (Optional[List[Literal]]): Content formats to retrieve
1583
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1584
+ include_tags (Optional[List[str]]): HTML tags to include
1585
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1586
+ only_main_content (Optional[bool]): Extract main content only
1587
+ wait_for (Optional[int]): Wait time in milliseconds
1588
+ timeout (Optional[int]): Request timeout in milliseconds
1589
+ location (Optional[LocationConfig]): Location configuration
1590
+ mobile (Optional[bool]): Use mobile user agent
1591
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1592
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1593
+ block_ads (Optional[bool]): Block advertisements
1594
+ proxy (Optional[Literal]): Proxy type to use
1595
+ extract (Optional[JsonConfig]): Content extraction config
1596
+ json_options (Optional[JsonConfig]): JSON extraction config
1597
+ actions (Optional[List[Union]]): Actions to perform
1598
+ agent (Optional[AgentOptions]): Agent configuration
1599
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1600
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
1601
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1602
+ **kwargs: Additional parameters to pass to the API
1603
+
1604
+ Returns:
1605
+ CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
1606
+
1607
+ Raises:
1608
+ Exception: If batch scrape job fails to start
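+
+ Example (illustrative sketch; connect() is a coroutine and must run on an event loop):
+ >>> import asyncio
+ >>> app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
+ >>> watcher = app.batch_scrape_urls_and_watch(["https://example.com"], formats=["markdown"])
+ >>> watcher.add_event_listener("document", lambda event: print(event["data"]))
+ >>> asyncio.run(watcher.connect())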
1609
+ """
1610
+ # Validate any additional kwargs
1611
+ self._validate_kwargs(kwargs, "batch_scrape_urls_and_watch")
1612
+
1613
+ scrape_params = {}
1614
+
1615
+ # Add individual parameters
1616
+ if formats is not None:
1617
+ scrape_params['formats'] = formats
1618
+ if headers is not None:
1619
+ scrape_params['headers'] = headers
1620
+ if include_tags is not None:
1621
+ scrape_params['includeTags'] = include_tags
1622
+ if exclude_tags is not None:
1623
+ scrape_params['excludeTags'] = exclude_tags
1624
+ if only_main_content is not None:
1625
+ scrape_params['onlyMainContent'] = only_main_content
1626
+ if wait_for is not None:
1627
+ scrape_params['waitFor'] = wait_for
1628
+ if timeout is not None:
1629
+ scrape_params['timeout'] = timeout
1630
+ if location is not None:
1631
+ scrape_params['location'] = location.dict(exclude_none=True)
1632
+ if mobile is not None:
1633
+ scrape_params['mobile'] = mobile
1634
+ if skip_tls_verification is not None:
1635
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1636
+ if remove_base64_images is not None:
1637
+ scrape_params['removeBase64Images'] = remove_base64_images
1638
+ if block_ads is not None:
1639
+ scrape_params['blockAds'] = block_ads
1640
+ if proxy is not None:
1641
+ scrape_params['proxy'] = proxy
1642
+ if extract is not None:
1643
+ extract = self._ensure_schema_dict(extract)
1644
+ if isinstance(extract, dict) and "schema" in extract:
1645
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1646
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1647
+ if json_options is not None:
1648
+ json_options = self._ensure_schema_dict(json_options)
1649
+ if isinstance(json_options, dict) and "schema" in json_options:
1650
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1651
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1652
+ if actions:
1653
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
1654
+ if agent is not None:
1655
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1656
+ if max_concurrency is not None:
1657
+ scrape_params['maxConcurrency'] = max_concurrency
1658
+ if zero_data_retention is not None:
1659
+ scrape_params['zeroDataRetention'] = zero_data_retention
1660
+
1661
+ # Add any additional kwargs
1662
+ scrape_params.update(kwargs)
1663
+
1664
+ # Create final params object
1665
+ final_params = ScrapeParams(**scrape_params)
1666
+ params_dict = final_params.dict(exclude_none=True)
1667
+ params_dict['urls'] = urls
1668
+ params_dict['origin'] = f"python-sdk@{version}"
1669
+
1670
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1671
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1672
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1673
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1674
+
1675
+ # Make request
1676
+ headers = self._prepare_headers(idempotency_key)
1677
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1678
+
1679
+ if response.status_code == 200:
1680
+ try:
1681
+ crawl_response = BatchScrapeResponse(**response.json())
1682
+ if crawl_response.success and crawl_response.id:
1683
+ return CrawlWatcher(crawl_response.id, self)
1684
+ else:
1685
+ raise Exception("Batch scrape job failed to start")
1686
+ except (ValueError, pydantic.ValidationError):
1687
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1688
+ else:
1689
+ self._handle_error(response, 'start batch scrape job')
1690
+
1691
+ def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
1692
+ """
1693
+ Check the status of a batch scrape job using the Firecrawl API.
1694
+
1695
+ Args:
1696
+ id (str): The ID of the batch scrape job.
1697
+
1698
+ Returns:
1699
+ BatchScrapeStatusResponse: The status of the batch scrape job.
1700
+
1701
+ Raises:
1702
+ Exception: If the status check request fails.
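+
+ Example (illustrative sketch; assumes `app` is a FirecrawlApp instance and `job_id`
+ was returned when the batch scrape job was started):
+ >>> status = app.check_batch_scrape_status(job_id)
+ >>> print(status.status, f"{status.completed}/{status.total}")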
1703
+ """
1704
+ endpoint = f'/v1/batch/scrape/{id}'
1705
+
1706
+ headers = self._prepare_headers()
1707
+ response = self._get_request(f'{self.api_url}{endpoint}', headers)
1708
+ if response.status_code == 200:
1709
+ try:
1710
+ status_data = response.json()
1711
+ except:
1712
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1713
+ if status_data['status'] == 'completed':
1714
+ if 'data' in status_data:
1715
+ data = status_data['data']
1716
+ while 'next' in status_data:
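+ # Completed results may be paginated; follow each 'next' URL and accumulate its
+ # 'data' entries until no further page is returned.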
1717
+ if len(status_data['data']) == 0:
1718
+ break
1719
+ next_url = status_data.get('next')
1720
+ if not next_url:
1721
+ logger.warning("Expected 'next' URL is missing.")
1722
+ break
1723
+ try:
1724
+ status_response = self._get_request(next_url, headers)
1725
+ if status_response.status_code != 200:
1726
+ logger.error(f"Failed to fetch next page: {status_response.status_code}")
1727
+ break
1728
+ try:
1729
+ next_data = status_response.json()
1730
+ except:
1731
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1732
+ data.extend(next_data.get('data', []))
1733
+ status_data = next_data
1734
+ except Exception as e:
1735
+ logger.error(f"Error during pagination request: {e}")
1736
+ break
1737
+ status_data['data'] = data
1738
+
1739
+ return BatchScrapeStatusResponse(**{
1740
+ 'success': False if 'error' in status_data else True,
1741
+ 'status': status_data.get('status'),
1742
+ 'total': status_data.get('total'),
1743
+ 'completed': status_data.get('completed'),
1744
+ 'creditsUsed': status_data.get('creditsUsed'),
1745
+ 'expiresAt': status_data.get('expiresAt'),
1746
+ 'data': status_data.get('data'),
1747
+ 'next': status_data.get('next'),
1748
+ 'error': status_data.get('error')
1749
+ })
1750
+ else:
1751
+ self._handle_error(response, 'check batch scrape status')
1752
+
1753
+ def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
1754
+ """
1755
+ Returns information about batch scrape errors.
1756
+
1757
+ Args:
1758
+ id (str): The ID of the crawl job.
1759
+
1760
+ Returns:
1761
+ CrawlErrorsResponse containing:
1762
+ * errors (List[Dict[str, str]]): List of errors with fields:
1763
+ * id (str): Error ID
1764
+ * timestamp (str): When the error occurred
1765
+ * url (str): URL that caused the error
1766
+ * error (str): Error message
1767
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1768
+
1769
+ Raises:
1770
+ Exception: If the error check request fails
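+
+ Example (illustrative sketch; `app` and `job_id` as in check_batch_scrape_status):
+ >>> report = app.check_batch_scrape_errors(job_id)
+ >>> for err in report.errors:
+ ... print(err["url"], err["error"])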
1771
+ """
1772
+ headers = self._prepare_headers()
1773
+ response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
1774
+ if response.status_code == 200:
1775
+ try:
1776
+ return CrawlErrorsResponse(**response.json())
1777
+ except:
1778
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1779
+ else:
1780
+ self._handle_error(response, "check batch scrape errors")
1781
+
1782
+ def extract(
1783
+ self,
1784
+ urls: Optional[List[str]] = None,
1785
+ *,
1786
+ prompt: Optional[str] = None,
1787
+ schema: Optional[Any] = None,
1788
+ system_prompt: Optional[str] = None,
1789
+ allow_external_links: Optional[bool] = False,
1790
+ enable_web_search: Optional[bool] = False,
1791
+ show_sources: Optional[bool] = False,
1792
+ agent: Optional[Dict[str, Any]] = None,
1793
+ **kwargs) -> ExtractResponse[Any]:
1794
+ """
1795
+ Extract structured information from URLs.
1796
+
1797
+ Args:
1798
+ urls (Optional[List[str]]): URLs to extract from
1799
+ prompt (Optional[str]): Custom extraction prompt
1800
+ schema (Optional[Any]): JSON schema/Pydantic model
1801
+ system_prompt (Optional[str]): System context
1802
+ allow_external_links (Optional[bool]): Follow external links
1803
+ enable_web_search (Optional[bool]): Enable web search
1804
+ show_sources (Optional[bool]): Include source URLs
1805
+ agent (Optional[Dict[str, Any]]): Agent configuration
1806
+ **kwargs: Additional parameters to pass to the API
1807
+
1808
+ Returns:
1809
+ ExtractResponse[Any] with:
1810
+ * success (bool): Whether request succeeded
1811
+ * data (Optional[Any]): Extracted data matching schema
1812
+ * error (Optional[str]): Error message if any
1813
+
1814
+ Raises:
1815
+ ValueError: If prompt/schema missing or extraction fails
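+
+ Example (illustrative sketch; the schema may also be a Pydantic model class, which
+ is converted to a JSON schema dict internally):
+ >>> app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
+ >>> result = app.extract(
+ ... ["https://example.com"],
+ ... prompt="Extract the page title and a one-sentence summary",
+ ... schema={"type": "object", "properties": {"title": {"type": "string"}}})
+ >>> if result.success:
+ ... print(result.data)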
1816
+ """
1817
+ # Validate any additional kwargs
1818
+ self._validate_kwargs(kwargs, "extract")
1819
+
1820
+ headers = self._prepare_headers()
1821
+
1822
+ if not prompt and not schema:
1823
+ raise ValueError("Either prompt or schema is required")
1824
+
1825
+ if not urls and not prompt:
1826
+ raise ValueError("Either urls or prompt is required")
1827
+
1828
+ if schema:
1829
+ schema = self._ensure_schema_dict(schema)
1830
+
1831
+ request_data = {
1832
+ 'urls': urls or [],
1833
+ 'allowExternalLinks': allow_external_links,
1834
+ 'enableWebSearch': enable_web_search,
1835
+ 'showSources': show_sources,
1836
+ 'schema': schema,
1837
+ 'origin': f'python-sdk@{get_version()}'
1838
+ }
1839
+
1840
+ # Only add prompt and systemPrompt if they exist
1841
+ if prompt:
1842
+ request_data['prompt'] = prompt
1843
+ if system_prompt:
1844
+ request_data['systemPrompt'] = system_prompt
1845
+
1846
+ if agent:
1847
+ request_data['agent'] = agent
1848
+
1849
+ # Add any additional kwargs
1850
+ request_data.update(kwargs)
1851
+
1852
+ try:
1853
+ # Send the initial extract request
1854
+ response = self._post_request(
1855
+ f'{self.api_url}/v1/extract',
1856
+ request_data,
1857
+ headers
1858
+ )
1859
+ if response.status_code == 200:
1860
+ try:
1861
+ data = response.json()
1862
+ except:
1863
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1864
+ if data['success']:
1865
+ job_id = data.get('id')
1866
+ if not job_id:
1867
+ raise Exception('Job ID not returned from extract request.')
1868
+
1869
+ # Poll for the extract status
1870
+ while True:
1871
+ status_response = self._get_request(
1872
+ f'{self.api_url}/v1/extract/{job_id}',
1873
+ headers
1874
+ )
1875
+ if status_response.status_code == 200:
1876
+ try:
1877
+ status_data = status_response.json()
1878
+ except:
1879
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1880
+ if status_data['status'] == 'completed':
1881
+ return ExtractResponse(**status_data)
1882
+ elif status_data['status'] in ['failed', 'cancelled']:
1883
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
1884
+ else:
1885
+ self._handle_error(status_response, "extract-status")
1886
+
1887
+ time.sleep(2) # Polling interval
1888
+ else:
1889
+ raise Exception(f'Failed to extract. Error: {data["error"]}')
1890
+ else:
1891
+ self._handle_error(response, "extract")
1892
+ except Exception as e:
1893
+ raise ValueError(str(e), 500)
1894
+
1895
+ return ExtractResponse(success=False, error="Internal server error.")
1896
+
1897
+ def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
1898
+ """
1899
+ Retrieve the status of an extract job.
1900
+
1901
+ Args:
1902
+ job_id (str): The ID of the extract job.
1903
+
1904
+ Returns:
1905
+ ExtractResponse[Any]: The status of the extract job.
1906
+
1907
+ Raises:
1908
+ ValueError: If there is an error retrieving the status.
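+
+ Example (illustrative sketch; `job_id` is the identifier of a previously started extract job):
+ >>> status = app.get_extract_status(job_id)
+ >>> if status.success and status.data:
+ ... print(status.data)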
1909
+ """
1910
+ headers = self._prepare_headers()
1911
+ try:
1912
+ response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
1913
+ if response.status_code == 200:
1914
+ try:
1915
+ return ExtractResponse(**response.json())
1916
+ except:
1917
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1918
+ else:
1919
+ self._handle_error(response, "get extract status")
1920
+ except Exception as e:
1921
+ raise ValueError(str(e), 500)
1922
+
1923
+ def async_extract(
1924
+ self,
1925
+ urls: Optional[List[str]] = None,
1926
+ *,
1927
+ prompt: Optional[str] = None,
1928
+ schema: Optional[Any] = None,
1929
+ system_prompt: Optional[str] = None,
1930
+ allow_external_links: Optional[bool] = False,
1931
+ enable_web_search: Optional[bool] = False,
1932
+ show_sources: Optional[bool] = False,
1933
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1934
+ """
1935
+ Initiate an asynchronous extract job.
1936
+
1937
+ Args:
1938
+ urls (List[str]): URLs to extract information from
1939
+ prompt (Optional[str]): Custom extraction prompt
1940
+ schema (Optional[Any]): JSON schema/Pydantic model
1941
+ system_prompt (Optional[str]): System context
1942
+ allow_external_links (Optional[bool]): Follow external links
1943
+ enable_web_search (Optional[bool]): Enable web search
1944
+ show_sources (Optional[bool]): Include source URLs
1945
+ agent (Optional[Dict[str, Any]]): Agent configuration
1947
+
1948
+ Returns:
1949
+ ExtractResponse[Any] with:
1950
+ * success (bool): Whether request succeeded
1951
+ * data (Optional[Any]): Extracted data matching schema
1952
+ * error (Optional[str]): Error message if any
1953
+
1954
+ Raises:
1955
+ ValueError: If job initiation fails
1956
+ """
1957
+ headers = self._prepare_headers()
1958
+
1960
+ if schema:
1961
+ schema = self._ensure_schema_dict(schema)
1962
+
1963
+ request_data = {
1964
+ 'urls': urls,
1965
+ 'allowExternalLinks': allow_external_links,
1966
+ 'enableWebSearch': enable_web_search,
1967
+ 'showSources': show_sources,
1968
+ 'schema': schema,
1969
+ 'origin': f'python-sdk@{version}'
1970
+ }
1971
+
1972
+ if prompt:
1973
+ request_data['prompt'] = prompt
1974
+ if system_prompt:
1975
+ request_data['systemPrompt'] = system_prompt
1976
+ if agent:
1977
+ request_data['agent'] = agent
1978
+
1979
+ try:
1980
+ response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
1981
+ if response.status_code == 200:
1982
+ try:
1983
+ return ExtractResponse(**response.json())
1984
+ except:
1985
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1986
+ else:
1987
+ self._handle_error(response, "async extract")
1988
+ except Exception as e:
1989
+ raise ValueError(str(e), 500)
1990
+
1991
+ def generate_llms_text(
1992
+ self,
1993
+ url: str,
1994
+ *,
1995
+ max_urls: Optional[int] = None,
1996
+ show_full_text: Optional[bool] = None,
1997
+ cache: Optional[bool] = None,
1998
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
1999
+ """
2000
+ Generate LLMs.txt for a given URL and poll until completion.
2001
+
2002
+ Args:
2003
+ url (str): Target URL to generate LLMs.txt from
2004
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
2005
+ show_full_text (Optional[bool]): Include full text in output (default: False)
2006
+ cache (Optional[bool]): Whether to use cached content if available (default: True)
2007
+ experimental_stream (Optional[bool]): Enable experimental streaming
2008
+
2009
+ Returns:
2010
+ GenerateLLMsTextStatusResponse with:
2011
+ * Generated LLMs.txt content
2012
+ * Full version if requested
2013
+ * Generation status
2014
+ * Success/error information
2015
+
2016
+ Raises:
2017
+ Exception: If generation fails
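+
+ Example (illustrative sketch; blocks until the generation job completes or fails):
+ >>> app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
+ >>> result = app.generate_llms_text("https://example.com", max_urls=5)
+ >>> print(result.status) # 'completed' or 'failed'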
2018
+ """
2019
+ params = GenerateLLMsTextParams(
2020
+ maxUrls=max_urls,
2021
+ showFullText=show_full_text,
2022
+ cache=cache,
2023
+ __experimental_stream=experimental_stream
2024
+ )
2025
+
2026
+ response = self.async_generate_llms_text(
2027
+ url,
2028
+ max_urls=max_urls,
2029
+ show_full_text=show_full_text,
2030
+ cache=cache,
2031
+ experimental_stream=experimental_stream
2032
+ )
2033
+
2034
+ if not response.success or not response.id:
2035
+ return GenerateLLMsTextStatusResponse(
2036
+ success=False,
2037
+ error='Failed to start LLMs.txt generation',
2038
+ status='failed',
2039
+ expiresAt=''
2040
+ )
2041
+
2042
+ job_id = response.id
2043
+ while True:
2044
+ status = self.check_generate_llms_text_status(job_id)
2045
+
2046
+ if status.status == 'completed':
2047
+ return status
2048
+ elif status.status == 'failed':
2049
+ return status
2050
+ elif status.status != 'processing':
2051
+ return GenerateLLMsTextStatusResponse(
2052
+ success=False,
2053
+ error='LLMs.txt generation job terminated unexpectedly',
2054
+ status='failed',
2055
+ expiresAt=''
2056
+ )
2057
+
2058
+ time.sleep(2) # Polling interval
2059
+
2060
+ def async_generate_llms_text(
2061
+ self,
2062
+ url: str,
2063
+ *,
2064
+ max_urls: Optional[int] = None,
2065
+ show_full_text: Optional[bool] = None,
2066
+ cache: Optional[bool] = None,
2067
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
2068
+ """
2069
+ Initiate an asynchronous LLMs.txt generation operation.
2070
+
2071
+ Args:
2072
+ url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
2073
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
2074
+ show_full_text (Optional[bool]): Include full text in output (default: False)
2075
+ cache (Optional[bool]): Whether to use cached content if available (default: True)
2076
+ experimental_stream (Optional[bool]): Enable experimental streaming
2077
+
2078
+ Returns:
2079
+ GenerateLLMsTextResponse: A response containing:
2080
+ * success (bool): Whether the generation initiation was successful
2081
+ * id (str): The unique identifier for the generation job
2082
+ * error (str, optional): Error message if initiation failed
2083
+
2084
+ Raises:
2085
+ Exception: If the generation job initiation fails.
2086
+ """
2087
+ params = GenerateLLMsTextParams(
2088
+ maxUrls=max_urls,
2089
+ showFullText=show_full_text,
2090
+ cache=cache,
2091
+ __experimental_stream=experimental_stream
2092
+ )
2093
+
2094
+ headers = self._prepare_headers()
2095
+ json_data = {'url': url, **params.dict(exclude_none=True)}
2096
+ json_data['origin'] = f"python-sdk@{version}"
2097
+
2098
+ try:
2099
+ req = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
2100
+ response = req.json()
2101
+ logger.debug(f"LLMs.txt generation request payload: {json_data}")
2102
+ logger.debug(f"LLMs.txt generation response: {response}")
2103
+ if response.get('success'):
2104
+ try:
2105
+ return GenerateLLMsTextResponse(**response)
2106
+ except:
2107
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2108
+ else:
2109
+ self._handle_error(req, 'start LLMs.txt generation')
2110
+ except Exception as e:
2111
+ raise ValueError(str(e))
2112
+
2113
+ return GenerateLLMsTextResponse(
2114
+ success=False,
2115
+ error='Internal server error'
2116
+ )
2117
+
2118
+ def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
2119
+ """
2120
+ Check the status of a LLMs.txt generation operation.
2121
+
2122
+ Args:
2123
+ id (str): The unique identifier of the LLMs.txt generation job to check status for.
2124
+
2125
+ Returns:
2126
+ GenerateLLMsTextStatusResponse: A response containing:
2127
+ * success (bool): Whether the generation was successful
2128
+ * status (str): Status of generation ("processing", "completed", "failed")
2129
+ * data (Dict[str, str], optional): Generated text with fields:
2130
+ * llmstxt (str): Generated LLMs.txt content
2131
+ * llmsfulltxt (str, optional): Full version if requested
2132
+ * error (str, optional): Error message if generation failed
2133
+ * expiresAt (str): When the generated data expires
2134
+
2135
+ Raises:
2136
+ Exception: If the status check fails.
2137
+ """
2138
+ headers = self._prepare_headers()
2139
+ try:
2140
+ response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers)
2141
+ if response.status_code == 200:
2142
+ try:
2143
+ json_data = response.json()
2144
+ return GenerateLLMsTextStatusResponse(**json_data)
2145
+ except Exception as e:
2146
+ raise Exception(f'Failed to parse Firecrawl response as GenerateLLMsTextStatusResponse: {str(e)}')
2147
+ elif response.status_code == 404:
2148
+ raise Exception('LLMs.txt generation job not found')
2149
+ else:
2150
+ self._handle_error(response, 'check LLMs.txt generation status')
2151
+ except Exception as e:
2152
+ raise ValueError(str(e))
2153
+
2154
+ return GenerateLLMsTextStatusResponse(success=False, error='Internal server error', status='failed', expiresAt='')
2155
+
2156
+ def _prepare_headers(
2157
+ self,
2158
+ idempotency_key: Optional[str] = None) -> Dict[str, str]:
2159
+ """
2160
+ Prepare the headers for API requests.
2161
+
2162
+ Args:
2163
+ idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
2164
+
2165
+ Returns:
2166
+ Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
2167
+ """
2168
+ if idempotency_key:
2169
+ return {
2170
+ 'Content-Type': 'application/json',
2171
+ 'Authorization': f'Bearer {self.api_key}',
2172
+ 'x-idempotency-key': idempotency_key
2173
+ }
2174
+
2175
+ return {
2176
+ 'Content-Type': 'application/json',
2177
+ 'Authorization': f'Bearer {self.api_key}',
2178
+ }
2179
+
2180
+ def _post_request(
2181
+ self,
2182
+ url: str,
2183
+ data: Dict[str, Any],
2184
+ headers: Dict[str, str],
2185
+ retries: int = 3,
2186
+ backoff_factor: float = 0.5) -> requests.Response:
2187
+ """
2188
+ Make a POST request with retries.
2189
+
2190
+ Args:
2191
+ url (str): The URL to send the POST request to.
2192
+ data (Dict[str, Any]): The JSON data to include in the POST request.
2193
+ headers (Dict[str, str]): The headers to include in the POST request.
2194
+ retries (int): Number of retries for the request.
2195
+ backoff_factor (float): Backoff factor for retries.
2196
+
2197
+ Returns:
2198
+ requests.Response: The response from the POST request.
2199
+
2200
+ Raises:
2201
+ requests.RequestException: If the request fails after the specified retries.
2202
+ """
2203
+ for attempt in range(retries):
2204
+ response = requests.post(url, headers=headers, json=data, timeout=(((data["timeout"] + 5000) / 1000) if "timeout" in data else None))
2205
+ if response.status_code == 502:
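+ # Only 502 (bad gateway) responses are retried; the delay grows as
+ # backoff_factor * 2**attempt (0.5s, 1s, 2s with the defaults).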
2206
+ time.sleep(backoff_factor * (2 ** attempt))
2207
+ else:
2208
+ return response
2209
+ return response
2210
+
2211
+ def _get_request(
2212
+ self,
2213
+ url: str,
2214
+ headers: Dict[str, str],
2215
+ retries: int = 3,
2216
+ backoff_factor: float = 0.5) -> requests.Response:
2217
+ """
2218
+ Make a GET request with retries.
2219
+
2220
+ Args:
2221
+ url (str): The URL to send the GET request to.
2222
+ headers (Dict[str, str]): The headers to include in the GET request.
2223
+ retries (int): Number of retries for the request.
2224
+ backoff_factor (float): Backoff factor for retries.
2225
+
2226
+ Returns:
2227
+ requests.Response: The response from the GET request.
2228
+
2229
+ Raises:
2230
+ requests.RequestException: If the request fails after the specified retries.
2231
+ """
2232
+ for attempt in range(retries):
2233
+ response = requests.get(url, headers=headers)
2234
+ if response.status_code == 502:
2235
+ time.sleep(backoff_factor * (2 ** attempt))
2236
+ else:
2237
+ return response
2238
+ return response
2239
+
2240
+ def _delete_request(
2241
+ self,
2242
+ url: str,
2243
+ headers: Dict[str, str],
2244
+ retries: int = 3,
2245
+ backoff_factor: float = 0.5) -> requests.Response:
2246
+ """
2247
+ Make a DELETE request with retries.
2248
+
2249
+ Args:
2250
+ url (str): The URL to send the DELETE request to.
2251
+ headers (Dict[str, str]): The headers to include in the DELETE request.
2252
+ retries (int): Number of retries for the request.
2253
+ backoff_factor (float): Backoff factor for retries.
2254
+
2255
+ Returns:
2256
+ requests.Response: The response from the DELETE request.
2257
+
2258
+ Raises:
2259
+ requests.RequestException: If the request fails after the specified retries.
2260
+ """
2261
+ for attempt in range(retries):
2262
+ response = requests.delete(url, headers=headers)
2263
+ if response.status_code == 502:
2264
+ time.sleep(backoff_factor * (2 ** attempt))
2265
+ else:
2266
+ return response
2267
+ return response
2268
+
2269
+ def _monitor_job_status(
2270
+ self,
2271
+ id: str,
2272
+ headers: Dict[str, str],
2273
+ poll_interval: int) -> CrawlStatusResponse:
2274
+ """
2275
+ Monitor the status of a crawl job until completion.
2276
+
2277
+ Args:
2278
+ id (str): The ID of the crawl job.
2279
+ headers (Dict[str, str]): The headers to include in the status check requests.
2280
+ poll_interval (int): Seconds between status checks.
2281
+
2282
+ Returns:
2283
+ CrawlStatusResponse: The crawl results if the job is completed successfully.
2284
+
2285
+ Raises:
2286
+ Exception: If the job fails or an error occurs during status checks.
2287
+ """
2288
+ while True:
2289
+ api_url = f'{self.api_url}/v1/crawl/{id}'
2290
+
2291
+ status_response = self._get_request(api_url, headers)
2292
+ if status_response.status_code == 200:
2293
+ try:
2294
+ status_data = status_response.json()
2295
+ except:
2296
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
2297
+ if status_data['status'] == 'completed':
2298
+ if 'data' in status_data:
2299
+ data = status_data['data']
2300
+ while 'next' in status_data:
2301
+ if len(status_data['data']) == 0:
2302
+ break
2303
+ status_response = self._get_request(status_data['next'], headers)
2304
+ try:
2305
+ status_data = status_response.json()
2306
+ except:
2307
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
2308
+ data.extend(status_data.get('data', []))
2309
+ status_data['data'] = data
2310
+ return CrawlStatusResponse(**status_data)
2311
+ else:
2312
+ raise Exception('Crawl job completed but no data was returned')
2313
+ elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
2314
+ poll_interval = max(poll_interval, 2)
2315
+ time.sleep(poll_interval) # Wait for the specified interval before checking again
2316
+ else:
2317
+ raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
2318
+ else:
2319
+ self._handle_error(status_response, 'check crawl status')
2320
+
2321
+ def _handle_error(
2322
+ self,
2323
+ response: requests.Response,
2324
+ action: str) -> None:
2325
+ """
2326
+ Handle errors from API responses.
2327
+
2328
+ Args:
2329
+ response (requests.Response): The response object from the API request.
2330
+ action (str): Description of the action that was being performed.
2331
+
2332
+ Raises:
2333
+ Exception: An exception with a message containing the status code and error details from the response.
2334
+ """
2335
+ try:
2336
+ error_message = response.json().get('error', 'No error message provided.')
2337
+ error_details = response.json().get('details', 'No additional error details provided.')
2338
+ except:
2339
+ raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
2340
+
2341
+ message = self._get_error_message(response.status_code, action, error_message, error_details)
2342
+
2343
+ # Raise an HTTPError with the custom message and attach the response
2344
+ raise requests.exceptions.HTTPError(message, response=response)
2345
+
2346
+ def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2347
+ """
2348
+ Generate a standardized error message based on HTTP status code.
2349
+
2350
+ Args:
2351
+ status_code (int): The HTTP status code from the response
2352
+ action (str): Description of the action that was being performed
2353
+ error_message (str): The error message from the API response
2354
+ error_details (str): Additional error details from the API response
2355
+
2356
+ Returns:
2357
+ str: A formatted error message
2358
+ """
2359
+ if status_code == 402:
2360
+ return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
2361
+ elif status_code == 403:
2362
+ message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
2363
+ elif status_code == 408:
2364
+ return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
2365
+ elif status_code == 409:
2366
+ return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
2367
+ elif status_code == 500:
2368
+ return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
2369
+ else:
2370
+ return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
2371
+
2372
+ def deep_research(
2373
+ self,
2374
+ query: str,
2375
+ *,
2376
+ max_depth: Optional[int] = None,
2377
+ time_limit: Optional[int] = None,
2378
+ max_urls: Optional[int] = None,
2379
+ analysis_prompt: Optional[str] = None,
2380
+ system_prompt: Optional[str] = None,
2381
+ __experimental_stream_steps: Optional[bool] = None,
2382
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
2383
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
2384
+ """
2385
+ Initiates a deep research operation on a given query and polls until completion.
2386
+
2387
+ Args:
2388
+ query (str): Research query or topic to investigate
2389
+ max_depth (Optional[int]): Maximum depth of research exploration
2390
+ time_limit (Optional[int]): Time limit in seconds for research
2391
+ max_urls (Optional[int]): Maximum number of URLs to process
2392
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2393
+ system_prompt (Optional[str]): Custom system prompt
2394
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2395
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
2396
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
2397
+
2398
+ Returns:
2399
+ DeepResearchStatusResponse containing:
2400
+ * success (bool): Whether research completed successfully
2401
+ * status (str): Current state (processing/completed/failed)
2402
+ * error (Optional[str]): Error message if failed
2403
+ * id (str): Unique identifier for the research job
2404
+ * data (Any): Research findings and analysis
2405
+ * sources (List[Dict]): List of discovered sources
2406
+ * activities (List[Dict]): Research progress log
2407
+ * summaries (List[str]): Generated research summaries
2408
+
2409
+ Raises:
2410
+ Exception: If research fails
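+
+ Example (illustrative sketch; the callbacks are optional and receive plain dicts):
+ >>> app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
+ >>> research = app.deep_research(
+ ... "How do large language models handle long context?",
+ ... max_depth=2,
+ ... time_limit=120,
+ ... on_activity=lambda activity: print(activity.get("message")))
+ >>> print(research["status"])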
2411
+ """
2412
+ research_params = {}
2413
+ if max_depth is not None:
2414
+ research_params['maxDepth'] = max_depth
2415
+ if time_limit is not None:
2416
+ research_params['timeLimit'] = time_limit
2417
+ if max_urls is not None:
2418
+ research_params['maxUrls'] = max_urls
2419
+ if analysis_prompt is not None:
2420
+ research_params['analysisPrompt'] = analysis_prompt
2421
+ if system_prompt is not None:
2422
+ research_params['systemPrompt'] = system_prompt
2423
+ if __experimental_stream_steps is not None:
2424
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2425
+ research_params = DeepResearchParams(**research_params)
2426
+
2427
+ response = self.async_deep_research(
2428
+ query,
2429
+ max_depth=max_depth,
2430
+ time_limit=time_limit,
2431
+ max_urls=max_urls,
2432
+ analysis_prompt=analysis_prompt,
2433
+ system_prompt=system_prompt
2434
+ )
2435
+ if not response.get('success') or 'id' not in response:
2436
+ return response
2437
+
2438
+ job_id = response['id']
2439
+ last_activity_count = 0
2440
+ last_source_count = 0
2441
+
2442
+ while True:
2443
+ status = self.check_deep_research_status(job_id)
2444
+
2445
+ if on_activity and 'activities' in status:
2446
+ new_activities = status['activities'][last_activity_count:]
2447
+ for activity in new_activities:
2448
+ on_activity(activity)
2449
+ last_activity_count = len(status['activities'])
2450
+
2451
+ if on_source and 'sources' in status:
2452
+ new_sources = status['sources'][last_source_count:]
2453
+ for source in new_sources:
2454
+ on_source(source)
2455
+ last_source_count = len(status['sources'])
2456
+
2457
+ if status['status'] == 'completed':
2458
+ return status
2459
+ elif status['status'] == 'failed':
2460
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
2461
+ elif status['status'] != 'processing':
2462
+ break
2463
+
2464
+ time.sleep(2) # Polling interval
2465
+
2466
+ return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
2467
+
2468
+ def async_deep_research(
2469
+ self,
2470
+ query: str,
2471
+ *,
2472
+ max_depth: Optional[int] = None,
2473
+ time_limit: Optional[int] = None,
2474
+ max_urls: Optional[int] = None,
2475
+ analysis_prompt: Optional[str] = None,
2476
+ system_prompt: Optional[str] = None,
2477
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
2478
+ """
2479
+ Initiates an asynchronous deep research operation.
2480
+
2481
+ Args:
2482
+ query (str): Research query or topic to investigate
2483
+ max_depth (Optional[int]): Maximum depth of research exploration
2484
+ time_limit (Optional[int]): Time limit in seconds for research
2485
+ max_urls (Optional[int]): Maximum number of URLs to process
2486
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2487
+ system_prompt (Optional[str]): Custom system prompt
2488
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2489
+
2490
+ Returns:
2491
+ Dict[str, Any]: A response containing:
2492
+ * success (bool): Whether the research initiation was successful
2493
+ * id (str): The unique identifier for the research job
2494
+ * error (str, optional): Error message if initiation failed
2495
+
2496
+ Raises:
2497
+ Exception: If the research initiation fails.
2498
+ """
2499
+ research_params = {}
2500
+ if max_depth is not None:
2501
+ research_params['maxDepth'] = max_depth
2502
+ if time_limit is not None:
2503
+ research_params['timeLimit'] = time_limit
2504
+ if max_urls is not None:
2505
+ research_params['maxUrls'] = max_urls
2506
+ if analysis_prompt is not None:
2507
+ research_params['analysisPrompt'] = analysis_prompt
2508
+ if system_prompt is not None:
2509
+ research_params['systemPrompt'] = system_prompt
2510
+ if __experimental_stream_steps is not None:
2511
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2512
+ research_params = DeepResearchParams(**research_params)
2513
+
2514
+ headers = self._prepare_headers()
2515
+
2516
+ json_data = {'query': query, **research_params.dict(exclude_none=True)}
2517
+ json_data['origin'] = f"python-sdk@{version}"
2518
+
2519
+ # Handle json options schema if present
2520
+ if 'jsonOptions' in json_data:
2521
+ json_opts = json_data['jsonOptions']
2522
+ if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
2523
+ json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
2524
+
2525
+ try:
2526
+ response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
2527
+ if response.status_code == 200:
2528
+ try:
2529
+ return response.json()
2530
+ except:
2531
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2532
+ else:
2533
+ self._handle_error(response, 'start deep research')
2534
+ except Exception as e:
2535
+ raise ValueError(str(e))
2536
+
2537
+ return {'success': False, 'error': 'Internal server error'}
2538
+
2539
+ def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
2540
+ """
2541
+ Check the status of a deep research operation.
2542
+
2543
+ Args:
2544
+ id (str): The ID of the deep research operation.
2545
+
2546
+ Returns:
2547
+ DeepResearchStatusResponse containing:
2548
+
2549
+ Status:
2550
+ * success - Whether research completed successfully
2551
+ * status - Current state (processing/completed/failed)
2552
+ * error - Error message if failed
2553
+
2554
+ Results:
2555
+ * id - Unique identifier for the research job
2556
+ * data - Research findings and analysis
2557
+ * sources - List of discovered sources
2558
+ * activities - Research progress log
2559
+ * summaries - Generated research summaries
2560
+
2561
+ Raises:
2562
+ Exception: If the status check fails.
2563
+ """
2564
+ headers = self._prepare_headers()
2565
+ try:
2566
+ response = self._get_request(f'{self.api_url}/v1/deep-research/{id}', headers)
2567
+ if response.status_code == 200:
2568
+ try:
2569
+ return response.json()
2570
+ except:
2571
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2572
+ elif response.status_code == 404:
2573
+ raise Exception('Deep research job not found')
2574
+ else:
2575
+ self._handle_error(response, 'check deep research status')
2576
+ except Exception as e:
2577
+ raise ValueError(str(e))
2578
+
2579
+ return {'success': False, 'error': 'Internal server error'}
2580
+
2581
+ def _validate_kwargs(self, kwargs: Dict[str, Any], method_name: str) -> None:
2582
+ """
2583
+ Validate additional keyword arguments before they are passed to the API.
2584
+ This provides early validation before the Pydantic model validation.
2585
+
2586
+ Args:
2587
+ kwargs (Dict[str, Any]): Additional keyword arguments to validate
2588
+ method_name (str): Name of the method these kwargs are for
2589
+
2590
+ Raises:
2591
+ ValueError: If kwargs contain invalid or unsupported parameters
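+
+ Example (illustrative):
+ >>> app._validate_kwargs({"foo": 1}, "scrape_url")
+ Traceback (most recent call last):
+ ...
+ ValueError: Unsupported parameter(s) for scrape_url: foo. Please refer to the API documentation for the correct parameters.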
2592
+ """
2593
+ if not kwargs:
2594
+ return
2595
+
2596
+ # Known parameter mappings for each method
2597
+ method_params = {
2598
+ "scrape_url": {"formats", "include_tags", "exclude_tags", "only_main_content", "wait_for",
2599
+ "timeout", "location", "mobile", "skip_tls_verification", "remove_base64_images",
2600
+ "block_ads", "proxy", "extract", "json_options", "actions", "change_tracking_options", "integration"},
2601
+ "search": {"limit", "tbs", "filter", "lang", "country", "location", "timeout", "scrape_options", "integration"},
2602
+ "crawl_url": {"include_paths", "exclude_paths", "max_depth", "max_discovery_depth", "limit",
2603
+ "allow_backward_links", "allow_external_links", "ignore_sitemap", "scrape_options",
2604
+ "webhook", "deduplicate_similar_urls", "ignore_query_parameters", "regex_on_full_url", "integration"},
2605
+ "map_url": {"search", "ignore_sitemap", "include_subdomains", "sitemap_only", "limit", "timeout", "integration"},
2606
+ "extract": {"prompt", "schema", "system_prompt", "allow_external_links", "enable_web_search", "show_sources", "agent", "integration"},
2607
+ "batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2608
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2609
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2610
+ "actions", "agent", "webhook"},
2611
+ "async_batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2612
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2613
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2614
+ "actions", "agent", "webhook"},
2615
+ "batch_scrape_urls_and_watch": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2616
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2617
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2618
+ "actions", "agent", "webhook"}
2619
+ }
2620
+
2621
+ # Get allowed parameters for this method
2622
+ allowed_params = method_params.get(method_name, set())
2623
+
2624
+ # Check for unknown parameters
2625
+ unknown_params = set(kwargs.keys()) - allowed_params
2626
+ if unknown_params:
2627
+ raise ValueError(f"Unsupported parameter(s) for {method_name}: {', '.join(unknown_params)}. Please refer to the API documentation for the correct parameters.")
2628
+
2629
+ # Additional type validation can be added here if needed
2630
+ # For now, we rely on Pydantic models for detailed type validation
2631
+
2632
+ def _ensure_schema_dict(self, schema):
2633
+ """
2634
+ Utility to ensure a schema is a dict, not a Pydantic model class. Recursively checks dicts and lists.
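+
+ Example (illustrative; `app` is any FirecrawlApp instance):
+ >>> class Product(pydantic.BaseModel):
+ ... name: str
+ >>> isinstance(app._ensure_schema_dict(Product), dict)
+ True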
2635
+ """
2636
+ if schema is None:
2637
+ return schema
2638
+ if isinstance(schema, type):
2639
+ # Pydantic v1/v2 model class
2640
+ if hasattr(schema, 'model_json_schema'):
2641
+ return schema.model_json_schema()
2642
+ elif hasattr(schema, 'schema'):
2643
+ return schema.schema()
2644
+ if isinstance(schema, dict):
2645
+ return {k: self._ensure_schema_dict(v) for k, v in schema.items()}
2646
+ if isinstance(schema, (list, tuple)):
2647
+ return [self._ensure_schema_dict(v) for v in schema]
2648
+ return schema
2649
+
2650
+ class CrawlWatcher:
2651
+ """
2652
+ A class to watch and handle crawl job events via WebSocket connection.
2653
+
2654
+ Attributes:
2655
+ id (str): The ID of the crawl job to watch
2656
+ app (FirecrawlApp): The FirecrawlApp instance
2657
+ data (List[Dict[str, Any]]): List of crawled documents/data
2658
+ status (str): Current status of the crawl job
2659
+ ws_url (str): WebSocket URL for the crawl job
2660
+ event_handlers (dict): Dictionary of event type to list of handler functions
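+
+ Example (illustrative sketch; see batch_scrape_urls_and_watch above for obtaining a watcher):
+ >>> watcher.add_event_listener("done", lambda event: print(len(event["data"]), "documents"))
+ >>> watcher.add_event_listener("error", lambda event: print("failed:", event["error"]))
+ >>> asyncio.run(watcher.connect())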
2661
+ """
2662
+ def __init__(self, id: str, app: FirecrawlApp):
2663
+ self.id = id
2664
+ self.app = app
2665
+ self.data: List[Dict[str, Any]] = []
2666
+ self.status = "scraping"
2667
+ self.ws_url = f"{app.api_url.replace('http', 'ws')}/v1/crawl/{id}"
2668
+ self.event_handlers = {
2669
+ 'done': [],
2670
+ 'error': [],
2671
+ 'document': []
2672
+ }
2673
+
2674
+ async def connect(self) -> None:
2675
+ """
2676
+ Establishes WebSocket connection and starts listening for messages.
2677
+ """
2678
+ async with websockets.connect(
2679
+ self.ws_url,
2680
+ max_size=None,
2681
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
2682
+ ) as websocket:
2683
+ await self._listen(websocket)
2684
+
2685
+ async def _listen(self, websocket) -> None:
2686
+ """
2687
+ Listens for incoming WebSocket messages and handles them.
2688
+
2689
+ Args:
2690
+ websocket: The WebSocket connection object
2691
+ """
2692
+ async for message in websocket:
2693
+ msg = json.loads(message)
2694
+ await self._handle_message(msg)
2695
+
2696
+ def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
2697
+ """
2698
+ Adds an event handler function for a specific event type.
2699
+
2700
+ Args:
2701
+ event_type (str): Type of event to listen for ('done', 'error', or 'document')
2702
+ handler (Callable): Function to handle the event
2703
+ """
2704
+ if event_type in self.event_handlers:
2705
+ self.event_handlers[event_type].append(handler)
2706
+
2707
+ def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
2708
+ """
2709
+ Dispatches an event to all registered handlers for that event type.
2710
+
2711
+ Args:
2712
+ event_type (str): Type of event to dispatch
2713
+ detail (Dict[str, Any]): Event details/data to pass to handlers
2714
+ """
2715
+ if event_type in self.event_handlers:
2716
+ for handler in self.event_handlers[event_type]:
2717
+ handler(detail)
2718
+
2719
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
2720
+ """
2721
+ Handles incoming WebSocket messages based on their type.
2722
+
2723
+ Args:
2724
+ msg (Dict[str, Any]): The message to handle
2725
+ """
2726
+ if msg['type'] == 'done':
2727
+ self.status = 'completed'
2728
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
2729
+ elif msg['type'] == 'error':
2730
+ self.status = 'failed'
2731
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
2732
+ elif msg['type'] == 'catchup':
2733
+ self.status = msg['data']['status']
2734
+ self.data.extend(msg['data'].get('data', []))
2735
+ for doc in self.data:
2736
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
2737
+ elif msg['type'] == 'document':
2738
+ self.data.append(msg['data'])
2739
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
2740
+
2741
+ class AsyncFirecrawlApp(FirecrawlApp):
2742
+ """
2743
+ Asynchronous version of FirecrawlApp that implements async methods using aiohttp.
2744
+ Provides non-blocking alternatives to all FirecrawlApp operations.
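+
+ Example (illustrative sketch; assumes the awaitable counterparts defined on this class
+ further below, such as an async scrape_url, and a placeholder API key):
+ >>> import asyncio
+ >>> async def main():
+ ... app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
+ ... return await app.scrape_url("https://example.com", formats=["markdown"])
+ >>> doc = asyncio.run(main())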
2745
+ """
2746
+
2747
+ async def _async_request(
2748
+ self,
2749
+ method: str,
2750
+ url: str,
2751
+ headers: Dict[str, str],
2752
+ data: Optional[Dict[str, Any]] = None,
2753
+ retries: int = 3,
2754
+ backoff_factor: float = 0.5) -> Dict[str, Any]:
2755
+ """
2756
+ Generic async request method with exponential backoff retry logic.
2757
+
2758
+ Args:
2759
+ method (str): The HTTP method to use (e.g., "GET" or "POST").
2760
+ url (str): The URL to send the request to.
2761
+ headers (Dict[str, str]): Headers to include in the request.
2762
+ data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
2763
+ retries (int): Maximum number of retry attempts (default: 3).
2764
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2765
+ Delay will be backoff_factor * (2 ** retry_count).
2766
+
2767
+ Returns:
2768
+ Dict[str, Any]: The parsed JSON response from the server.
2769
+
2770
+ Raises:
2771
+ aiohttp.ClientError: If the request fails after all retries.
2772
+ Exception: If max retries are exceeded or other errors occur.
2773
+ """
2774
+ async with aiohttp.ClientSession() as session:
2775
+ for attempt in range(retries):
2776
+ try:
2777
+ async with session.request(
2778
+ method=method, url=url, headers=headers, json=data
2779
+ ) as response:
2780
+ if response.status == 502:
2781
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2782
+ continue
2783
+ if response.status >= 300:
2784
+ await self._handle_error(response, f"make {method} request")
2785
+ return await response.json()
2786
+ except aiohttp.ClientError as e:
2787
+ if attempt == retries - 1:
2788
+ raise e
2789
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2790
+ raise Exception("Max retries exceeded")
2791
+
2792
+ async def _async_post_request(
2793
+ self, url: str, data: Dict[str, Any], headers: Dict[str, str],
2794
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2795
+ """
2796
+ Make an async POST request with exponential backoff retry logic.
2797
+
2798
+ Args:
2799
+ url (str): The URL to send the POST request to.
2800
+ data (Dict[str, Any]): The JSON data to include in the request body.
2801
+ headers (Dict[str, str]): Headers to include in the request.
2802
+ retries (int): Maximum number of retry attempts (default: 3).
2803
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2804
+ Delay will be backoff_factor * (2 ** retry_count).
2805
+
2806
+ Returns:
2807
+ Dict[str, Any]: The parsed JSON response from the server.
2808
+
2809
+ Raises:
2810
+ aiohttp.ClientError: If the request fails after all retries.
2811
+ Exception: If max retries are exceeded or other errors occur.
2812
+ """
2813
+ return await self._async_request("POST", url, headers, data, retries, backoff_factor)
2814
+
2815
+ async def _async_get_request(
2816
+ self, url: str, headers: Dict[str, str],
2817
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2818
+ """
2819
+ Make an async GET request with exponential backoff retry logic.
2820
+
2821
+ Args:
2822
+ url (str): The URL to send the GET request to.
2823
+ headers (Dict[str, str]): Headers to include in the request.
2824
+ retries (int): Maximum number of retry attempts (default: 3).
2825
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2826
+ Delay will be backoff_factor * (2 ** retry_count).
2827
+
2828
+ Returns:
2829
+ Dict[str, Any]: The parsed JSON response from the server.
2830
+
2831
+ Raises:
2832
+ aiohttp.ClientError: If the request fails after all retries.
2833
+ Exception: If max retries are exceeded or other errors occur.
2834
+ """
2835
+ return await self._async_request("GET", url, headers, None, retries, backoff_factor)
2836
+
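With the default backoff_factor of 0.5 and three retries, the request helpers above sleep 0.5 s, 1 s, and 2 s between attempts. A quick sketch of that schedule:

```python
# Delay used by _async_request before retry N: backoff_factor * (2 ** attempt)
backoff_factor = 0.5
retries = 3
delays = [backoff_factor * (2 ** attempt) for attempt in range(retries)]
print(delays)  # [0.5, 1.0, 2.0] seconds
```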
2837
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
2838
+ """
2839
+ Handle errors from async API responses with detailed error messages.
2840
+
2841
+ Args:
2842
+ response (aiohttp.ClientResponse): The response object from the failed request
2843
+ action (str): Description of the action that was being attempted
2844
+
2845
+ Raises:
2846
+ aiohttp.ClientError: With a detailed error message based on the response status:
2847
+ - 402: Payment Required
2848
+ - 408: Request Timeout
2849
+ - 409: Conflict
2850
+ - 500: Internal Server Error
2851
+ - Other: Unexpected error with status code
2852
+ """
2853
+ try:
2854
+ error_data = await response.json()
2855
+ error_message = error_data.get('error', 'No error message provided.')
2856
+ error_details = error_data.get('details', 'No additional error details provided.')
2857
+ except:
2858
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
2859
+
2860
+ message = await self._get_async_error_message(response.status, action, error_message, error_details)
2861
+
2862
+ raise aiohttp.ClientError(message)
2863
+
2864
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2865
+ """
2866
+ Generate a standardized error message based on HTTP status code for async operations.
2867
+
2868
+ Args:
2869
+ status_code (int): The HTTP status code from the response
2870
+ action (str): Description of the action that was being performed
2871
+ error_message (str): The error message from the API response
2872
+ error_details (str): Additional error details from the API response
2873
+
2874
+ Returns:
2875
+ str: A formatted error message
2876
+ """
2877
+ return self._get_error_message(status_code, action, error_message, error_details)
2878
+
2879
+ async def crawl_url_and_watch(
2880
+ self,
2881
+ url: str,
2882
+ params: Optional[CrawlParams] = None,
2883
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2884
+ """
2885
+ Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket.
2886
+
2887
+ Args:
2888
+ url (str): Target URL to start crawling from
2889
+ params (Optional[CrawlParams]): See CrawlParams model for configuration:
2890
+ URL Discovery:
2891
+ * includePaths - Patterns of URLs to include
2892
+ * excludePaths - Patterns of URLs to exclude
2893
+ * maxDepth - Maximum crawl depth
2894
+ * maxDiscoveryDepth - Maximum depth for finding new URLs
2895
+ * limit - Maximum pages to crawl
2896
+
2897
+ Link Following:
2898
+ * allowBackwardLinks - DEPRECATED: Use crawlEntireDomain instead
2899
+ * crawlEntireDomain - Follow parent directory links
2900
+ * allowExternalLinks - Follow external domain links
2901
+ * ignoreSitemap - Skip sitemap.xml processing
2902
+
2903
+ Advanced:
2904
+ * scrapeOptions - Page scraping configuration
2905
+ * webhook - Notification webhook settings
2906
+ * deduplicateSimilarURLs - Remove similar URLs
2907
+ * ignoreQueryParameters - Ignore URL parameters
2908
+ * regexOnFullURL - Apply regex to full URLs
2909
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2910
+
2911
+ Returns:
2912
+ AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket
2913
+
2914
+ Raises:
2915
+ Exception: If crawl job fails to start
2916
+ """
2917
+ crawl_response = await self.async_crawl_url(url, idempotency_key=idempotency_key, **(params.dict(exclude_none=True) if params else {}))
2918
+ if crawl_response.success and crawl_response.id:
2919
+ return AsyncCrawlWatcher(crawl_response.id, self)
2920
+ else:
2921
+ raise Exception("Crawl job failed to start")
2922
+
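A hedged usage sketch for the watcher flow. The firecrawl import path, the api_key constructor argument, and the watcher's add_event_listener/connect methods are assumptions; the registration body appears earlier in this diff, but its signature and the connect method fall outside this section.

```python
import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed import path

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor
    watcher = await app.crawl_url_and_watch("https://example.com")
    # add_event_listener and connect are assumed names; registration appends
    # the handler to watcher.event_handlers[event_type] as shown above.
    watcher.add_event_listener("document", lambda detail: print("page:", detail["id"]))
    watcher.add_event_listener("done", lambda detail: print("crawl finished"))
    await watcher.connect()  # assumed: opens the WebSocket and pumps messages

asyncio.run(main())
```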
2923
+ async def batch_scrape_urls_and_watch(
2924
+ self,
2925
+ urls: List[str],
2926
+ params: Optional[ScrapeParams] = None,
2927
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2928
+ """
2929
+ Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress.
2930
+
2931
+ Args:
2932
+ urls (List[str]): List of URLs to scrape
2933
+ params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
2934
+
2935
+ Content Options:
2936
+ * formats - Content formats to retrieve
2937
+ * includeTags - HTML tags to include
2938
+ * excludeTags - HTML tags to exclude
2939
+ * onlyMainContent - Extract main content only
2940
+
2941
+ Request Options:
2942
+ * headers - Custom HTTP headers
2943
+ * timeout - Request timeout (ms)
2944
+ * mobile - Use mobile user agent
2945
+ * proxy - Proxy type
2946
+
2947
+ Extraction Options:
2948
+ * extract - Content extraction config
2949
+ * jsonOptions - JSON extraction config
2950
+ * actions - Actions to perform
2951
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2952
+
2953
+ Returns:
2954
+ AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket
2955
+
2956
+ Raises:
2957
+ Exception: If batch scrape job fails to start
2958
+ """
2959
+ batch_response = await self.async_batch_scrape_urls(urls, idempotency_key=idempotency_key, **(params.dict(exclude_none=True) if params else {}))
2960
+ if batch_response.success and batch_response.id:
2961
+ return AsyncCrawlWatcher(batch_response.id, self)
2962
+ else:
2963
+ raise Exception("Batch scrape job failed to start")
2964
+
2965
+ async def scrape_url(
2966
+ self,
2967
+ url: str,
2968
+ *,
2969
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
2970
+ headers: Optional[Dict[str, str]] = None,
2971
+ include_tags: Optional[List[str]] = None,
2972
+ exclude_tags: Optional[List[str]] = None,
2973
+ only_main_content: Optional[bool] = None,
2974
+ wait_for: Optional[int] = None,
2975
+ timeout: Optional[int] = None,
2976
+ location: Optional[LocationConfig] = None,
2977
+ mobile: Optional[bool] = None,
2978
+ skip_tls_verification: Optional[bool] = None,
2979
+ remove_base64_images: Optional[bool] = None,
2980
+ block_ads: Optional[bool] = None,
2981
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
2982
+ parse_pdf: Optional[bool] = None,
2983
+ extract: Optional[JsonConfig] = None,
2984
+ json_options: Optional[JsonConfig] = None,
2985
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
2986
+ **kwargs) -> ScrapeResponse[Any]:
2987
+ """
2988
+ Scrape a single URL asynchronously.
2989
+
2990
+ Args:
2991
+ url (str): Target URL to scrape
2992
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
2993
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
2994
+ include_tags (Optional[List[str]]): HTML tags to include
2995
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
2996
+ only_main_content (Optional[bool]): Extract main content only
2997
+ wait_for (Optional[int]): Wait time in milliseconds before capturing content
2998
+ timeout (Optional[int]): Request timeout (ms)
2999
+ location (Optional[LocationConfig]): Location configuration
3000
+ mobile (Optional[bool]): Use mobile user agent
3001
+ skip_tls_verification (Optional[bool]): Skip TLS verification
3002
+ remove_base64_images (Optional[bool]): Remove base64 images
3003
+ block_ads (Optional[bool]): Block ads
3004
+ proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth/auto)
3005
+ extract (Optional[JsonConfig]): Content extraction settings
3006
+ json_options (Optional[JsonConfig]): JSON extraction settings
3007
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
3008
+ **kwargs: Additional parameters to pass to the API
3009
+
3010
+ Returns:
3011
+ ScrapeResponse with:
3012
+ * success - Whether scrape was successful
3013
+ * markdown - Markdown content if requested
3014
+ * html - HTML content if requested
3015
+ * rawHtml - Raw HTML content if requested
3016
+ * links - Extracted links if requested
3017
+ * screenshot - Screenshot if requested
3018
+ * extract - Extracted data if requested
3019
+ * json - JSON data if requested
3020
+ * error - Error message if scrape failed
3021
+
3022
+ Raises:
3023
+ Exception: If scraping fails
3024
+ """
3025
+ # Validate any additional kwargs
3026
+ self._validate_kwargs(kwargs, "scrape_url")
3027
+
3028
+ _headers = self._prepare_headers()
3029
+
3030
+ # Build scrape parameters
3031
+ scrape_params = {
3032
+ 'url': url,
3033
+ 'origin': f"python-sdk@{version}"
3034
+ }
3035
+
3036
+ # Add optional parameters if provided and not None
3037
+ if formats:
3038
+ scrape_params['formats'] = formats
3039
+ if headers:
3040
+ scrape_params['headers'] = headers
3041
+ if include_tags:
3042
+ scrape_params['includeTags'] = include_tags
3043
+ if exclude_tags:
3044
+ scrape_params['excludeTags'] = exclude_tags
3045
+ if only_main_content is not None:
3046
+ scrape_params['onlyMainContent'] = only_main_content
3047
+ if wait_for:
3048
+ scrape_params['waitFor'] = wait_for
3049
+ if timeout:
3050
+ scrape_params['timeout'] = timeout
3051
+ if location:
3052
+ scrape_params['location'] = location.dict(exclude_none=True)
3053
+ if mobile is not None:
3054
+ scrape_params['mobile'] = mobile
3055
+ if skip_tls_verification is not None:
3056
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3057
+ if remove_base64_images is not None:
3058
+ scrape_params['removeBase64Images'] = remove_base64_images
3059
+ if block_ads is not None:
3060
+ scrape_params['blockAds'] = block_ads
3061
+ if proxy:
3062
+ scrape_params['proxy'] = proxy
3063
+ if parse_pdf is not None:
3064
+ scrape_params['parsePDF'] = parse_pdf
3065
+ if extract is not None:
3066
+ extract = self._ensure_schema_dict(extract)
3067
+ if isinstance(extract, dict) and "schema" in extract:
3068
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
3069
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3070
+ if json_options is not None:
3071
+ json_options = self._ensure_schema_dict(json_options)
3072
+ if isinstance(json_options, dict) and "schema" in json_options:
3073
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3074
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3075
+ if actions:
3076
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
3077
+ if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
3078
+ scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
3079
+ if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
3080
+ scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
3081
+
3082
+ # Make async request
3083
+ endpoint = f'/v1/scrape'
3084
+ response = await self._async_post_request(
3085
+ f'{self.api_url}{endpoint}',
3086
+ scrape_params,
3087
+ _headers
3088
+ )
3089
+
3090
+ if response.get('success') and 'data' in response:
3091
+ return ScrapeResponse(**response['data'])
3092
+ elif "error" in response:
3093
+ raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
3094
+ else:
3095
+ # Use the response content directly if possible, otherwise a generic message
3096
+ error_content = response.get('error', str(response))
3097
+ raise Exception(f'Failed to scrape URL. Error: {error_content}')
3098
+
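A usage sketch for the async scrape_url above; the import path and the api_key constructor argument are assumptions carried over from the synchronous client, not confirmed by this diff:

```python
import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed import path

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor
    # Keyword arguments map 1:1 onto the camelCase fields assembled in scrape_params.
    doc = await app.scrape_url(
        "https://example.com",
        formats=["markdown", "links"],
        only_main_content=True,
        timeout=30000,  # milliseconds
    )
    print(doc.markdown[:200] if doc.markdown else "no markdown returned")

asyncio.run(main())
```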
3099
+ async def batch_scrape_urls(
3100
+ self,
3101
+ urls: List[str],
3102
+ *,
3103
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3104
+ headers: Optional[Dict[str, str]] = None,
3105
+ include_tags: Optional[List[str]] = None,
3106
+ exclude_tags: Optional[List[str]] = None,
3107
+ only_main_content: Optional[bool] = None,
3108
+ wait_for: Optional[int] = None,
3109
+ timeout: Optional[int] = None,
3110
+ location: Optional[LocationConfig] = None,
3111
+ mobile: Optional[bool] = None,
3112
+ skip_tls_verification: Optional[bool] = None,
3113
+ remove_base64_images: Optional[bool] = None,
3114
+ block_ads: Optional[bool] = None,
3115
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
3116
+ extract: Optional[JsonConfig] = None,
3117
+ json_options: Optional[JsonConfig] = None,
3118
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
3119
+ agent: Optional[AgentOptions] = None,
3120
+ poll_interval: Optional[int] = 2,
3121
+ idempotency_key: Optional[str] = None,
3122
+ **kwargs
3123
+ ) -> BatchScrapeStatusResponse:
3124
+ """
3125
+ Asynchronously scrape multiple URLs and monitor until completion.
3126
+
3127
+ Args:
3128
+ urls (List[str]): URLs to scrape
3129
+ formats (Optional[List[Literal]]): Content formats to retrieve
3130
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
3131
+ include_tags (Optional[List[str]]): HTML tags to include
3132
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
3133
+ only_main_content (Optional[bool]): Extract main content only
3134
+ wait_for (Optional[int]): Wait time in milliseconds
3135
+ timeout (Optional[int]): Request timeout in milliseconds
3136
+ location (Optional[LocationConfig]): Location configuration
3137
+ mobile (Optional[bool]): Use mobile user agent
3138
+ skip_tls_verification (Optional[bool]): Skip TLS verification
3139
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
3140
+ block_ads (Optional[bool]): Block advertisements
3141
+ proxy (Optional[Literal]): Proxy type to use
3142
+ extract (Optional[JsonConfig]): Content extraction config
3143
+ json_options (Optional[JsonConfig]): JSON extraction config
3144
+ actions (Optional[List[Union]]): Actions to perform
3145
+ agent (Optional[AgentOptions]): Agent configuration
3146
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
3147
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3148
+ **kwargs: Additional parameters to pass to the API
3149
+
3150
+ Returns:
3151
+ BatchScrapeStatusResponse with:
3152
+ * Scraping status and progress
3153
+ * Scraped content for each URL
3154
+ * Success/error information
3155
+
3156
+ Raises:
3157
+ Exception: If batch scrape fails
3158
+ """
3159
+ # Validate any additional kwargs
3160
+ self._validate_kwargs(kwargs, "batch_scrape_urls")
3161
+
3162
+ scrape_params = {}
3163
+
3164
+ # Add individual parameters
3165
+ if formats is not None:
3166
+ scrape_params['formats'] = formats
3167
+ if headers is not None:
3168
+ scrape_params['headers'] = headers
3169
+ if include_tags is not None:
3170
+ scrape_params['includeTags'] = include_tags
3171
+ if exclude_tags is not None:
3172
+ scrape_params['excludeTags'] = exclude_tags
3173
+ if only_main_content is not None:
3174
+ scrape_params['onlyMainContent'] = only_main_content
3175
+ if wait_for is not None:
3176
+ scrape_params['waitFor'] = wait_for
3177
+ if timeout is not None:
3178
+ scrape_params['timeout'] = timeout
3179
+ if location is not None:
3180
+ scrape_params['location'] = location.dict(exclude_none=True)
3181
+ if mobile is not None:
3182
+ scrape_params['mobile'] = mobile
3183
+ if skip_tls_verification is not None:
3184
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3185
+ if remove_base64_images is not None:
3186
+ scrape_params['removeBase64Images'] = remove_base64_images
3187
+ if block_ads is not None:
3188
+ scrape_params['blockAds'] = block_ads
3189
+ if proxy is not None:
3190
+ scrape_params['proxy'] = proxy
3191
+ if extract is not None:
3192
+ extract = self._ensure_schema_dict(extract)
3193
+ if isinstance(extract, dict) and "schema" in extract:
3194
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
3195
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3196
+ if json_options is not None:
3197
+ json_options = self._ensure_schema_dict(json_options)
3198
+ if isinstance(json_options, dict) and "schema" in json_options:
3199
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3200
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3201
+ if actions:
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
+
3202
+ if agent is not None:
3203
+ scrape_params['agent'] = agent.dict(exclude_none=True)
3204
+
3205
+ # Add any additional kwargs
3206
+ scrape_params.update(kwargs)
3207
+
3208
+ # Create final params object
3209
+ final_params = ScrapeParams(**scrape_params)
3210
+ params_dict = final_params.dict(exclude_none=True)
3211
+ params_dict['urls'] = urls
3212
+ params_dict['origin'] = f"python-sdk@{version}"
3213
+
3214
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3215
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3216
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3217
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3218
+
3219
+ # Make request
3220
+ headers = self._prepare_headers(idempotency_key)
3221
+ response = await self._async_post_request(
3222
+ f'{self.api_url}/v1/batch/scrape',
3223
+ params_dict,
3224
+ headers
3225
+ )
3226
+
3227
+ if response.get('success'):
3228
+ try:
3229
+ id = response.get('id')
3230
+ except:
3231
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3232
+ return await self._async_monitor_job_status(id, headers, poll_interval)
3233
+ else:
3234
+ await self._handle_error(response, 'start batch scrape job')
3235
+
3236
+
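batch_scrape_urls blocks, polling every poll_interval seconds, until the whole batch finishes. A sketch under the same import/constructor assumptions as the earlier example:

```python
import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed import path

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor
    job = await app.batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
        poll_interval=5,  # seconds between status checks
    )
    print(job.status, job.completed, "of", job.total)

asyncio.run(main())
```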
3237
+ async def async_batch_scrape_urls(
3238
+ self,
3239
+ urls: List[str],
3240
+ *,
3241
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3242
+ headers: Optional[Dict[str, str]] = None,
3243
+ include_tags: Optional[List[str]] = None,
3244
+ exclude_tags: Optional[List[str]] = None,
3245
+ only_main_content: Optional[bool] = None,
3246
+ wait_for: Optional[int] = None,
3247
+ timeout: Optional[int] = None,
3248
+ location: Optional[LocationConfig] = None,
3249
+ mobile: Optional[bool] = None,
3250
+ skip_tls_verification: Optional[bool] = None,
3251
+ remove_base64_images: Optional[bool] = None,
3252
+ block_ads: Optional[bool] = None,
3253
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
3254
+ extract: Optional[JsonConfig] = None,
3255
+ json_options: Optional[JsonConfig] = None,
3256
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
3257
+ agent: Optional[AgentOptions] = None,
3258
+ zero_data_retention: Optional[bool] = None,
3259
+ idempotency_key: Optional[str] = None,
3260
+ **kwargs
3261
+ ) -> BatchScrapeResponse:
3262
+ """
3263
+ Initiate a batch scrape job asynchronously.
3264
+
3265
+ Args:
3266
+ urls (List[str]): URLs to scrape
3267
+ formats (Optional[List[Literal]]): Content formats to retrieve
3268
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
3269
+ include_tags (Optional[List[str]]): HTML tags to include
3270
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
3271
+ only_main_content (Optional[bool]): Extract main content only
3272
+ wait_for (Optional[int]): Wait time in milliseconds
3273
+ timeout (Optional[int]): Request timeout in milliseconds
3274
+ location (Optional[LocationConfig]): Location configuration
3275
+ mobile (Optional[bool]): Use mobile user agent
3276
+ skip_tls_verification (Optional[bool]): Skip TLS verification
3277
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
3278
+ block_ads (Optional[bool]): Block advertisements
3279
+ proxy (Optional[Literal]): Proxy type to use
3280
+ extract (Optional[JsonConfig]): Content extraction config
3281
+ json_options (Optional[JsonConfig]): JSON extraction config
3282
+ actions (Optional[List[Union]]): Actions to perform
3283
+ agent (Optional[AgentOptions]): Agent configuration
3284
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
3285
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3286
+ **kwargs: Additional parameters to pass to the API
3287
+
3288
+ Returns:
3289
+ BatchScrapeResponse with:
3290
+ * success - Whether job started successfully
3291
+ * id - Unique identifier for the job
3292
+ * url - Status check URL
3293
+ * error - Error message if start failed
3294
+
3295
+ Raises:
3296
+ Exception: If job initiation fails
3297
+ """
3298
+ # Validate any additional kwargs
3299
+ self._validate_kwargs(kwargs, "async_batch_scrape_urls")
3300
+
3301
+ scrape_params = {}
3302
+
3303
+ # Add individual parameters
3304
+ if formats is not None:
3305
+ scrape_params['formats'] = formats
3306
+ if headers is not None:
3307
+ scrape_params['headers'] = headers
3308
+ if include_tags is not None:
3309
+ scrape_params['includeTags'] = include_tags
3310
+ if exclude_tags is not None:
3311
+ scrape_params['excludeTags'] = exclude_tags
3312
+ if only_main_content is not None:
3313
+ scrape_params['onlyMainContent'] = only_main_content
3314
+ if wait_for is not None:
3315
+ scrape_params['waitFor'] = wait_for
3316
+ if timeout is not None:
3317
+ scrape_params['timeout'] = timeout
3318
+ if location is not None:
3319
+ scrape_params['location'] = location.dict(exclude_none=True)
3320
+ if mobile is not None:
3321
+ scrape_params['mobile'] = mobile
3322
+ if skip_tls_verification is not None:
3323
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3324
+ if remove_base64_images is not None:
3325
+ scrape_params['removeBase64Images'] = remove_base64_images
3326
+ if block_ads is not None:
3327
+ scrape_params['blockAds'] = block_ads
3328
+ if proxy is not None:
3329
+ scrape_params['proxy'] = proxy
3330
+ if extract is not None:
3331
+ extract = self._ensure_schema_dict(extract)
3332
+ if isinstance(extract, dict) and "schema" in extract:
3333
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
3334
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3335
+ if json_options is not None:
3336
+ json_options = self._ensure_schema_dict(json_options)
3337
+ if isinstance(json_options, dict) and "schema" in json_options:
3338
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3339
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3340
+ if actions:
3341
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
3342
+ if agent is not None:
3343
+ scrape_params['agent'] = agent.dict(exclude_none=True)
3344
+ if zero_data_retention is not None:
3345
+ scrape_params['zeroDataRetention'] = zero_data_retention
3346
+
3347
+ # Add any additional kwargs
3348
+ scrape_params.update(kwargs)
3349
+
3350
+ # Create final params object
3351
+ final_params = ScrapeParams(**scrape_params)
3352
+ params_dict = final_params.dict(exclude_none=True)
3353
+ params_dict['urls'] = urls
3354
+ params_dict['origin'] = f"python-sdk@{version}"
3355
+
3356
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3357
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3358
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3359
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3360
+
3361
+ # Make request
3362
+ headers = self._prepare_headers(idempotency_key)
3363
+ response = await self._async_post_request(
3364
+ f'{self.api_url}/v1/batch/scrape',
3365
+ params_dict,
3366
+ headers
3367
+ )
3368
+
3369
+ if response.get('success'):
3370
+ try:
3371
+ return BatchScrapeResponse(**response)
3372
+ except:
3373
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3374
+ else:
3375
+ await self._handle_error(response, 'start batch scrape job')
3376
+
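In contrast, async_batch_scrape_urls only starts the job and returns immediately; the job id can be checked later with check_batch_scrape_status. A sketch, with the same assumptions:

```python
import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed import path

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor
    started = await app.async_batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
    )
    # Store the id and poll check_batch_scrape_status(started.id) later.
    print(started.id)

asyncio.run(main())
```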
3377
+ async def crawl_url(
3378
+ self,
3379
+ url: str,
3380
+ *,
3381
+ include_paths: Optional[List[str]] = None,
3382
+ exclude_paths: Optional[List[str]] = None,
3383
+ max_depth: Optional[int] = None,
3384
+ max_discovery_depth: Optional[int] = None,
3385
+ limit: Optional[int] = None,
3386
+ allow_backward_links: Optional[bool] = None,
3387
+ crawl_entire_domain: Optional[bool] = None,
3388
+ allow_external_links: Optional[bool] = None,
3389
+ ignore_sitemap: Optional[bool] = None,
3390
+ scrape_options: Optional[ScrapeOptions] = None,
3391
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3392
+ deduplicate_similar_urls: Optional[bool] = None,
3393
+ ignore_query_parameters: Optional[bool] = None,
3394
+ regex_on_full_url: Optional[bool] = None,
3395
+ delay: Optional[int] = None,
3396
+ allow_subdomains: Optional[bool] = None,
3397
+ poll_interval: Optional[int] = 2,
3398
+ idempotency_key: Optional[str] = None,
3399
+ **kwargs
3400
+ ) -> CrawlStatusResponse:
3401
+ """
3402
+ Crawl a website starting from a URL.
3403
+
3404
+ Args:
3405
+ url (str): Target URL to start crawling from
3406
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3407
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3408
+ max_depth (Optional[int]): Maximum crawl depth
3409
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3410
+ limit (Optional[int]): Maximum pages to crawl
3411
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
3412
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
3413
+ allow_external_links (Optional[bool]): Follow external domain links
3414
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3415
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3416
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3417
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3418
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3419
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3420
+ delay (Optional[int]): Delay in seconds between scrapes
3421
+ allow_subdomains (Optional[bool]): Follow subdomains
3422
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
3423
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3424
+ **kwargs: Additional parameters to pass to the API
3425
+
3426
+ Returns:
3427
+ CrawlStatusResponse with:
3428
+ * Crawling status and progress
3429
+ * Crawled page contents
3430
+ * Success/error information
3431
+
3432
+ Raises:
3433
+ Exception: If crawl fails
3434
+ """
3435
+ # Validate any additional kwargs
3436
+ self._validate_kwargs(kwargs, "crawl_url")
3437
+
3438
+ crawl_params = {}
3439
+
3440
+ # Add individual parameters
3441
+ if include_paths is not None:
3442
+ crawl_params['includePaths'] = include_paths
3443
+ if exclude_paths is not None:
3444
+ crawl_params['excludePaths'] = exclude_paths
3445
+ if max_depth is not None:
3446
+ crawl_params['maxDepth'] = max_depth
3447
+ if max_discovery_depth is not None:
3448
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3449
+ if limit is not None:
3450
+ crawl_params['limit'] = limit
3451
+ if crawl_entire_domain is not None:
3452
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
3453
+ elif allow_backward_links is not None:
3454
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3455
+ if allow_external_links is not None:
3456
+ crawl_params['allowExternalLinks'] = allow_external_links
3457
+ if ignore_sitemap is not None:
3458
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3459
+ if scrape_options is not None:
3460
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3461
+ if webhook is not None:
3462
+ crawl_params['webhook'] = webhook
3463
+ if deduplicate_similar_urls is not None:
3464
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3465
+ if ignore_query_parameters is not None:
3466
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3467
+ if regex_on_full_url is not None:
3468
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3469
+ if delay is not None:
3470
+ crawl_params['delay'] = delay
3471
+ if allow_subdomains is not None:
3472
+ crawl_params['allowSubdomains'] = allow_subdomains
3473
+
3474
+ # Add any additional kwargs
3475
+ crawl_params.update(kwargs)
3476
+
3477
+ # Create final params object
3478
+ final_params = CrawlParams(**crawl_params)
3479
+ params_dict = final_params.dict(exclude_none=True)
3480
+ params_dict['url'] = url
3481
+ params_dict['origin'] = f"python-sdk@{version}"
3482
+ # Make request
3483
+ headers = self._prepare_headers(idempotency_key)
3484
+ response = await self._async_post_request(
3485
+ f'{self.api_url}/v1/crawl', params_dict, headers)
3486
+
3487
+ if response.get('success'):
3488
+ try:
3489
+ id = response.get('id')
3490
+ except:
3491
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3492
+ return await self._async_monitor_job_status(id, headers, poll_interval)
3493
+ else:
3494
+ await self._handle_error(response, 'start crawl job')
3495
+
3496
+
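A crawl sketch for the blocking crawl_url wrapper, under the same import/constructor assumptions:

```python
import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed import path

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor
    result = await app.crawl_url(
        "https://example.com",
        include_paths=["/blog/.*"],  # only follow blog URLs
        limit=25,                    # stop after 25 pages
        poll_interval=5,
    )
    print(result.status, len(result.data or []), "pages crawled")

asyncio.run(main())
```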
3497
+ async def async_crawl_url(
3498
+ self,
3499
+ url: str,
3500
+ *,
3501
+ include_paths: Optional[List[str]] = None,
3502
+ exclude_paths: Optional[List[str]] = None,
3503
+ max_depth: Optional[int] = None,
3504
+ max_discovery_depth: Optional[int] = None,
3505
+ limit: Optional[int] = None,
3506
+ allow_backward_links: Optional[bool] = None,
3507
+ crawl_entire_domain: Optional[bool] = None,
3508
+ allow_external_links: Optional[bool] = None,
3509
+ ignore_sitemap: Optional[bool] = None,
3510
+ scrape_options: Optional[ScrapeOptions] = None,
3511
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3512
+ deduplicate_similar_urls: Optional[bool] = None,
3513
+ ignore_query_parameters: Optional[bool] = None,
3514
+ regex_on_full_url: Optional[bool] = None,
3515
+ delay: Optional[int] = None,
3516
+ allow_subdomains: Optional[bool] = None,
3517
+ poll_interval: Optional[int] = 2,
3518
+ idempotency_key: Optional[str] = None,
3519
+ **kwargs
3520
+ ) -> CrawlResponse:
3521
+ """
3522
+ Start an asynchronous crawl job.
3523
+
3524
+ Args:
3525
+ url (str): Target URL to start crawling from
3526
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3527
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3528
+ max_depth (Optional[int]): Maximum crawl depth
3529
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3530
+ limit (Optional[int]): Maximum pages to crawl
3531
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
3532
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
3533
+ allow_external_links (Optional[bool]): Follow external domain links
3534
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3535
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3536
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3537
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3538
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3539
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3540
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3541
+ **kwargs: Additional parameters to pass to the API
3542
+
3543
+ Returns:
3544
+ CrawlResponse with:
3545
+ * success - Whether crawl started successfully
3546
+ * id - Unique identifier for the crawl job
3547
+ * url - Status check URL for the crawl
3548
+ * error - Error message if start failed
3549
+
3550
+ Raises:
3551
+ Exception: If crawl initiation fails
3552
+ """
3553
+ crawl_params = {}
3554
+
3555
+ # Add individual parameters
3556
+ if include_paths is not None:
3557
+ crawl_params['includePaths'] = include_paths
3558
+ if exclude_paths is not None:
3559
+ crawl_params['excludePaths'] = exclude_paths
3560
+ if max_depth is not None:
3561
+ crawl_params['maxDepth'] = max_depth
3562
+ if max_discovery_depth is not None:
3563
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3564
+ if limit is not None:
3565
+ crawl_params['limit'] = limit
3566
+ if crawl_entire_domain is not None:
3567
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
3568
+ elif allow_backward_links is not None:
3569
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3570
+ if allow_external_links is not None:
3571
+ crawl_params['allowExternalLinks'] = allow_external_links
3572
+ if ignore_sitemap is not None:
3573
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3574
+ if scrape_options is not None:
3575
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3576
+ if webhook is not None:
3577
+ crawl_params['webhook'] = webhook
3578
+ if deduplicate_similar_urls is not None:
3579
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3580
+ if ignore_query_parameters is not None:
3581
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3582
+ if regex_on_full_url is not None:
3583
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3584
+ if delay is not None:
3585
+ crawl_params['delay'] = delay
3586
+ if allow_subdomains is not None:
3587
+ crawl_params['allowSubdomains'] = allow_subdomains
3588
+
3589
+ # Add any additional kwargs
3590
+ crawl_params.update(kwargs)
3591
+
3592
+ # Create final params object
3593
+ final_params = CrawlParams(**crawl_params)
3594
+ params_dict = final_params.dict(exclude_none=True)
3595
+ params_dict['url'] = url
3596
+ params_dict['origin'] = f"python-sdk@{version}"
3597
+
3598
+ # Make request
3599
+ headers = self._prepare_headers(idempotency_key)
3600
+ response = await self._async_post_request(
3601
+ f'{self.api_url}/v1/crawl',
3602
+ params_dict,
3603
+ headers
3604
+ )
3605
+
3606
+ if response.get('success'):
3607
+ try:
3608
+ return CrawlResponse(**response)
3609
+ except:
3610
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3611
+ else:
3612
+ await self._handle_error(response, 'start crawl job')
3613
+
3614
+ async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
3615
+ """
3616
+ Check the status and results of an asynchronous crawl job.
3617
+
3618
+ Args:
3619
+ id (str): Unique identifier for the crawl job
3620
+
3621
+ Returns:
3622
+ CrawlStatusResponse containing:
3623
+ Status Information:
3624
+ * status - Current state (scraping/completed/failed/cancelled)
3625
+ * completed - Number of pages crawled
3626
+ * total - Total pages to crawl
3627
+ * creditsUsed - API credits consumed
3628
+ * expiresAt - Data expiration timestamp
3629
+
3630
+ Results:
3631
+ * data - List of crawled documents
3632
+ * next - URL for next page of results (if paginated)
3633
+ * success - Whether status check succeeded
3634
+ * error - Error message if failed
3635
+
3636
+ Raises:
3637
+ Exception: If status check fails
3638
+ """
3639
+ headers = self._prepare_headers()
3640
+ endpoint = f'/v1/crawl/{id}'
3641
+
3642
+ status_data = await self._async_get_request(
3643
+ f'{self.api_url}{endpoint}',
3644
+ headers
3645
+ )
3646
+
3647
+ if status_data.get('status') == 'completed':
3648
+ if 'data' in status_data:
3649
+ data = status_data['data']
3650
+ while 'next' in status_data:
3651
+ if len(status_data['data']) == 0:
3652
+ break
3653
+ next_url = status_data.get('next')
3654
+ if not next_url:
3655
+ logger.warning("Expected 'next' URL is missing.")
3656
+ break
3657
+ next_data = await self._async_get_request(next_url, headers)
3658
+ data.extend(next_data.get('data', []))
3659
+ status_data = next_data
3660
+ status_data['data'] = data
3661
+ # Create CrawlStatusResponse object from status data
3662
+ response = CrawlStatusResponse(
3663
+ status=status_data.get('status'),
3664
+ total=status_data.get('total'),
3665
+ completed=status_data.get('completed'),
3666
+ creditsUsed=status_data.get('creditsUsed'),
3667
+ expiresAt=status_data.get('expiresAt'),
3668
+ data=status_data.get('data'),
3669
+ success=False if 'error' in status_data else True
3670
+ )
3671
+
3672
+ if 'error' in status_data:
3673
+ response.error = status_data.get('error')
3674
+
3675
+ if 'next' in status_data:
3676
+ response.next = status_data.get('next')
3677
+
3678
+ return response
3679
+
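Starting a crawl with async_crawl_url and polling check_crawl_status by hand gives the same result as crawl_url without blocking inside the SDK. A sketch (same assumptions as the earlier examples):

```python
import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed import path

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor
    started = await app.async_crawl_url("https://example.com", limit=10)
    while True:
        status = await app.check_crawl_status(started.id)
        print(status.status, status.completed, "of", status.total)
        if status.status in ("completed", "failed", "cancelled"):
            break
        await asyncio.sleep(5)

asyncio.run(main())
```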
3680
+ async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
3681
+ """
3682
+ Monitor the status of an asynchronous job until completion.
3683
+
3684
+ Args:
3685
+ id (str): The ID of the job to monitor
3686
+ headers (Dict[str, str]): Headers to include in status check requests
3687
+ poll_interval (int): Seconds between status checks (default: 2)
3688
+
3689
+ Returns:
3690
+ CrawlStatusResponse: The job results if completed successfully
3691
+
3692
+ Raises:
3693
+ Exception: If the job fails or an error occurs during status checks
3694
+ """
3695
+ while True:
3696
+ status_data = await self._async_get_request(
3697
+ f'{self.api_url}/v1/crawl/{id}',
3698
+ headers
3699
+ )
3700
+
3701
+ if status_data.get('status') == 'completed':
3702
+ if 'data' in status_data:
3703
+ data = status_data['data']
3704
+ while 'next' in status_data:
3705
+ if len(status_data['data']) == 0:
3706
+ break
3707
+ next_url = status_data.get('next')
3708
+ if not next_url:
3709
+ logger.warning("Expected 'next' URL is missing.")
3710
+ break
3711
+ next_data = await self._async_get_request(next_url, headers)
3712
+ data.extend(next_data.get('data', []))
3713
+ status_data = next_data
3714
+ status_data['data'] = data
3715
+ return CrawlStatusResponse(**status_data)
3716
+ else:
3717
+ raise Exception('Job completed but no data was returned')
3718
+ elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
3719
+ await asyncio.sleep(max(poll_interval, 2))
3720
+ else:
3721
+ raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
3722
+
3723
+ async def map_url(
3724
+ self,
3725
+ url: str,
3726
+ *,
3727
+ search: Optional[str] = None,
3728
+ ignore_sitemap: Optional[bool] = None,
3729
+ include_subdomains: Optional[bool] = None,
3730
+ sitemap_only: Optional[bool] = None,
3731
+ limit: Optional[int] = None,
3732
+ timeout: Optional[int] = None,
3733
+ params: Optional[MapParams] = None) -> MapResponse:
3734
+ """
3735
+ Asynchronously map and discover links from a URL.
3736
+
3737
+ Args:
3738
+ url (str): Target URL to map
3739
+ params (Optional[MapParams]): See MapParams model:
3740
+ Discovery Options:
3741
+ * search - Filter pattern for URLs
3742
+ * ignoreSitemap - Skip sitemap.xml
3743
+ * includeSubdomains - Include subdomain links
3744
+ * sitemapOnly - Only use sitemap.xml
3745
+
3746
+ Limits:
3747
+ * limit - Max URLs to return
3748
+ * timeout - Request timeout (ms)
3749
+
3750
+ Returns:
3751
+ MapResponse with:
3752
+ * Discovered URLs
3753
+ * Success/error status
3754
+
3755
+ Raises:
3756
+ Exception: If mapping fails
3757
+ """
3758
+ map_params = {}
3759
+ if params:
3760
+ map_params.update(params.dict(exclude_none=True))
3761
+
3762
+ # Add individual parameters
3763
+ if search is not None:
3764
+ map_params['search'] = search
3765
+ if ignore_sitemap is not None:
3766
+ map_params['ignoreSitemap'] = ignore_sitemap
3767
+ if include_subdomains is not None:
3768
+ map_params['includeSubdomains'] = include_subdomains
3769
+ if sitemap_only is not None:
3770
+ map_params['sitemapOnly'] = sitemap_only
3771
+ if limit is not None:
3772
+ map_params['limit'] = limit
3773
+ if timeout is not None:
3774
+ map_params['timeout'] = timeout
3775
+
3776
+ # Create final params object
3777
+ final_params = MapParams(**map_params)
3778
+ params_dict = final_params.dict(exclude_none=True)
3779
+ params_dict['url'] = url
3780
+ params_dict['origin'] = f"python-sdk@{version}"
3781
+
3782
+ # Make request
3783
+ endpoint = f'/v1/map'
3784
+ response = await self._async_post_request(
3785
+ f'{self.api_url}{endpoint}',
3786
+ params_dict,
3787
+ headers={"Authorization": f"Bearer {self.api_key}"}
3788
+ )
3789
+
3790
+ if response.get('success') and 'links' in response:
3791
+ return MapResponse(**response)
3792
+ elif 'error' in response:
3793
+ raise Exception(f'Failed to map URL. Error: {response["error"]}')
3794
+ else:
3795
+ raise Exception(f'Failed to map URL. Error: {response}')
3796
+
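A sketch for map_url, which returns discovered links rather than page content (same assumptions):

```python
import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed import path

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor
    mapped = await app.map_url("https://example.com", search="docs", limit=100)
    for link in mapped.links or []:
        print(link)

asyncio.run(main())
```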
3797
+ async def extract(
3798
+ self,
3799
+ urls: Optional[List[str]] = None,
3800
+ *,
3801
+ prompt: Optional[str] = None,
3802
+ schema: Optional[Any] = None,
3803
+ system_prompt: Optional[str] = None,
3804
+ allow_external_links: Optional[bool] = False,
3805
+ enable_web_search: Optional[bool] = False,
3806
+ show_sources: Optional[bool] = False,
3807
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3808
+
3809
+ """
3810
+ Asynchronously extract structured information from URLs.
3811
+
3812
+ Args:
3813
+ urls (Optional[List[str]]): URLs to extract from
3814
+ prompt (Optional[str]): Custom extraction prompt
3815
+ schema (Optional[Any]): JSON schema/Pydantic model
3816
+ system_prompt (Optional[str]): System context
3817
+ allow_external_links (Optional[bool]): Follow external links
3818
+ enable_web_search (Optional[bool]): Enable web search
3819
+ show_sources (Optional[bool]): Include source URLs
3820
+ agent (Optional[Dict[str, Any]]): Agent configuration
3821
+
3822
+ Returns:
3823
+ ExtractResponse with:
3824
+ * Structured data matching schema
3825
+ * Source information if requested
3826
+ * Success/error status
3827
+
3828
+ Raises:
3829
+ ValueError: If prompt/schema missing or extraction fails
3830
+ """
3831
+ headers = self._prepare_headers()
3832
+
3833
+ if not prompt and not schema:
3834
+ raise ValueError("Either prompt or schema is required")
3835
+
3836
+ if not urls and not prompt:
3837
+ raise ValueError("Either urls or prompt is required")
3838
+
3839
+ if schema:
3840
+ schema = self._ensure_schema_dict(schema)
3841
+
3842
+ request_data = {
3843
+ 'urls': urls or [],
3844
+ 'allowExternalLinks': allow_external_links,
3845
+ 'enableWebSearch': enable_web_search,
3846
+ 'showSources': show_sources,
3847
+ 'schema': schema,
3848
+ 'origin': f'python-sdk@{get_version()}'
3849
+ }
3850
+
3851
+ # Only add prompt and systemPrompt if they exist
3852
+ if prompt:
3853
+ request_data['prompt'] = prompt
3854
+ if system_prompt:
3855
+ request_data['systemPrompt'] = system_prompt
3856
+
3857
+ if agent:
3858
+ request_data['agent'] = agent
3859
+
3860
+ response = await self._async_post_request(
3861
+ f'{self.api_url}/v1/extract',
3862
+ request_data,
3863
+ headers
3864
+ )
3865
+
3866
+ if response.get('success'):
3867
+ job_id = response.get('id')
3868
+ if not job_id:
3869
+ raise Exception('Job ID not returned from extract request.')
3870
+
3871
+ while True:
3872
+ status_data = await self._async_get_request(
3873
+ f'{self.api_url}/v1/extract/{job_id}',
3874
+ headers
3875
+ )
3876
+
3877
+ if status_data['status'] == 'completed':
3878
+ return ExtractResponse(**status_data)
3879
+ elif status_data['status'] in ['failed', 'cancelled']:
3880
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
3881
+
3882
+ await asyncio.sleep(2)
3883
+ else:
3884
+ raise Exception(f'Failed to extract. Error: {response.get("error")}')
3885
+
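Because extract accepts a Pydantic model as the schema (it is converted via _ensure_schema_dict before the request is sent), structured extraction can be sketched like this, again assuming the import path and constructor:

```python
import asyncio
import pydantic
from firecrawl import AsyncFirecrawlApp  # assumed import path

class Article(pydantic.BaseModel):
    title: str
    author: str

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor
    result = await app.extract(
        ["https://example.com/post"],
        prompt="Extract the article title and author",
        schema=Article,  # converted to a JSON schema internally
    )
    print(result.data)

asyncio.run(main())
```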
3886
+ async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
3887
+ """
3888
+ Check the status of an asynchronous batch scrape job.
3889
+
3890
+ Args:
3891
+ id (str): The ID of the batch scrape job
3892
+
3893
+ Returns:
3894
+ BatchScrapeStatusResponse containing:
3895
+ Status Information:
3896
+ * status - Current state (scraping/completed/failed/cancelled)
3897
+ * completed - Number of URLs scraped
3898
+ * total - Total URLs to scrape
3899
+ * creditsUsed - API credits consumed
3900
+ * expiresAt - Data expiration timestamp
3901
+
3902
+ Results:
3903
+ * data - List of scraped documents
3904
+ * next - URL for next page of results (if paginated)
3905
+ * success - Whether status check succeeded
3906
+ * error - Error message if failed
3907
+
3908
+ Raises:
3909
+ Exception: If status check fails
3910
+ """
3911
+ headers = self._prepare_headers()
3912
+ endpoint = f'/v1/batch/scrape/{id}'
3913
+
3914
+ status_data = await self._async_get_request(
3915
+ f'{self.api_url}{endpoint}',
3916
+ headers
3917
+ )
3918
+
3919
+ if status_data['status'] == 'completed':
3920
+ if 'data' in status_data:
3921
+ data = status_data['data']
3922
+ while 'next' in status_data:
3923
+ if len(status_data['data']) == 0:
3924
+ break
3925
+ next_url = status_data.get('next')
3926
+ if not next_url:
3927
+ logger.warning("Expected 'next' URL is missing.")
3928
+ break
3929
+ next_data = await self._async_get_request(next_url, headers)
3930
+ data.extend(next_data.get('data', []))
3931
+ status_data = next_data
3932
+ status_data['data'] = data
3933
+
3934
+ response = BatchScrapeStatusResponse(
3935
+ status=status_data.get('status'),
3936
+ total=status_data.get('total'),
3937
+ completed=status_data.get('completed'),
3938
+ creditsUsed=status_data.get('creditsUsed'),
3939
+ expiresAt=status_data.get('expiresAt'),
3940
+ data=status_data.get('data'),
3941
+ success=False if 'error' in status_data else True
3942
+ )
3943
+
3944
+ if 'error' in status_data:
3945
+ response.error = status_data['error']
3946
+
3947
+ if 'next' in status_data:
3948
+ response.next = status_data['next']
3949
+
3950
+ return response
3953
+
3954
+ async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
3955
+ """
3956
+ Get information about errors from an asynchronous batch scrape job.
3957
+
3958
+ Args:
3959
+ id (str): The ID of the batch scrape job
3960
+
3961
+ Returns:
3962
+ CrawlErrorsResponse containing:
3963
+ errors (List[Dict[str, str]]): List of errors with fields:
3964
+ * id (str): Error ID
3965
+ * timestamp (str): When the error occurred
3966
+ * url (str): URL that caused the error
3967
+ * error (str): Error message
3968
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3969
+
3970
+ Raises:
3971
+ Exception: If error check fails
3972
+ """
3973
+ headers = self._prepare_headers()
3974
+ return await self._async_get_request(
3975
+ f'{self.api_url}/v1/batch/scrape/{id}/errors',
3976
+ headers
3977
+ )
3978
+
3979
+ async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
3980
+ """
3981
+ Get information about errors from an asynchronous crawl job.
3982
+
3983
+ Args:
3984
+ id (str): The ID of the crawl job
3985
+
3986
+ Returns:
3987
+ CrawlErrorsResponse containing:
3988
+ * errors (List[Dict[str, str]]): List of errors with fields:
3989
+ - id (str): Error ID
3990
+ - timestamp (str): When the error occurred
3991
+ - url (str): URL that caused the error
3992
+ - error (str): Error message
3993
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3994
+
3995
+ Raises:
3996
+ Exception: If error check fails
3997
+ """
3998
+ headers = self._prepare_headers()
3999
+ return await self._async_get_request(
4000
+ f'{self.api_url}/v1/crawl/{id}/errors',
4001
+ headers
4002
+ )
4003
+
4004
+ async def cancel_crawl(self, id: str) -> Dict[str, Any]:
4005
+ """
4006
+ Cancel an asynchronous crawl job.
4007
+
4008
+ Args:
4009
+ id (str): The ID of the crawl job to cancel
4010
+
4011
+ Returns:
4012
+ Dict[str, Any] containing:
4013
+ * success (bool): Whether cancellation was successful
4014
+ * error (str, optional): Error message if cancellation failed
4015
+
4016
+ Raises:
4017
+ Exception: If cancellation fails
4018
+ """
4019
+ headers = self._prepare_headers()
4020
+ async with aiohttp.ClientSession() as session:
4021
+ async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
4022
+ return await response.json()
4023
+
4024
+ async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
4025
+ """
4026
+ Check the status of an asynchronous extraction job.
4027
+
4028
+ Args:
4029
+ job_id (str): The ID of the extraction job
4030
+
4031
+ Returns:
4032
+ ExtractResponse[Any] with:
4033
+ * success (bool): Whether request succeeded
4034
+ * data (Optional[Any]): Extracted data matching schema
4035
+ * error (Optional[str]): Error message if any
4036
+ * warning (Optional[str]): Warning message if any
4037
+ * sources (Optional[List[str]]): Source URLs if requested
4038
+
4039
+ Raises:
4040
+ ValueError: If status check fails
4041
+ """
4042
+ headers = self._prepare_headers()
4043
+ try:
4044
+ return await self._async_get_request(
4045
+ f'{self.api_url}/v1/extract/{job_id}',
4046
+ headers
4047
+ )
4048
+ except Exception as e:
4049
+ raise ValueError(str(e))
4050
+
4051
+ async def async_extract(
4052
+ self,
4053
+ urls: Optional[List[str]] = None,
4054
+ *,
4055
+ prompt: Optional[str] = None,
4056
+ schema: Optional[Any] = None,
4057
+ system_prompt: Optional[str] = None,
4058
+ allow_external_links: Optional[bool] = False,
4059
+ enable_web_search: Optional[bool] = False,
4060
+ show_sources: Optional[bool] = False,
4061
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
4062
+ """
4063
+ Initiate an asynchronous extraction job without waiting for completion.
4064
+
4065
+ Args:
4066
+ urls (Optional[List[str]]): URLs to extract from
4067
+ prompt (Optional[str]): Custom extraction prompt
4068
+ schema (Optional[Any]): JSON schema/Pydantic model
4069
+ system_prompt (Optional[str]): System context
4070
+ allow_external_links (Optional[bool]): Follow external links
4071
+ enable_web_search (Optional[bool]): Enable web search
4072
+ show_sources (Optional[bool]): Include source URLs
4073
+ agent (Optional[Dict[str, Any]]): Agent configuration
4074
+
4076
+ Returns:
4077
+ ExtractResponse[Any] with:
4078
+ * success (bool): Whether request succeeded
4079
+ * data (Optional[Any]): Extracted data matching schema
4080
+ * error (Optional[str]): Error message if any
4081
+
4082
+ Raises:
4083
+ ValueError: If job initiation fails
4084
+ """
4085
+ headers = self._prepare_headers()
4086
+
4087
+ if not prompt and not schema:
4088
+ raise ValueError("Either prompt or schema is required")
4089
+
4090
+ if not urls and not prompt:
4091
+ raise ValueError("Either urls or prompt is required")
4092
+
4093
+ if schema:
4094
+ schema = self._ensure_schema_dict(schema)
4095
+
4096
+ request_data = {
4097
+ 'urls': urls or [],
4098
+ 'allowExternalLinks': allow_external_links,
4099
+ 'enableWebSearch': enable_web_search,
4100
+ 'showSources': show_sources,
4101
+ 'schema': schema,
4102
+ 'origin': f'python-sdk@{version}'
4103
+ }
4104
+
4105
+ if prompt:
4106
+ request_data['prompt'] = prompt
4107
+ if system_prompt:
4108
+ request_data['systemPrompt'] = system_prompt
4109
+ if agent:
4110
+ request_data['agent'] = agent
4111
+
4112
+ try:
4113
+ return await self._async_post_request(
4114
+ f'{self.api_url}/v1/extract',
4115
+ request_data,
4116
+ headers
4117
+ )
4118
+ except Exception as e:
4119
+ raise ValueError(str(e))
4120
+
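async_extract and get_extract_status return the parsed JSON payload from the request helpers above, so a manual polling loop can be sketched with dict access (same import/constructor assumptions):

```python
import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed import path

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor
    started = await app.async_extract(
        ["https://example.com"],
        prompt="List the product names on this page",
    )
    job_id = started.get("id")  # parsed JSON payload, not a model instance
    while True:
        status = await app.get_extract_status(job_id)
        if status.get("status") in ("completed", "failed", "cancelled"):
            break
        await asyncio.sleep(2)
    print(status.get("status"), status.get("data"))

asyncio.run(main())
```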
4121
+ async def generate_llms_text(
4122
+ self,
4123
+ url: str,
4124
+ *,
4125
+ max_urls: Optional[int] = None,
4126
+ show_full_text: Optional[bool] = None,
4127
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
4128
+ """
4129
+ Generate LLMs.txt for a given URL and monitor until completion.
4130
+
4131
+ Args:
4132
+ url (str): Target URL to generate LLMs.txt from
4133
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
4134
+ show_full_text (Optional[bool]): Include full text in output (default: False)
4135
+ experimental_stream (Optional[bool]): Enable experimental streaming
4136
+
4137
+ Returns:
4138
+ GenerateLLMsTextStatusResponse containing:
4139
+ * success (bool): Whether generation completed successfully
4140
+ * status (str): Status of generation (processing/completed/failed)
4141
+ * data (Dict[str, str], optional): Generated text with fields:
4142
+ - llmstxt (str): Generated LLMs.txt content
4143
+ - llmsfulltxt (str, optional): Full version if requested
4144
+ * error (str, optional): Error message if generation failed
4145
+ * expiresAt (str): When the generated data expires
4146
+
4147
+ Raises:
4148
+ Exception: If generation fails
4149
+ """
4150
+ params = {}
4151
+ if max_urls is not None:
4152
+ params['maxUrls'] = max_urls
4153
+ if show_full_text is not None:
4154
+ params['showFullText'] = show_full_text
4155
+ if experimental_stream is not None:
4156
+ params['__experimental_stream'] = experimental_stream
4157
+
4158
+ response = await self.async_generate_llms_text(
4159
+ url,
4160
+ max_urls=max_urls,
4161
+ show_full_text=show_full_text,
4162
+ experimental_stream=experimental_stream
4164
+ )
4165
+ if not response.get('success') or 'id' not in response:
4166
+ return response
4167
+
4168
+ job_id = response['id']
4169
+ while True:
4170
+ status = await self.check_generate_llms_text_status(job_id)
4171
+
4172
+ if status['status'] == 'completed':
4173
+ return status
4174
+ elif status['status'] == 'failed':
4175
+ raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
4176
+ elif status['status'] != 'processing':
4177
+ break
4178
+
4179
+ await asyncio.sleep(2)
4180
+
4181
+ return GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly')
4182
+
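A sketch for generate_llms_text, which polls until the generation job finishes and returns the status payload (same assumptions as the earlier sketches):

```python
import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed import path

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # assumed constructor
    result = await app.generate_llms_text("https://example.com", max_urls=10)
    # On success the generated text is returned under data["llmstxt"].
    if isinstance(result, dict) and result.get("success"):
        print(result["data"]["llmstxt"])

asyncio.run(main())
```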
4183
+ async def async_generate_llms_text(
4184
+ self,
4185
+ url: str,
4186
+ *,
4187
+ max_urls: Optional[int] = None,
4188
+ show_full_text: Optional[bool] = None,
4189
+ cache: Optional[bool] = None,
4190
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
4191
+ """
4192
+ Initiate an asynchronous LLMs.txt generation job without waiting for completion.
4193
+
4194
+ Args:
4195
+ url (str): Target URL to generate LLMs.txt from
4196
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
4197
+ show_full_text (Optional[bool]): Include full text in output (default: False)
4198
+ cache (Optional[bool]): Whether to use cached content if available (default: True)
4199
+ experimental_stream (Optional[bool]): Enable experimental streaming
4200
+
4201
+ Returns:
4202
+ GenerateLLMsTextResponse containing:
4203
+ * success (bool): Whether job started successfully
4204
+ * id (str): Unique identifier for the job
4205
+ * error (str, optional): Error message if start failed
4206
+
4207
+ Raises:
4208
+ ValueError: If job initiation fails
4209
+ """
4210
+ params = {}
4211
+ if max_urls is not None:
4212
+ params['maxUrls'] = max_urls
4213
+ if show_full_text is not None:
4214
+ params['showFullText'] = show_full_text
4215
+ if experimental_stream is not None:
4216
+ params['__experimental_stream'] = experimental_stream
4217
+
4218
+ params = GenerateLLMsTextParams(
4219
+ maxUrls=max_urls,
4220
+ showFullText=show_full_text,
4221
+ cache=cache,
4222
+ __experimental_stream=experimental_stream
4223
+ )
4224
+
4225
+ headers = self._prepare_headers()
4226
+ json_data = {'url': url, **params.dict(exclude_none=True)}
4227
+ json_data['origin'] = f"python-sdk@{version}"
4228
+
4229
+ try:
4230
+ return await self._async_post_request(
4231
+ f'{self.api_url}/v1/llmstxt',
4232
+ json_data,
4233
+ headers
4234
+ )
4235
+ except Exception as e:
4236
+ raise ValueError(str(e))
4237
+
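+ # Illustrative sketch (not part of the SDK source): start a generation job
+ # without waiting for it. Assumes `app` is an AsyncFirecrawlApp with a valid key:
+ #
+ # started = await app.async_generate_llms_text("https://example.com", max_urls=10)
+ # job_id = started["id"] if started.get("success") else None
+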
4238
+ async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
4239
+ """
4240
+ Check the status of an asynchronous LLMs.txt generation job.
4241
+
4242
+ Args:
4243
+ id (str): The ID of the generation job
4244
+
4245
+ Returns:
4246
+ GenerateLLMsTextStatusResponse containing:
4247
+ * success (bool): Whether generation completed successfully
4248
+ * status (str): Status of generation (processing/completed/failed)
4249
+ * data (Dict[str, str], optional): Generated text with fields:
4250
+ - llmstxt (str): Generated LLMs.txt content
4251
+ - llmsfulltxt (str, optional): Full version if requested
4252
+ * error (str, optional): Error message if generation failed
4253
+ * expiresAt (str): When the generated data expires
4254
+
4255
+ Raises:
4256
+ ValueError: If status check fails
4257
+ """
4258
+ headers = self._prepare_headers()
4259
+ try:
4260
+ return await self._async_get_request(
4261
+ f'{self.api_url}/v1/llmstxt/{id}',
4262
+ headers
4263
+ )
4264
+ except Exception as e:
4265
+ raise ValueError(str(e))
4266
+
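+ # Illustrative polling sketch (not part of the SDK source), assuming `job_id`
+ # came from async_generate_llms_text above:
+ #
+ # while True:
+ #     status = await app.check_generate_llms_text_status(job_id)
+ #     if status["status"] in ("completed", "failed"):
+ #         break
+ #     await asyncio.sleep(2)
+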
4267
+ async def deep_research(
4268
+ self,
4269
+ query: str,
4270
+ *,
4271
+ max_depth: Optional[int] = None,
4272
+ time_limit: Optional[int] = None,
4273
+ max_urls: Optional[int] = None,
4274
+ analysis_prompt: Optional[str] = None,
4275
+ system_prompt: Optional[str] = None,
4276
+ __experimental_stream_steps: Optional[bool] = None,
4277
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
4278
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
4279
+ """
4280
+ Initiates a deep research operation on a given query and polls until completion.
4281
+
4282
+ Args:
4283
+ query (str): Research query or topic to investigate
4284
+ max_depth (Optional[int]): Maximum depth of research exploration
4285
+ time_limit (Optional[int]): Time limit in seconds for research
4286
+ max_urls (Optional[int]): Maximum number of URLs to process
4287
+ analysis_prompt (Optional[str]): Custom prompt for analysis
4288
+ system_prompt (Optional[str]): Custom system prompt
4289
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4290
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
4291
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
4292
+
4293
+ Returns:
4294
+ DeepResearchStatusResponse containing:
4295
+ * success (bool): Whether research completed successfully
4296
+ * status (str): Current state (processing/completed/failed)
4297
+ * error (Optional[str]): Error message if failed
4298
+ * id (str): Unique identifier for the research job
4299
+ * data (Any): Research findings and analysis
4300
+ * sources (List[Dict]): List of discovered sources
4301
+ * activities (List[Dict]): Research progress log
4302
+ * summaries (List[str]): Generated research summaries
4303
+
4304
+ Raises:
4305
+ Exception: If research fails
4306
+ """
4307
+ research_params = {}
4308
+ if max_depth is not None:
4309
+ research_params['maxDepth'] = max_depth
4310
+ if time_limit is not None:
4311
+ research_params['timeLimit'] = time_limit
4312
+ if max_urls is not None:
4313
+ research_params['maxUrls'] = max_urls
4314
+ if analysis_prompt is not None:
4315
+ research_params['analysisPrompt'] = analysis_prompt
4316
+ if system_prompt is not None:
4317
+ research_params['systemPrompt'] = system_prompt
4318
+ if __experimental_stream_steps is not None:
4319
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
4320
+ research_params = DeepResearchParams(**research_params)
4321
+
4322
+ response = await self.async_deep_research(
4323
+ query,
4324
+ max_depth=max_depth,
4325
+ time_limit=time_limit,
4326
+ max_urls=max_urls,
4327
+ analysis_prompt=analysis_prompt,
4328
+ system_prompt=system_prompt
4329
+ )
4330
+ if not response.get('success') or 'id' not in response:
4331
+ return response
4332
+
4333
+ job_id = response['id']
4334
+ last_activity_count = 0
4335
+ last_source_count = 0
4336
+
4337
+ while True:
4338
+ status = await self.check_deep_research_status(job_id)
4339
+
4340
+ if on_activity and 'activities' in status:
4341
+ new_activities = status['activities'][last_activity_count:]
4342
+ for activity in new_activities:
4343
+ on_activity(activity)
4344
+ last_activity_count = len(status['activities'])
4345
+
4346
+ if on_source and 'sources' in status:
4347
+ new_sources = status['sources'][last_source_count:]
4348
+ for source in new_sources:
4349
+ on_source(source)
4350
+ last_source_count = len(status['sources'])
4351
+
4352
+ if status['status'] == 'completed':
4353
+ return status
4354
+ elif status['status'] == 'failed':
4355
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
4356
+ elif status['status'] != 'processing':
4357
+ break
4358
+
4359
+ await asyncio.sleep(2)
4360
+
4361
+ return DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')
4362
+
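+ # Illustrative usage sketch (not part of the SDK source). The callback simply
+ # prints progress; `app` is assumed to be an AsyncFirecrawlApp with a valid key:
+ #
+ # def log_activity(activity):
+ #     print(activity.get("status"), activity.get("message"))
+ #
+ # research = await app.deep_research(
+ #     "What are the latest approaches to web crawling at scale?",
+ #     max_depth=3,
+ #     time_limit=120,
+ #     max_urls=20,
+ #     on_activity=log_activity
+ # )
+ # print(research.get("data"))
+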
4363
+ async def async_deep_research(
4364
+ self,
4365
+ query: str,
4366
+ *,
4367
+ max_depth: Optional[int] = None,
4368
+ time_limit: Optional[int] = None,
4369
+ max_urls: Optional[int] = None,
4370
+ analysis_prompt: Optional[str] = None,
4371
+ system_prompt: Optional[str] = None,
4372
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
4373
+ """
4374
+ Initiates an asynchronous deep research operation.
4375
+
4376
+ Args:
4377
+ query (str): Research query or topic to investigate
4378
+ max_depth (Optional[int]): Maximum depth of research exploration
4379
+ time_limit (Optional[int]): Time limit in seconds for research
4380
+ max_urls (Optional[int]): Maximum number of URLs to process
4381
+ analysis_prompt (Optional[str]): Custom prompt for analysis
4382
+ system_prompt (Optional[str]): Custom system prompt
4383
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4384
+
4385
+ Returns:
4386
+ Dict[str, Any]: A response containing:
4387
+ * success (bool): Whether the research initiation was successful
4388
+ * id (str): The unique identifier for the research job
4389
+ * error (str, optional): Error message if initiation failed
4390
+
4391
+ Raises:
4392
+ Exception: If the research initiation fails.
4393
+ """
4394
+ research_params = {}
4395
+ if max_depth is not None:
4396
+ research_params['maxDepth'] = max_depth
4397
+ if time_limit is not None:
4398
+ research_params['timeLimit'] = time_limit
4399
+ if max_urls is not None:
4400
+ research_params['maxUrls'] = max_urls
4401
+ if analysis_prompt is not None:
4402
+ research_params['analysisPrompt'] = analysis_prompt
4403
+ if system_prompt is not None:
4404
+ research_params['systemPrompt'] = system_prompt
4405
+ if __experimental_stream_steps is not None:
4406
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
4407
+ research_params = DeepResearchParams(**research_params)
4408
+
4409
+ headers = self._prepare_headers()
4410
+
4411
+ json_data = {'query': query, **research_params.dict(exclude_none=True)}
4412
+ json_data['origin'] = f"python-sdk@{version}"
4413
+
4414
+ try:
4415
+ return await self._async_post_request(
4416
+ f'{self.api_url}/v1/deep-research',
4417
+ json_data,
4418
+ headers
4419
+ )
4420
+ except Exception as e:
4421
+ raise ValueError(str(e))
4422
+
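+ # Illustrative sketch (not part of the SDK source): start a research job and
+ # keep only its id for later polling. Assumes `app` as above:
+ #
+ # started = await app.async_deep_research("History of the llms.txt proposal", max_urls=10)
+ # research_id = started.get("id")
+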
4423
+ async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
4424
+ """
4425
+ Check the status of a deep research operation.
4426
+
4427
+ Args:
4428
+ id (str): The ID of the deep research operation.
4429
+
4430
+ Returns:
4431
+ DeepResearchStatusResponse containing:
4432
+
4433
+ Status:
4434
+ * success - Whether research completed successfully
4435
+ * status - Current state (processing/completed/failed)
4436
+ * error - Error message if failed
4437
+
4438
+ Results:
4439
+ * id - Unique identifier for the research job
4440
+ * data - Research findings and analysis
4441
+ * sources - List of discovered sources
4442
+ * activities - Research progress log
4443
+ * summaries - Generated research summaries
4444
+
4445
+ Raises:
4446
+ Exception: If the status check fails.
4447
+ """
4448
+ headers = self._prepare_headers()
4449
+ try:
4450
+ return await self._async_get_request(
4451
+ f'{self.api_url}/v1/deep-research/{id}',
4452
+ headers
4453
+ )
4454
+ except Exception as e:
4455
+ raise ValueError(str(e))
4456
+
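+ # Illustrative sketch (not part of the SDK source), assuming `research_id`
+ # came from async_deep_research above:
+ #
+ # status = await app.check_deep_research_status(research_id)
+ # if status.get("status") == "completed":
+ #     for source in status.get("sources", []):
+ #         print(source.get("url"))
+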
4457
+ async def search(
4458
+ self,
4459
+ query: str,
4460
+ *,
4461
+ limit: Optional[int] = None,
4462
+ tbs: Optional[str] = None,
4463
+ filter: Optional[str] = None,
4464
+ lang: Optional[str] = None,
4465
+ country: Optional[str] = None,
4466
+ location: Optional[str] = None,
4467
+ timeout: Optional[int] = None,
4468
+ scrape_options: Optional[ScrapeOptions] = None,
4469
+ params: Optional[Union[Dict[str, Any], SearchParams]] = None,
4470
+ **kwargs) -> SearchResponse:
4471
+ """
4472
+ Asynchronously search for content using Firecrawl.
4473
+
4474
+ Args:
4475
+ query (str): Search query string
4476
+ limit (Optional[int]): Max results (default: 5)
4477
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
4478
+ filter (Optional[str]): Custom result filter
4479
+ lang (Optional[str]): Language code (default: "en")
4480
+ country (Optional[str]): Country code (default: "us")
4481
+ location (Optional[str]): Location string used to geo-target results
4482
+ timeout (Optional[int]): Request timeout in milliseconds
4483
+ scrape_options (Optional[ScrapeOptions]): Result scraping configuration
4484
+ params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
4485
+ **kwargs: Additional keyword arguments for future compatibility
4486
+
4487
+ Returns:
4488
+ SearchResponse: Response containing:
4489
+ * success (bool): Whether request succeeded
4490
+ * data (List[FirecrawlDocument]): Search results
4491
+ * warning (Optional[str]): Warning message if any
4492
+ * error (Optional[str]): Error message if any
4493
+
4494
+ Raises:
4495
+ Exception: If search fails or response cannot be parsed
4496
+ """
4497
+ # Build search parameters
4498
+ search_params = {}
4499
+ if params:
4500
+ if isinstance(params, dict):
4501
+ search_params.update(params)
4502
+ else:
4503
+ search_params.update(params.dict(exclude_none=True))
4504
+
4505
+ # Add individual parameters
4506
+ if limit is not None:
4507
+ search_params['limit'] = limit
4508
+ if tbs is not None:
4509
+ search_params['tbs'] = tbs
4510
+ if filter is not None:
4511
+ search_params['filter'] = filter
4512
+ if lang is not None:
4513
+ search_params['lang'] = lang
4514
+ if country is not None:
4515
+ search_params['country'] = country
4516
+ if location is not None:
4517
+ search_params['location'] = location
4518
+ if timeout is not None:
4519
+ search_params['timeout'] = timeout
4520
+ if scrape_options is not None:
4521
+ search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
4522
+
4523
+ # Add any additional kwargs
4524
+ search_params.update(kwargs)
4525
+
4526
+ # Create final params object
4527
+ final_params = SearchParams(query=query, **search_params)
4528
+ params_dict = final_params.dict(exclude_none=True)
4529
+ params_dict['origin'] = f"python-sdk@{version}"
4530
+
4531
+ return await self._async_post_request(
4532
+ f"{self.api_url}/v1/search",
4533
+ params_dict,
4534
+ {"Authorization": f"Bearer {self.api_key}"}
4535
+ )
4536
+
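+ # Illustrative usage sketch (not part of the SDK source). The ScrapeOptions
+ # field shown is an assumption about that model; `app` as above:
+ #
+ # results = await app.search(
+ #     "firecrawl python sdk",
+ #     limit=3,
+ #     tbs="qdr:d",
+ #     scrape_options=ScrapeOptions(formats=["markdown"])
+ # )
+ # for doc in results.get("data", []):
+ #     print(doc.get("url"))
+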
4537
+ class AsyncCrawlWatcher(CrawlWatcher):
4538
+ """
4539
+ Async version of CrawlWatcher that properly handles async operations.
4540
+ """
4541
+ def __init__(self, id: str, app: AsyncFirecrawlApp):
4542
+ super().__init__(id, app)
4543
+
4544
+ async def connect(self) -> None:
4545
+ """
4546
+ Establishes an async WebSocket connection and listens for messages until the socket closes.
4547
+ """
4548
+ async with websockets.connect(
4549
+ self.ws_url,
4550
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
4551
+ ) as websocket:
4552
+ await self._listen(websocket)
4553
+
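+ # Illustrative sketch (not part of the SDK source): attach a watcher to an
+ # already-started crawl and listen until the socket closes. The crawl id is a
+ # made-up placeholder and `app` is assumed to be an AsyncFirecrawlApp:
+ #
+ # watcher = AsyncCrawlWatcher("crawl-job-id", app)
+ # await watcher.connect()
+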
4554
+ async def _listen(self, websocket) -> None:
4555
+ """
4556
+ Listens for incoming WebSocket messages and handles them asynchronously.
4557
+
4558
+ Args:
4559
+ websocket: The WebSocket connection object
4560
+ """
4561
+ async for message in websocket:
4562
+ msg = json.loads(message)
4563
+ await self._handle_message(msg)
4564
+
4565
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
4566
+ """
4567
+ Handles incoming WebSocket messages based on their type asynchronously.
4568
+
4569
+ Args:
4570
+ msg (Dict[str, Any]): The message to handle
4571
+ """
4572
+ if msg['type'] == 'done':
4573
+ self.status = 'completed'
4574
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
4575
+ elif msg['type'] == 'error':
4576
+ self.status = 'failed'
4577
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
4578
+ elif msg['type'] == 'catchup':
4579
+ self.status = msg['data']['status']
4580
+ self.data.extend(msg['data'].get('data', []))
4581
+ for doc in self.data:
4582
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
4583
+ elif msg['type'] == 'document':
4584
+ self.data.append(msg['data'])
4585
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
4586
+
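+ # For reference, the handler above expects messages shaped like these
+ # (illustrative examples; document fields and values are made up):
+ #
+ # {"type": "document", "data": {"markdown": "# Page", "metadata": {...}}}
+ # {"type": "catchup",  "data": {"status": "scraping", "data": [ ... ]}}
+ # {"type": "error",    "error": "Crawl failed"}
+ # {"type": "done"}
+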
4587
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
4588
+ """
4589
+ Handle errors from async API responses.
4590
+ """
4591
+ try:
4592
+ error_data = await response.json()
4593
+ error_message = error_data.get('error', 'No error message provided.')
4594
+ error_details = error_data.get('details', 'No additional error details provided.')
4595
+ except Exception:
4596
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
4597
+
4598
+ # Use the app's method to get the error message
4599
+ message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
4600
+
4601
+ raise aiohttp.ClientError(message)
4602
+
4603
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
4604
+ """
4605
+ Generate a standardized error message based on HTTP status code for async operations.
4606
+
4607
+ Args:
4608
+ status_code (int): The HTTP status code from the response
4609
+ action (str): Description of the action that was being performed
4610
+ error_message (str): The error message from the API response
4611
+ error_details (str): Additional error details from the API response
4612
+
4613
+ Returns:
4614
+ str: A formatted error message
4615
+ """
4616
+ return self._get_error_message(status_code, action, error_message, error_details)