firecrawl-py 2.16.1__py3-none-any.whl → 2.16.3__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.

Potentially problematic release.


This version of firecrawl-py might be problematic.

@@ -0,0 +1,4619 @@
1
+ """
2
+ FirecrawlApp Module
3
+
4
+ This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
5
+ It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
6
+ and check the status of these jobs. The module uses requests for HTTP communication
7
+ and handles retries for certain HTTP status codes.
8
+
9
+ Classes:
10
+ - FirecrawlApp: Main class for interacting with the Firecrawl API.
11
+ """
12
+ import logging
13
+ import os
14
+ import time
15
+ from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
16
+ import json
17
+ from datetime import datetime
18
+ import re
19
+ import warnings
20
+ import requests
21
+ import pydantic
22
+ import websockets
23
+ import aiohttp
24
+ import asyncio
25
+ from pydantic import Field
26
+
27
+ # Suppress Pydantic warnings about attribute shadowing
28
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
29
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
30
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
31
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
32
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingOptions\" shadows an attribute in parent \"BaseModel\"")
33
+
34
+ def get_version():
35
+ try:
36
+ from pathlib import Path
37
+ package_path = os.path.dirname(__file__)
38
+ version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
39
+ version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
40
+ if version_match:
41
+ return version_match.group(1).strip()
42
+ except Exception:
43
+ print("Failed to get version from __init__.py")
44
+ return None
45
+
46
+ version = get_version()
47
+
48
+ logger : logging.Logger = logging.getLogger("firecrawl")
49
+
50
+ T = TypeVar('T')
51
+
52
+ # class FirecrawlDocumentMetadata(pydantic.BaseModel):
53
+ # """Metadata for a Firecrawl document."""
54
+ # title: Optional[str] = None
55
+ # description: Optional[str] = None
56
+ # language: Optional[str] = None
57
+ # keywords: Optional[str] = None
58
+ # robots: Optional[str] = None
59
+ # ogTitle: Optional[str] = None
60
+ # ogDescription: Optional[str] = None
61
+ # ogUrl: Optional[str] = None
62
+ # ogImage: Optional[str] = None
63
+ # ogAudio: Optional[str] = None
64
+ # ogDeterminer: Optional[str] = None
65
+ # ogLocale: Optional[str] = None
66
+ # ogLocaleAlternate: Optional[List[str]] = None
67
+ # ogSiteName: Optional[str] = None
68
+ # ogVideo: Optional[str] = None
69
+ # dctermsCreated: Optional[str] = None
70
+ # dcDateCreated: Optional[str] = None
71
+ # dcDate: Optional[str] = None
72
+ # dctermsType: Optional[str] = None
73
+ # dcType: Optional[str] = None
74
+ # dctermsAudience: Optional[str] = None
75
+ # dctermsSubject: Optional[str] = None
76
+ # dcSubject: Optional[str] = None
77
+ # dcDescription: Optional[str] = None
78
+ # dctermsKeywords: Optional[str] = None
79
+ # modifiedTime: Optional[str] = None
80
+ # publishedTime: Optional[str] = None
81
+ # articleTag: Optional[str] = None
82
+ # articleSection: Optional[str] = None
83
+ # sourceURL: Optional[str] = None
84
+ # statusCode: Optional[int] = None
85
+ # error: Optional[str] = None
86
+
87
+ class AgentOptions(pydantic.BaseModel):
88
+ """Configuration for the agent."""
89
+ model: Literal["FIRE-1"] = "FIRE-1"
90
+ prompt: Optional[str] = None
91
+
92
+ class AgentOptionsExtract(pydantic.BaseModel):
93
+ """Configuration for the agent in extract operations."""
94
+ model: Literal["FIRE-1"] = "FIRE-1"
95
+
96
+ class ActionsResult(pydantic.BaseModel):
97
+ """Result of actions performed during scraping."""
98
+ screenshots: List[str]
99
+ pdfs: List[str]
100
+
101
+ class ChangeTrackingData(pydantic.BaseModel):
102
+ """
103
+ Data for the change tracking format.
104
+ """
105
+ previousScrapeAt: Optional[str] = None
106
+ changeStatus: str # "new" | "same" | "changed" | "removed"
107
+ visibility: str # "visible" | "hidden"
108
+ diff: Optional[Dict[str, Any]] = None
109
+ json: Optional[Any] = None
110
+
111
+ class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
112
+ """Document retrieved or processed by Firecrawl."""
113
+ url: Optional[str] = None
114
+ markdown: Optional[str] = None
115
+ html: Optional[str] = None
116
+ rawHtml: Optional[str] = None
117
+ links: Optional[List[str]] = None
118
+ extract: Optional[T] = None
119
+ json: Optional[T] = None
120
+ screenshot: Optional[str] = None
121
+ metadata: Optional[Any] = None
122
+ actions: Optional[ActionsResult] = None
123
+ title: Optional[str] = None # v1 search only
124
+ description: Optional[str] = None # v1 search only
125
+ changeTracking: Optional[ChangeTrackingData] = None
126
+
127
+ class LocationConfig(pydantic.BaseModel):
128
+ """Location configuration for scraping."""
129
+ country: Optional[str] = None
130
+ languages: Optional[List[str]] = None
131
+
132
+ class WebhookConfig(pydantic.BaseModel):
133
+ """Configuration for webhooks."""
134
+ url: str
135
+ headers: Optional[Dict[str, str]] = None
136
+ metadata: Optional[Dict[str, str]] = None
137
+ events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
138
+
139
+ class ChangeTrackingOptions(pydantic.BaseModel):
140
+ """Configuration for change tracking."""
141
+ modes: Optional[List[Literal["git-diff", "json"]]] = None
142
+ schema: Optional[Any] = None
143
+ prompt: Optional[str] = None
144
+ tag: Optional[str] = None
145
+
146
+ class ScrapeOptions(pydantic.BaseModel):
147
+ """Parameters for scraping operations."""
148
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
149
+ headers: Optional[Dict[str, str]] = None
150
+ includeTags: Optional[List[str]] = None
151
+ excludeTags: Optional[List[str]] = None
152
+ onlyMainContent: Optional[bool] = None
153
+ waitFor: Optional[int] = None
154
+ timeout: Optional[int] = None
155
+ location: Optional[LocationConfig] = None
156
+ mobile: Optional[bool] = None
157
+ skipTlsVerification: Optional[bool] = None
158
+ removeBase64Images: Optional[bool] = None
159
+ blockAds: Optional[bool] = None
160
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None
161
+ changeTrackingOptions: Optional[ChangeTrackingOptions] = None
162
+ maxAge: Optional[int] = None
163
+ storeInCache: Optional[bool] = None
164
+ parsePDF: Optional[bool] = None
165
+
166
+ class WaitAction(pydantic.BaseModel):
167
+ """Wait action to perform during scraping."""
168
+ type: Literal["wait"]
169
+ milliseconds: Optional[int] = None
170
+ selector: Optional[str] = None
171
+
172
+ class ScreenshotAction(pydantic.BaseModel):
173
+ """Screenshot action to perform during scraping."""
174
+ type: Literal["screenshot"]
175
+ fullPage: Optional[bool] = None
176
+ quality: Optional[int] = None
177
+
178
+ class ClickAction(pydantic.BaseModel):
179
+ """Click action to perform during scraping."""
180
+ type: Literal["click"]
181
+ selector: str
182
+
183
+ class WriteAction(pydantic.BaseModel):
184
+ """Write action to perform during scraping."""
185
+ type: Literal["write"]
186
+ text: str
187
+
188
+ class PressAction(pydantic.BaseModel):
189
+ """Press action to perform during scraping."""
190
+ type: Literal["press"]
191
+ key: str
192
+
193
+ class ScrollAction(pydantic.BaseModel):
194
+ """Scroll action to perform during scraping."""
195
+ type: Literal["scroll"]
196
+ direction: Literal["up", "down"]
197
+ selector: Optional[str] = None
198
+
199
+ class ScrapeAction(pydantic.BaseModel):
200
+ """Scrape action to perform during scraping."""
201
+ type: Literal["scrape"]
202
+
203
+ class ExecuteJavascriptAction(pydantic.BaseModel):
204
+ """Execute javascript action to perform during scraping."""
205
+ type: Literal["executeJavascript"]
206
+ script: str
207
+
208
+ class PDFAction(pydantic.BaseModel):
209
+ """PDF action to perform during scraping."""
210
+ type: Literal["pdf"]
211
+ format: Optional[Literal["A0", "A1", "A2", "A3", "A4", "A5", "A6", "Letter", "Legal", "Tabloid", "Ledger"]] = None
212
+ landscape: Optional[bool] = None
213
+ scale: Optional[float] = None
214
+
215
+ class ExtractAgent(pydantic.BaseModel):
216
+ """Configuration for the agent in extract operations."""
217
+ model: Literal["FIRE-1"] = "FIRE-1"
218
+
219
+ class JsonConfig(pydantic.BaseModel):
220
+ """Configuration for extraction."""
221
+ prompt: Optional[str] = None
222
+ schema: Optional[Any] = None
223
+ systemPrompt: Optional[str] = None
224
+ agent: Optional[ExtractAgent] = None
225
+
226
+ class ScrapeParams(ScrapeOptions):
227
+ """Parameters for scraping operations."""
228
+ extract: Optional[JsonConfig] = None
229
+ jsonOptions: Optional[JsonConfig] = None
230
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None
231
+ agent: Optional[AgentOptions] = None
232
+ webhook: Optional[WebhookConfig] = None
233
+
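The option and action models above make up the request surface that the client methods later in the file serialize (with camelCase keys) before posting to the API. A minimal composition sketch, assuming these models are re-exported from the package root as in recent releases (otherwise import them from this module directly); the prompt and schema are illustrative:

    from firecrawl import ScrapeOptions, JsonConfig, WaitAction

    # Request markdown output from the main content only, after a short wait.
    options = ScrapeOptions(formats=["markdown"], onlyMainContent=True, waitFor=2000)

    # Structured extraction config; `schema` accepts a plain JSON Schema dict.
    json_config = JsonConfig(
        prompt="List the product names on the page",
        schema={"type": "object", "properties": {"names": {"type": "array"}}},
    )

    # One of the per-page actions defined above.
    wait = WaitAction(type="wait", milliseconds=1500)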
234
+ class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
235
+ """Response from scraping operations."""
236
+ success: bool = True
237
+ warning: Optional[str] = None
238
+ error: Optional[str] = None
239
+
240
+ class BatchScrapeResponse(pydantic.BaseModel):
241
+ """Response from batch scrape operations."""
242
+ id: Optional[str] = None
243
+ url: Optional[str] = None
244
+ success: bool = True
245
+ error: Optional[str] = None
246
+ invalidURLs: Optional[List[str]] = None
247
+
248
+ class BatchScrapeStatusResponse(pydantic.BaseModel):
249
+ """Response from batch scrape status checks."""
250
+ success: bool = True
251
+ status: Literal["scraping", "completed", "failed", "cancelled"]
252
+ completed: int
253
+ total: int
254
+ creditsUsed: int
255
+ expiresAt: datetime
256
+ next: Optional[str] = None
257
+ data: List[FirecrawlDocument]
258
+
259
+ class CrawlParams(pydantic.BaseModel):
260
+ """Parameters for crawling operations."""
261
+ includePaths: Optional[List[str]] = None
262
+ excludePaths: Optional[List[str]] = None
263
+ maxDepth: Optional[int] = None
264
+ maxDiscoveryDepth: Optional[int] = None
265
+ limit: Optional[int] = None
266
+ allowBackwardLinks: Optional[bool] = None
267
+ allowExternalLinks: Optional[bool] = None
268
+ ignoreSitemap: Optional[bool] = None
269
+ scrapeOptions: Optional[ScrapeOptions] = None
270
+ webhook: Optional[Union[str, WebhookConfig]] = None
271
+ deduplicateSimilarURLs: Optional[bool] = None
272
+ ignoreQueryParameters: Optional[bool] = None
273
+ regexOnFullURL: Optional[bool] = None
274
+ delay: Optional[int] = None # Delay in seconds between scrapes
275
+ maxConcurrency: Optional[int] = None
276
+ allowSubdomains: Optional[bool] = None
277
+
278
+ class CrawlResponse(pydantic.BaseModel):
279
+ """Response from crawling operations."""
280
+ id: Optional[str] = None
281
+ url: Optional[str] = None
282
+ success: bool = True
283
+ error: Optional[str] = None
284
+
285
+ class CrawlStatusResponse(pydantic.BaseModel):
286
+ """Response from crawl status checks."""
287
+ success: bool = True
288
+ status: Literal["scraping", "completed", "failed", "cancelled"]
289
+ completed: int
290
+ total: int
291
+ creditsUsed: int
292
+ expiresAt: datetime
293
+ next: Optional[str] = None
294
+ data: List[FirecrawlDocument]
295
+
296
+ class CrawlErrorsResponse(pydantic.BaseModel):
297
+ """Response from crawl/batch scrape error monitoring."""
298
+ errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str}
299
+ robotsBlocked: List[str]
300
+
301
+ class MapParams(pydantic.BaseModel):
302
+ """Parameters for mapping operations."""
303
+ search: Optional[str] = None
304
+ ignoreSitemap: Optional[bool] = None
305
+ includeSubdomains: Optional[bool] = None
306
+ sitemapOnly: Optional[bool] = None
307
+ limit: Optional[int] = None
308
+ timeout: Optional[int] = None
309
+ useIndex: Optional[bool] = None
310
+
311
+ class MapResponse(pydantic.BaseModel):
312
+ """Response from mapping operations."""
313
+ success: bool = True
314
+ links: Optional[List[str]] = None
315
+ error: Optional[str] = None
316
+
317
+ class ExtractParams(pydantic.BaseModel):
318
+ """Parameters for extracting information from URLs."""
319
+ prompt: Optional[str] = None
320
+ schema: Optional[Any] = None
321
+ systemPrompt: Optional[str] = None
322
+ allowExternalLinks: Optional[bool] = None
323
+ enableWebSearch: Optional[bool] = None
324
+ includeSubdomains: Optional[bool] = None
325
+ origin: Optional[str] = None
326
+ showSources: Optional[bool] = None
327
+ scrapeOptions: Optional[ScrapeOptions] = None
328
+
329
+ class ExtractResponse(pydantic.BaseModel, Generic[T]):
330
+ """Response from extract operations."""
331
+ id: Optional[str] = None
332
+ status: Optional[Literal["processing", "completed", "failed"]] = None
333
+ expiresAt: Optional[datetime] = None
334
+ success: bool = True
335
+ data: Optional[T] = None
336
+ error: Optional[str] = None
337
+ warning: Optional[str] = None
338
+ sources: Optional[Dict[Any, Any]] = None
339
+
340
+ class SearchParams(pydantic.BaseModel):
341
+ query: str
342
+ limit: Optional[int] = 5
343
+ tbs: Optional[str] = None
344
+ filter: Optional[str] = None
345
+ lang: Optional[str] = "en"
346
+ country: Optional[str] = "us"
347
+ location: Optional[str] = None
348
+ origin: Optional[str] = "api"
349
+ timeout: Optional[int] = 60000
350
+ scrapeOptions: Optional[ScrapeOptions] = None
351
+
352
+ class SearchResponse(pydantic.BaseModel):
353
+ """Response from search operations."""
354
+ success: bool = True
355
+ data: List[FirecrawlDocument]
356
+ warning: Optional[str] = None
357
+ error: Optional[str] = None
358
+
359
+ class GenerateLLMsTextParams(pydantic.BaseModel):
360
+ """
361
+ Parameters for the LLMs.txt generation operation.
362
+ """
363
+ maxUrls: Optional[int] = 10
364
+ showFullText: Optional[bool] = False
365
+ cache: Optional[bool] = True
366
+ __experimental_stream: Optional[bool] = None
367
+
368
+ class DeepResearchParams(pydantic.BaseModel):
369
+ """
370
+ Parameters for the deep research operation.
371
+ """
372
+ maxDepth: Optional[int] = 7
373
+ timeLimit: Optional[int] = 270
374
+ maxUrls: Optional[int] = 20
375
+ analysisPrompt: Optional[str] = None
376
+ systemPrompt: Optional[str] = None
377
+ __experimental_streamSteps: Optional[bool] = None
378
+
379
+ class DeepResearchResponse(pydantic.BaseModel):
380
+ """
381
+ Response from the deep research operation.
382
+ """
383
+ success: bool
384
+ id: str
385
+ error: Optional[str] = None
386
+
387
+ class DeepResearchStatusResponse(pydantic.BaseModel):
388
+ """
389
+ Status response from the deep research operation.
390
+ """
391
+ success: bool
392
+ data: Optional[Dict[str, Any]] = None
393
+ status: str
394
+ error: Optional[str] = None
395
+ expiresAt: str
396
+ currentDepth: int
397
+ maxDepth: int
398
+ activities: List[Dict[str, Any]]
399
+ sources: List[Dict[str, Any]]
400
+ summaries: List[str]
401
+
402
+ class GenerateLLMsTextResponse(pydantic.BaseModel):
403
+ """Response from LLMs.txt generation operations."""
404
+ success: bool = True
405
+ id: str
406
+ error: Optional[str] = None
407
+
408
+ class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
409
+ llmstxt: str
410
+ llmsfulltxt: Optional[str] = None
411
+
412
+ class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
413
+ """Status response from LLMs.txt generation operations."""
414
+ success: bool = True
415
+ data: Optional[GenerateLLMsTextStatusResponseData] = None
416
+ status: Literal["processing", "completed", "failed"]
417
+ error: Optional[str] = None
418
+ expiresAt: str
419
+
420
+ class SearchResponse(pydantic.BaseModel):
421
+ """
422
+ Response from the search operation.
423
+ """
424
+ success: bool
425
+ data: List[Dict[str, Any]]
426
+ warning: Optional[str] = None
427
+ error: Optional[str] = None
428
+
429
+ class ExtractParams(pydantic.BaseModel):
430
+ """
431
+ Parameters for the extract operation.
432
+ """
433
+ prompt: Optional[str] = None
434
+ schema: Optional[Any] = pydantic.Field(None, alias='schema')
435
+ system_prompt: Optional[str] = None
436
+ allow_external_links: Optional[bool] = False
437
+ enable_web_search: Optional[bool] = False
438
+ # Just for backwards compatibility
439
+ enableWebSearch: Optional[bool] = False
440
+ show_sources: Optional[bool] = False
441
+ agent: Optional[Dict[str, Any]] = None
442
+
443
+ class FirecrawlApp:
444
+ def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
445
+ """
446
+ Initialize the FirecrawlApp instance with API key, API URL.
447
+
448
+ Args:
449
+ api_key (Optional[str]): API key for authenticating with the Firecrawl API.
450
+ api_url (Optional[str]): Base URL for the Firecrawl API.
451
+ """
452
+ self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
453
+ self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
454
+
455
+ # Only require API key when using cloud service
456
+ if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
457
+ logger.warning("No API key provided for cloud service")
458
+ raise ValueError('No API key provided')
459
+
460
+ logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
461
+
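A minimal initialization sketch (the key and the self-hosted URL are placeholders; as shown above, the FIRECRAWL_API_KEY and FIRECRAWL_API_URL environment variables are used as fallbacks):

    from firecrawl import FirecrawlApp

    # Cloud: an API key is required (explicitly or via FIRECRAWL_API_KEY).
    app = FirecrawlApp(api_key="fc-YOUR-KEY")

    # Self-hosted: no key is enforced when api_url does not point at api.firecrawl.dev.
    local_app = FirecrawlApp(api_url="http://localhost:3002")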
462
+ def scrape_url(
463
+ self,
464
+ url: str,
465
+ *,
466
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
467
+ headers: Optional[Dict[str, str]] = None,
468
+ include_tags: Optional[List[str]] = None,
469
+ exclude_tags: Optional[List[str]] = None,
470
+ only_main_content: Optional[bool] = None,
471
+ wait_for: Optional[int] = None,
472
+ timeout: Optional[int] = None,
473
+ location: Optional[LocationConfig] = None,
474
+ mobile: Optional[bool] = None,
475
+ skip_tls_verification: Optional[bool] = None,
476
+ remove_base64_images: Optional[bool] = None,
477
+ block_ads: Optional[bool] = None,
478
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
479
+ parse_pdf: Optional[bool] = None,
480
+ extract: Optional[JsonConfig] = None,
481
+ json_options: Optional[JsonConfig] = None,
482
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
483
+ change_tracking_options: Optional[ChangeTrackingOptions] = None,
484
+ max_age: Optional[int] = None,
485
+ store_in_cache: Optional[bool] = None,
486
+ zero_data_retention: Optional[bool] = None,
487
+ **kwargs) -> ScrapeResponse[Any]:
488
+ """
489
+ Scrape and extract content from a URL.
490
+
491
+ Args:
492
+ url (str): Target URL to scrape
493
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]]): Content types to retrieve (markdown/html/etc)
494
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
495
+ include_tags (Optional[List[str]]): HTML tags to include
496
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
497
+ only_main_content (Optional[bool]): Extract main content only
498
+ wait_for (Optional[int]): Delay in milliseconds before fetching the page content
499
+ timeout (Optional[int]): Request timeout (ms)
500
+ location (Optional[LocationConfig]): Location configuration
501
+ mobile (Optional[bool]): Use mobile user agent
502
+ skip_tls_verification (Optional[bool]): Skip TLS verification
503
+ remove_base64_images (Optional[bool]): Remove base64 images
504
+ block_ads (Optional[bool]): Block ads
505
+ proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth/auto)
506
+ extract (Optional[JsonConfig]): Content extraction settings
507
+ json_options (Optional[JsonConfig]): JSON extraction settings
508
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
509
+ change_tracking_options (Optional[ChangeTrackingOptions]): Change tracking settings
510
+ zero_data_retention (Optional[bool]): Whether to delete data after scrape is done
511
+
512
+
513
+ Returns:
514
+ ScrapeResponse with:
515
+ * Requested content formats
516
+ * Page metadata
517
+ * Extraction results
518
+ * Success/error status
519
+
520
+ Raises:
521
+ Exception: If scraping fails
522
+ """
523
+ # Validate any additional kwargs
524
+ self._validate_kwargs(kwargs, "scrape_url")
525
+
526
+ _headers = self._prepare_headers()
527
+
528
+ # Build scrape parameters
529
+ scrape_params = {
530
+ 'url': url,
531
+ 'origin': f"python-sdk@{version}"
532
+ }
533
+
534
+ # Add optional parameters if provided
535
+ if formats:
536
+ scrape_params['formats'] = formats
537
+ if headers:
538
+ scrape_params['headers'] = headers
539
+ if include_tags:
540
+ scrape_params['includeTags'] = include_tags
541
+ if exclude_tags:
542
+ scrape_params['excludeTags'] = exclude_tags
543
+ if only_main_content is not None:
544
+ scrape_params['onlyMainContent'] = only_main_content
545
+ if wait_for:
546
+ scrape_params['waitFor'] = wait_for
547
+ if timeout:
548
+ scrape_params['timeout'] = timeout
549
+ if location:
550
+ scrape_params['location'] = location.dict(exclude_none=True)
551
+ if mobile is not None:
552
+ scrape_params['mobile'] = mobile
553
+ if skip_tls_verification is not None:
554
+ scrape_params['skipTlsVerification'] = skip_tls_verification
555
+ if remove_base64_images is not None:
556
+ scrape_params['removeBase64Images'] = remove_base64_images
557
+ if block_ads is not None:
558
+ scrape_params['blockAds'] = block_ads
559
+ if proxy:
560
+ scrape_params['proxy'] = proxy
561
+ if parse_pdf is not None:
562
+ scrape_params['parsePDF'] = parse_pdf
563
+ if extract is not None:
564
+ extract = self._ensure_schema_dict(extract)
565
+ if isinstance(extract, dict) and "schema" in extract:
566
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
567
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
568
+ if json_options is not None:
569
+ json_options = self._ensure_schema_dict(json_options)
570
+ if isinstance(json_options, dict) and "schema" in json_options:
571
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
572
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
573
+ if actions:
574
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
575
+ if change_tracking_options:
576
+ scrape_params['changeTrackingOptions'] = change_tracking_options if isinstance(change_tracking_options, dict) else change_tracking_options.dict(exclude_none=True)
577
+ if max_age is not None:
578
+ scrape_params['maxAge'] = max_age
579
+ if store_in_cache is not None:
580
+ scrape_params['storeInCache'] = store_in_cache
581
+ if zero_data_retention is not None:
582
+ scrape_params['zeroDataRetention'] = zero_data_retention
583
+
584
+ scrape_params.update(kwargs)
585
+
586
+ if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
587
+ scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
588
+ if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
589
+ scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
590
+
591
+ # Make request
592
+ response = requests.post(
593
+ f'{self.api_url}/v1/scrape',
594
+ headers=_headers,
595
+ json=scrape_params,
596
+ timeout=(timeout + 5000 if timeout else None)
597
+ )
598
+
599
+ if response.status_code == 200:
600
+ try:
601
+ response_json = response.json()
602
+ if response_json.get('success') and 'data' in response_json:
603
+ return ScrapeResponse(**response_json['data'])
604
+ elif "error" in response_json:
605
+ raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
606
+ else:
607
+ raise Exception(f'Failed to scrape URL. Error: {response_json}')
608
+ except ValueError:
609
+ raise Exception('Failed to parse Firecrawl response as JSON.')
610
+ else:
611
+ self._handle_error(response, 'scrape URL')
612
+
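A usage sketch for scrape_url, reusing the `app` instance from the initialization sketch above (the URL is illustrative; keyword names follow the signature above):

    doc = app.scrape_url(
        "https://example.com",
        formats=["markdown", "links"],
        only_main_content=True,
        timeout=30000,  # milliseconds
    )
    if doc.success:
        print((doc.markdown or "")[:200])
        print(doc.links)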
613
+ def search(
614
+ self,
615
+ query: str,
616
+ *,
617
+ limit: Optional[int] = None,
618
+ tbs: Optional[str] = None,
619
+ filter: Optional[str] = None,
620
+ lang: Optional[str] = None,
621
+ country: Optional[str] = None,
622
+ location: Optional[str] = None,
623
+ timeout: Optional[int] = None,
624
+ scrape_options: Optional[ScrapeOptions] = None,
625
+ **kwargs) -> SearchResponse:
626
+ """
627
+ Search for content using Firecrawl.
628
+
629
+ Args:
630
+ query (str): Search query string
631
+ limit (Optional[int]): Max results (default: 5)
632
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
633
+ filter (Optional[str]): Custom result filter
634
+ lang (Optional[str]): Language code (default: "en")
635
+ country (Optional[str]): Country code (default: "us")
636
+ location (Optional[str]): Geo-targeting
637
+ timeout (Optional[int]): Request timeout in milliseconds
638
+ scrape_options (Optional[ScrapeOptions]): Result scraping configuration
639
+ **kwargs: Additional keyword arguments for future compatibility
640
+
641
+ Returns:
642
+ SearchResponse: Response containing:
643
+ * success (bool): Whether request succeeded
644
+ * data (List[FirecrawlDocument]): Search results
645
+ * warning (Optional[str]): Warning message if any
646
+ * error (Optional[str]): Error message if any
647
+
648
+ Raises:
649
+ Exception: If search fails or response cannot be parsed
650
+ """
651
+ # Validate any additional kwargs
652
+ self._validate_kwargs(kwargs, "search")
653
+
654
+ # Build search parameters
655
+ search_params = {}
656
+
657
+ # Add individual parameters
658
+ if limit is not None:
659
+ search_params['limit'] = limit
660
+ if tbs is not None:
661
+ search_params['tbs'] = tbs
662
+ if filter is not None:
663
+ search_params['filter'] = filter
664
+ if lang is not None:
665
+ search_params['lang'] = lang
666
+ if country is not None:
667
+ search_params['country'] = country
668
+ if location is not None:
669
+ search_params['location'] = location
670
+ if timeout is not None:
671
+ search_params['timeout'] = timeout
672
+ if scrape_options is not None:
673
+ search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
674
+
675
+ # Add any additional kwargs
676
+ search_params.update(kwargs)
677
+ _integration = search_params.get('integration')
678
+
679
+ # Create final params object
680
+ final_params = SearchParams(query=query, **search_params)
681
+ params_dict = final_params.dict(exclude_none=True)
682
+ params_dict['origin'] = f"python-sdk@{version}"
683
+
684
+ if _integration:
685
+ params_dict['integration'] = _integration
686
+
687
+ # Make request
688
+ response = requests.post(
689
+ f"{self.api_url}/v1/search",
690
+ headers={"Authorization": f"Bearer {self.api_key}"},
691
+ json=params_dict
692
+ )
693
+
694
+ if response.status_code == 200:
695
+ try:
696
+ response_json = response.json()
697
+ if response_json.get('success') and 'data' in response_json:
698
+ return SearchResponse(**response_json)
699
+ elif "error" in response_json:
700
+ raise Exception(f'Search failed. Error: {response_json["error"]}')
701
+ else:
702
+ raise Exception(f'Search failed. Error: {response_json}')
703
+ except ValueError:
704
+ raise Exception('Failed to parse Firecrawl response as JSON.')
705
+ else:
706
+ self._handle_error(response, 'search')
707
+
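A usage sketch for search, again reusing `app` (the query is illustrative):

    results = app.search("firecrawl python sdk", limit=3, tbs="qdr:d")
    if results.success:
        for item in results.data:
            print(item)  # each entry is a search result record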
708
+ def crawl_url(
709
+ self,
710
+ url: str,
711
+ *,
712
+ include_paths: Optional[List[str]] = None,
713
+ exclude_paths: Optional[List[str]] = None,
714
+ max_depth: Optional[int] = None,
715
+ max_discovery_depth: Optional[int] = None,
716
+ limit: Optional[int] = None,
717
+ allow_backward_links: Optional[bool] = None,
718
+ crawl_entire_domain: Optional[bool] = None,
719
+ allow_external_links: Optional[bool] = None,
720
+ ignore_sitemap: Optional[bool] = None,
721
+ scrape_options: Optional[ScrapeOptions] = None,
722
+ webhook: Optional[Union[str, WebhookConfig]] = None,
723
+ deduplicate_similar_urls: Optional[bool] = None,
724
+ ignore_query_parameters: Optional[bool] = None,
725
+ regex_on_full_url: Optional[bool] = None,
726
+ delay: Optional[int] = None,
727
+ allow_subdomains: Optional[bool] = None,
728
+ max_concurrency: Optional[int] = None,
729
+ zero_data_retention: Optional[bool] = None,
730
+ poll_interval: Optional[int] = 2,
731
+ idempotency_key: Optional[str] = None,
732
+ **kwargs
733
+ ) -> CrawlStatusResponse:
734
+ """
735
+ Crawl a website starting from a URL.
736
+
737
+ Args:
738
+ url (str): Target URL to start crawling from
739
+ include_paths (Optional[List[str]]): Patterns of URLs to include
740
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
741
+ max_depth (Optional[int]): Maximum crawl depth
742
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
743
+ limit (Optional[int]): Maximum pages to crawl
744
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
745
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
746
+ allow_external_links (Optional[bool]): Follow external domain links
747
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
748
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
749
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
750
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
751
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
752
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
753
+ delay (Optional[int]): Delay in seconds between scrapes
754
+ allow_subdomains (Optional[bool]): Follow subdomains
755
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
756
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
757
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
758
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
759
+ **kwargs: Additional parameters to pass to the API
760
+
761
+ Returns:
762
+ CrawlStatusResponse with:
763
+ * Crawling status and progress
764
+ * Crawled page contents
765
+ * Success/error information
766
+
767
+ Raises:
768
+ Exception: If crawl fails
769
+ """
770
+ # Validate any additional kwargs
771
+ self._validate_kwargs(kwargs, "crawl_url")
772
+
773
+ crawl_params = {}
774
+
775
+ # Add individual parameters
776
+ if include_paths is not None:
777
+ crawl_params['includePaths'] = include_paths
778
+ if exclude_paths is not None:
779
+ crawl_params['excludePaths'] = exclude_paths
780
+ if max_depth is not None:
781
+ crawl_params['maxDepth'] = max_depth
782
+ if max_discovery_depth is not None:
783
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
784
+ if limit is not None:
785
+ crawl_params['limit'] = limit
786
+ if crawl_entire_domain is not None:
787
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
788
+ elif allow_backward_links is not None:
789
+ crawl_params['allowBackwardLinks'] = allow_backward_links
790
+ if allow_external_links is not None:
791
+ crawl_params['allowExternalLinks'] = allow_external_links
792
+ if ignore_sitemap is not None:
793
+ crawl_params['ignoreSitemap'] = ignore_sitemap
794
+ if scrape_options is not None:
795
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
796
+ if webhook is not None:
797
+ crawl_params['webhook'] = webhook
798
+ if deduplicate_similar_urls is not None:
799
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
800
+ if ignore_query_parameters is not None:
801
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
802
+ if regex_on_full_url is not None:
803
+ crawl_params['regexOnFullURL'] = regex_on_full_url
804
+ if delay is not None:
805
+ crawl_params['delay'] = delay
806
+ if allow_subdomains is not None:
807
+ crawl_params['allowSubdomains'] = allow_subdomains
808
+ if max_concurrency is not None:
809
+ crawl_params['maxConcurrency'] = max_concurrency
810
+ if zero_data_retention is not None:
811
+ crawl_params['zeroDataRetention'] = zero_data_retention
812
+ # Add any additional kwargs
813
+ crawl_params.update(kwargs)
814
+ _integration = crawl_params.get('integration')
815
+
816
+ # Create final params object
817
+ final_params = CrawlParams(**crawl_params)
818
+ params_dict = final_params.dict(exclude_none=True)
819
+ params_dict['url'] = url
820
+ params_dict['origin'] = f"python-sdk@{version}"
821
+
822
+ if _integration:
823
+ params_dict['integration'] = _integration
824
+
825
+ # Make request
826
+ headers = self._prepare_headers(idempotency_key)
827
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
828
+
829
+ if response.status_code == 200:
830
+ try:
831
+ id = response.json().get('id')
832
+ except:
833
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
834
+ return self._monitor_job_status(id, headers, poll_interval)
835
+ else:
836
+ self._handle_error(response, 'start crawl job')
837
+
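crawl_url blocks and polls until the job finishes. A short sketch, reusing `app` (the limits are illustrative):

    status = app.crawl_url("https://example.com", limit=10, max_depth=2, poll_interval=5)
    print(status.status, f"{status.completed}/{status.total} pages")
    for page in status.data:
        print(page.metadata)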
838
+ def async_crawl_url(
839
+ self,
840
+ url: str,
841
+ *,
842
+ include_paths: Optional[List[str]] = None,
843
+ exclude_paths: Optional[List[str]] = None,
844
+ max_depth: Optional[int] = None,
845
+ max_discovery_depth: Optional[int] = None,
846
+ limit: Optional[int] = None,
847
+ allow_backward_links: Optional[bool] = None,
848
+ crawl_entire_domain: Optional[bool] = None,
849
+ allow_external_links: Optional[bool] = None,
850
+ ignore_sitemap: Optional[bool] = None,
851
+ scrape_options: Optional[ScrapeOptions] = None,
852
+ webhook: Optional[Union[str, WebhookConfig]] = None,
853
+ deduplicate_similar_urls: Optional[bool] = None,
854
+ ignore_query_parameters: Optional[bool] = None,
855
+ regex_on_full_url: Optional[bool] = None,
856
+ delay: Optional[int] = None,
857
+ allow_subdomains: Optional[bool] = None,
858
+ max_concurrency: Optional[int] = None,
859
+ zero_data_retention: Optional[bool] = None,
860
+ idempotency_key: Optional[str] = None,
861
+ **kwargs
862
+ ) -> CrawlResponse:
863
+ """
864
+ Start an asynchronous crawl job.
865
+
866
+ Args:
867
+ url (str): Target URL to start crawling from
868
+ include_paths (Optional[List[str]]): Patterns of URLs to include
869
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
870
+ max_depth (Optional[int]): Maximum crawl depth
871
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
872
+ limit (Optional[int]): Maximum pages to crawl
873
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
874
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
875
+ allow_external_links (Optional[bool]): Follow external domain links
876
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
877
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
878
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
879
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
880
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
881
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
882
+ delay (Optional[int]): Delay in seconds between scrapes
883
+ allow_subdomains (Optional[bool]): Follow subdomains
884
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
885
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
886
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
887
+ **kwargs: Additional parameters to pass to the API
888
+
889
+ Returns:
890
+ CrawlResponse with:
891
+ * success - Whether crawl started successfully
892
+ * id - Unique identifier for the crawl job
893
+ * url - Status check URL for the crawl
894
+ * error - Error message if start failed
895
+
896
+ Raises:
897
+ Exception: If crawl initiation fails
898
+ """
899
+ # Validate any additional kwargs
900
+ self._validate_kwargs(kwargs, "async_crawl_url")
901
+
902
+ crawl_params = {}
903
+
904
+ # Add individual parameters
905
+ if include_paths is not None:
906
+ crawl_params['includePaths'] = include_paths
907
+ if exclude_paths is not None:
908
+ crawl_params['excludePaths'] = exclude_paths
909
+ if max_depth is not None:
910
+ crawl_params['maxDepth'] = max_depth
911
+ if max_discovery_depth is not None:
912
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
913
+ if limit is not None:
914
+ crawl_params['limit'] = limit
915
+ if crawl_entire_domain is not None:
916
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
917
+ elif allow_backward_links is not None:
918
+ crawl_params['allowBackwardLinks'] = allow_backward_links
919
+ if allow_external_links is not None:
920
+ crawl_params['allowExternalLinks'] = allow_external_links
921
+ if ignore_sitemap is not None:
922
+ crawl_params['ignoreSitemap'] = ignore_sitemap
923
+ if scrape_options is not None:
924
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
925
+ if webhook is not None:
926
+ crawl_params['webhook'] = webhook
927
+ if deduplicate_similar_urls is not None:
928
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
929
+ if ignore_query_parameters is not None:
930
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
931
+ if regex_on_full_url is not None:
932
+ crawl_params['regexOnFullURL'] = regex_on_full_url
933
+ if delay is not None:
934
+ crawl_params['delay'] = delay
935
+ if allow_subdomains is not None:
936
+ crawl_params['allowSubdomains'] = allow_subdomains
937
+ if max_concurrency is not None:
938
+ crawl_params['maxConcurrency'] = max_concurrency
939
+ if zero_data_retention is not None:
940
+ crawl_params['zeroDataRetention'] = zero_data_retention
941
+ # Add any additional kwargs
942
+ crawl_params.update(kwargs)
943
+
944
+ # Create final params object
945
+ final_params = CrawlParams(**crawl_params)
946
+ params_dict = final_params.dict(exclude_none=True)
947
+ params_dict['url'] = url
948
+ params_dict['origin'] = f"python-sdk@{version}"
949
+
950
+ # Make request
951
+ headers = self._prepare_headers(idempotency_key)
952
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
953
+
954
+ if response.status_code == 200:
955
+ try:
956
+ return CrawlResponse(**response.json())
957
+ except:
958
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
959
+ else:
960
+ self._handle_error(response, 'start crawl job')
961
+
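The asynchronous variant only starts the job and returns its id. Sketch, reusing `app` (the exclude pattern is illustrative):

    job = app.async_crawl_url("https://example.com", limit=25, exclude_paths=["/blog/*"])
    if job.success:
        print("crawl id:", job.id, "status url:", job.url)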
962
+ def check_crawl_status(self, id: str) -> CrawlStatusResponse:
963
+ """
964
+ Check the status and results of a crawl job.
965
+
966
+ Args:
967
+ id: Unique identifier for the crawl job
968
+
969
+ Returns:
970
+ CrawlStatusResponse containing:
971
+
972
+ Status Information:
973
+ * status - Current state (scraping/completed/failed/cancelled)
974
+ * completed - Number of pages crawled
975
+ * total - Total pages to crawl
976
+ * creditsUsed - API credits consumed
977
+ * expiresAt - Data expiration timestamp
978
+
979
+ Results:
980
+ * data - List of crawled documents
981
+ * next - URL for next page of results (if paginated)
982
+ * success - Whether status check succeeded
983
+ * error - Error message if failed
984
+
985
+ Raises:
986
+ Exception: If status check fails
987
+ """
988
+ endpoint = f'/v1/crawl/{id}'
989
+
990
+ headers = self._prepare_headers()
991
+ response = self._get_request(f'{self.api_url}{endpoint}', headers)
992
+ if response.status_code == 200:
993
+ try:
994
+ status_data = response.json()
995
+ except:
996
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
997
+ if status_data['status'] == 'completed':
998
+ if 'data' in status_data:
999
+ data = status_data['data']
1000
+ while 'next' in status_data:
1001
+ if len(status_data['data']) == 0:
1002
+ break
1003
+ next_url = status_data.get('next')
1004
+ if not next_url:
1005
+ logger.warning("Expected 'next' URL is missing.")
1006
+ break
1007
+ try:
1008
+ status_response = self._get_request(next_url, headers)
1009
+ if status_response.status_code != 200:
1010
+ logger.error(f"Failed to fetch next page: {status_response.status_code}")
1011
+ break
1012
+ try:
1013
+ next_data = status_response.json()
1014
+ except:
1015
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1016
+ data.extend(next_data.get('data', []))
1017
+ status_data = next_data
1018
+ except Exception as e:
1019
+ logger.error(f"Error during pagination request: {e}")
1020
+ break
1021
+ status_data['data'] = data
1022
+
1023
+ response = {
1024
+ 'status': status_data.get('status'),
1025
+ 'total': status_data.get('total'),
1026
+ 'completed': status_data.get('completed'),
1027
+ 'creditsUsed': status_data.get('creditsUsed'),
1028
+ 'expiresAt': status_data.get('expiresAt'),
1029
+ 'data': status_data.get('data')
1030
+ }
1031
+
1032
+ if 'error' in status_data:
1033
+ response['error'] = status_data['error']
1034
+
1035
+ if 'next' in status_data:
1036
+ response['next'] = status_data['next']
1037
+
1038
+ return CrawlStatusResponse(
1039
+ success=False if 'error' in status_data else True,
1040
+ **response
1041
+ )
1042
+ else:
1043
+ self._handle_error(response, 'check crawl status')
1044
+
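Polling a previously started job, using the id returned by async_crawl_url above:

    status = app.check_crawl_status(job.id)
    if status.status == "completed":
        print(f"{status.completed}/{status.total} pages, credits used: {status.creditsUsed}")
    else:
        print("still", status.status)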
1045
+ def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
1046
+ """
1047
+ Returns information about crawl errors.
1048
+
1049
+ Args:
1050
+ id (str): The ID of the crawl job
1051
+
1052
+ Returns:
1053
+ CrawlErrorsResponse containing:
1054
+ * errors (List[Dict[str, str]]): List of errors with fields:
1055
+ - id (str): Error ID
1056
+ - timestamp (str): When the error occurred
1057
+ - url (str): URL that caused the error
1058
+ - error (str): Error message
1059
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1060
+
1061
+ Raises:
1062
+ Exception: If error check fails
1063
+ """
1064
+ headers = self._prepare_headers()
1065
+ response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
1066
+ if response.status_code == 200:
1067
+ try:
1068
+ return CrawlErrorsResponse(**response.json())
1069
+ except:
1070
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1071
+ else:
1072
+ self._handle_error(response, "check crawl errors")
1073
+
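Inspecting failures for the same job:

    report = app.check_crawl_errors(job.id)
    for err in report.errors:
        print(err["url"], "->", err["error"])
    print("Blocked by robots.txt:", report.robotsBlocked)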
1074
+ def cancel_crawl(self, id: str) -> Dict[str, Any]:
1075
+ """
1076
+ Cancel an asynchronous crawl job.
1077
+
1078
+ Args:
1079
+ id (str): The ID of the crawl job to cancel
1080
+
1081
+ Returns:
1082
+ Dict[str, Any] containing:
1083
+ * success (bool): Whether cancellation was successful
1084
+ * error (str, optional): Error message if cancellation failed
1085
+
1086
+ Raises:
1087
+ Exception: If cancellation fails
1088
+ """
1089
+ headers = self._prepare_headers()
1090
+ response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
1091
+ if response.status_code == 200:
1092
+ try:
1093
+ return response.json()
1094
+ except:
1095
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1096
+ else:
1097
+ self._handle_error(response, "cancel crawl job")
1098
+
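Cancelling the same job is a single call; the return value is the raw JSON body described in the docstring:

    result = app.cancel_crawl(job.id)
    print("cancelled" if result.get("success") else result.get("error"))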
1099
+ def crawl_url_and_watch(
1100
+ self,
1101
+ url: str,
1102
+ *,
1103
+ include_paths: Optional[List[str]] = None,
1104
+ exclude_paths: Optional[List[str]] = None,
1105
+ max_depth: Optional[int] = None,
1106
+ max_discovery_depth: Optional[int] = None,
1107
+ limit: Optional[int] = None,
1108
+ allow_backward_links: Optional[bool] = None,
1109
+ crawl_entire_domain: Optional[bool] = None,
1110
+ allow_external_links: Optional[bool] = None,
1111
+ ignore_sitemap: Optional[bool] = None,
1112
+ scrape_options: Optional[ScrapeOptions] = None,
1113
+ webhook: Optional[Union[str, WebhookConfig]] = None,
1114
+ deduplicate_similar_urls: Optional[bool] = None,
1115
+ ignore_query_parameters: Optional[bool] = None,
1116
+ regex_on_full_url: Optional[bool] = None,
1117
+ delay: Optional[int] = None,
1118
+ allow_subdomains: Optional[bool] = None,
1119
+ max_concurrency: Optional[int] = None,
1120
+ zero_data_retention: Optional[bool] = None,
1121
+ idempotency_key: Optional[str] = None,
1122
+ **kwargs
1123
+ ) -> 'CrawlWatcher':
1124
+ """
1125
+ Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
1126
+
1127
+ Args:
1128
+ url (str): Target URL to start crawling from
1129
+ include_paths (Optional[List[str]]): Patterns of URLs to include
1130
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
1131
+ max_depth (Optional[int]): Maximum crawl depth
1132
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
1133
+ limit (Optional[int]): Maximum pages to crawl
1134
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
1135
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
1136
+ allow_external_links (Optional[bool]): Follow external domain links
1137
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1138
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
1139
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
1140
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
1141
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
1142
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
1143
+ delay (Optional[int]): Delay in seconds between scrapes
1144
+ allow_subdomains (Optional[bool]): Follow subdomains
1145
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1146
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
1147
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1148
+ **kwargs: Additional parameters to pass to the API
1149
+
1150
+ Returns:
1151
+ CrawlWatcher: An instance to monitor the crawl job via WebSocket
1152
+
1153
+ Raises:
1154
+ Exception: If crawl job fails to start
1155
+ """
1156
+ crawl_response = self.async_crawl_url(
1157
+ url,
1158
+ include_paths=include_paths,
1159
+ exclude_paths=exclude_paths,
1160
+ max_depth=max_depth,
1161
+ max_discovery_depth=max_discovery_depth,
1162
+ limit=limit,
1163
+ allow_backward_links=allow_backward_links,
1164
+ allow_external_links=allow_external_links,
1165
+ ignore_sitemap=ignore_sitemap,
1166
+ scrape_options=scrape_options,
1167
+ webhook=webhook,
1168
+ deduplicate_similar_urls=deduplicate_similar_urls,
1169
+ ignore_query_parameters=ignore_query_parameters,
1170
+ regex_on_full_url=regex_on_full_url,
1171
+ delay=delay,
1172
+ allow_subdomains=allow_subdomains,
1173
+ max_concurrency=max_concurrency,
1174
+ zero_data_retention=zero_data_retention,
1175
+ idempotency_key=idempotency_key,
1176
+ **kwargs
1177
+ )
1178
+ if crawl_response.success and crawl_response.id:
1179
+ return CrawlWatcher(crawl_response.id, self)
1180
+ else:
1181
+ raise Exception("Crawl job failed to start")
1182
+
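crawl_url_and_watch returns a CrawlWatcher, which is defined later in this module (outside this excerpt) and streams progress over WebSocket. A minimal start-up sketch; consuming events is left to the watcher's own interface:

    watcher = app.crawl_url_and_watch("https://example.com", limit=5)
    # Progress arrives over the watcher's WebSocket connection; see the
    # CrawlWatcher class further down the file for its event/callback API.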
1183
+ def map_url(
1184
+ self,
1185
+ url: str,
1186
+ *,
1187
+ search: Optional[str] = None,
1188
+ ignore_sitemap: Optional[bool] = None,
1189
+ include_subdomains: Optional[bool] = None,
1190
+ sitemap_only: Optional[bool] = None,
1191
+ limit: Optional[int] = None,
1192
+ timeout: Optional[int] = None,
1193
+ use_index: Optional[bool] = None,
1194
+ **kwargs) -> MapResponse:
1195
+ """
1196
+ Map and discover links from a URL.
1197
+
1198
+ Args:
1199
+ url (str): Target URL to map
1200
+ search (Optional[str]): Filter pattern for URLs
1201
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1202
+ include_subdomains (Optional[bool]): Include subdomain links
1203
+ sitemap_only (Optional[bool]): Only use sitemap.xml
1204
+ limit (Optional[int]): Maximum URLs to return
1205
+ timeout (Optional[int]): Request timeout in milliseconds
1206
+ **kwargs: Additional parameters to pass to the API
1207
+
1208
+ Returns:
1209
+ MapResponse: Response containing:
1210
+ * success (bool): Whether request succeeded
1211
+ * links (List[str]): Discovered URLs
1212
+ * error (Optional[str]): Error message if any
1213
+
1214
+ Raises:
1215
+ Exception: If mapping fails or response cannot be parsed
1216
+ """
1217
+ # Validate any additional kwargs
1218
+ self._validate_kwargs(kwargs, "map_url")
1219
+
1220
+ # Build map parameters
1221
+ map_params = {}
1222
+
1223
+ # Add individual parameters
1224
+ if search is not None:
1225
+ map_params['search'] = search
1226
+ if ignore_sitemap is not None:
1227
+ map_params['ignoreSitemap'] = ignore_sitemap
1228
+ if include_subdomains is not None:
1229
+ map_params['includeSubdomains'] = include_subdomains
1230
+ if sitemap_only is not None:
1231
+ map_params['sitemapOnly'] = sitemap_only
1232
+ if limit is not None:
1233
+ map_params['limit'] = limit
1234
+ if timeout is not None:
1235
+ map_params['timeout'] = timeout
1236
+ if use_index is not None:
1237
+ map_params['useIndex'] = use_index
1238
+
1239
+ # Add any additional kwargs
1240
+ map_params.update(kwargs)
1241
+ _integration = map_params.get('integration')
1242
+
1243
+ # Create final params object
1244
+ final_params = MapParams(**map_params)
1245
+ params_dict = final_params.dict(exclude_none=True)
1246
+ params_dict['url'] = url
1247
+ params_dict['origin'] = f"python-sdk@{version}"
1248
+
1249
+ if _integration:
1250
+ params_dict['integration'] = _integration
1251
+
1252
+ # Make request
1253
+ response = requests.post(
1254
+ f"{self.api_url}/v1/map",
1255
+ headers={"Authorization": f"Bearer {self.api_key}"},
1256
+ json=params_dict
1257
+ )
1258
+
1259
+ if response.status_code == 200:
1260
+ try:
1261
+ response_json = response.json()
1262
+ if response_json.get('success') and 'links' in response_json:
1263
+ return MapResponse(**response_json)
1264
+ elif "error" in response_json:
1265
+ raise Exception(f'Map failed. Error: {response_json["error"]}')
1266
+ else:
1267
+ raise Exception(f'Map failed. Error: {response_json}')
1268
+ except ValueError:
1269
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1270
+ else:
1271
+ self._handle_error(response, 'map')
1272
+
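A usage sketch for map_url (URL and search filter are illustrative):

    mapped = app.map_url("https://example.com", search="docs", limit=100)
    if mapped.success and mapped.links:
        for link in mapped.links[:10]:
            print(link)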
1273
+ def batch_scrape_urls(
1274
+ self,
1275
+ urls: List[str],
1276
+ *,
1277
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1278
+ headers: Optional[Dict[str, str]] = None,
1279
+ include_tags: Optional[List[str]] = None,
1280
+ exclude_tags: Optional[List[str]] = None,
1281
+ only_main_content: Optional[bool] = None,
1282
+ wait_for: Optional[int] = None,
1283
+ timeout: Optional[int] = None,
1284
+ location: Optional[LocationConfig] = None,
1285
+ mobile: Optional[bool] = None,
1286
+ skip_tls_verification: Optional[bool] = None,
1287
+ remove_base64_images: Optional[bool] = None,
1288
+ block_ads: Optional[bool] = None,
1289
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1290
+ extract: Optional[JsonConfig] = None,
1291
+ json_options: Optional[JsonConfig] = None,
1292
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1293
+ agent: Optional[AgentOptions] = None,
1294
+ poll_interval: Optional[int] = 2,
1295
+ max_concurrency: Optional[int] = None,
1296
+ zero_data_retention: Optional[bool] = None,
1297
+ idempotency_key: Optional[str] = None,
1298
+ **kwargs
1299
+ ) -> BatchScrapeStatusResponse:
1300
+ """
1301
+ Batch scrape multiple URLs and monitor until completion.
1302
+
1303
+ Args:
1304
+ urls (List[str]): URLs to scrape
1305
+ formats (Optional[List[Literal]]): Content formats to retrieve
1306
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1307
+ include_tags (Optional[List[str]]): HTML tags to include
1308
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1309
+ only_main_content (Optional[bool]): Extract main content only
1310
+ wait_for (Optional[int]): Wait time in milliseconds
1311
+ timeout (Optional[int]): Request timeout in milliseconds
1312
+ location (Optional[LocationConfig]): Location configuration
1313
+ mobile (Optional[bool]): Use mobile user agent
1314
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1315
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1316
+ block_ads (Optional[bool]): Block advertisements
1317
+ proxy (Optional[Literal]): Proxy type to use
1318
+ extract (Optional[JsonConfig]): Content extraction config
1319
+ json_options (Optional[JsonConfig]): JSON extraction config
1320
+ actions (Optional[List[Union]]): Actions to perform
1321
+ agent (Optional[AgentOptions]): Agent configuration
1322
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1323
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
1324
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1325
+ **kwargs: Additional parameters to pass to the API
1326
+
1327
+ Returns:
1328
+ BatchScrapeStatusResponse with:
1329
+ * Scraping status and progress
1330
+ * Scraped content for each URL
1331
+ * Success/error information
1332
+
1333
+ Raises:
1334
+ Exception: If batch scrape fails
1335
+ """
1336
+ # Validate any additional kwargs
1337
+ self._validate_kwargs(kwargs, "batch_scrape_urls")
1338
+
1339
+ scrape_params = {}
1340
+
1341
+ # Add individual parameters
1342
+ if formats is not None:
1343
+ scrape_params['formats'] = formats
1344
+ if headers is not None:
1345
+ scrape_params['headers'] = headers
1346
+ if include_tags is not None:
1347
+ scrape_params['includeTags'] = include_tags
1348
+ if exclude_tags is not None:
1349
+ scrape_params['excludeTags'] = exclude_tags
1350
+ if only_main_content is not None:
1351
+ scrape_params['onlyMainContent'] = only_main_content
1352
+ if wait_for is not None:
1353
+ scrape_params['waitFor'] = wait_for
1354
+ if timeout is not None:
1355
+ scrape_params['timeout'] = timeout
1356
+ if location is not None:
1357
+ scrape_params['location'] = location.dict(exclude_none=True)
1358
+ if mobile is not None:
1359
+ scrape_params['mobile'] = mobile
1360
+ if skip_tls_verification is not None:
1361
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1362
+ if remove_base64_images is not None:
1363
+ scrape_params['removeBase64Images'] = remove_base64_images
1364
+ if block_ads is not None:
1365
+ scrape_params['blockAds'] = block_ads
1366
+ if proxy is not None:
1367
+ scrape_params['proxy'] = proxy
1368
+ if extract is not None:
1369
+ extract = self._ensure_schema_dict(extract)
1370
+ if isinstance(extract, dict) and "schema" in extract:
1371
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1372
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1373
+ if json_options is not None:
1374
+ json_options = self._ensure_schema_dict(json_options)
1375
+ if isinstance(json_options, dict) and "schema" in json_options:
1376
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1377
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1378
+ if actions:
1379
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
1380
+ if agent is not None:
1381
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1382
+ if max_concurrency is not None:
1383
+ scrape_params['maxConcurrency'] = max_concurrency
1384
+ if zero_data_retention is not None:
1385
+ scrape_params['zeroDataRetention'] = zero_data_retention
1386
+
1387
+ # Add any additional kwargs
1388
+ scrape_params.update(kwargs)
1389
+
1390
+ # Create final params object
1391
+ final_params = ScrapeParams(**scrape_params)
1392
+ params_dict = final_params.dict(exclude_none=True)
1393
+ params_dict['urls'] = urls
1394
+ params_dict['origin'] = f"python-sdk@{version}"
1395
+
1396
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1397
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1398
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1399
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1400
+
1401
+ # Make request
1402
+ headers = self._prepare_headers(idempotency_key)
1403
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1404
+
1405
+ if response.status_code == 200:
1406
+ try:
1407
+ id = response.json().get('id')
1408
+ except:
1409
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1410
+ return self._monitor_job_status(id, headers, poll_interval)
1411
+ else:
1412
+ self._handle_error(response, 'start batch scrape job')
1413
+
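A minimal usage sketch of the synchronous batch scrape above, assuming the package's usual `from firecrawl import FirecrawlApp` entry point; the API key and URLs are placeholders:

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key

# Blocks until the batch job finishes, polling every 5 seconds.
status = app.batch_scrape_urls(
    ["https://example.com", "https://example.org"],
    formats=["markdown"],
    only_main_content=True,
    poll_interval=5,
)
print(status.status, f"{status.completed}/{status.total} pages scraped")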
1414
+ def async_batch_scrape_urls(
1415
+ self,
1416
+ urls: List[str],
1417
+ *,
1418
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1419
+ headers: Optional[Dict[str, str]] = None,
1420
+ include_tags: Optional[List[str]] = None,
1421
+ exclude_tags: Optional[List[str]] = None,
1422
+ only_main_content: Optional[bool] = None,
1423
+ wait_for: Optional[int] = None,
1424
+ timeout: Optional[int] = None,
1425
+ location: Optional[LocationConfig] = None,
1426
+ mobile: Optional[bool] = None,
1427
+ skip_tls_verification: Optional[bool] = None,
1428
+ remove_base64_images: Optional[bool] = None,
1429
+ block_ads: Optional[bool] = None,
1430
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1431
+ extract: Optional[JsonConfig] = None,
1432
+ json_options: Optional[JsonConfig] = None,
1433
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1434
+ agent: Optional[AgentOptions] = None,
1435
+ max_concurrency: Optional[int] = None,
1436
+ idempotency_key: Optional[str] = None,
1437
+ zero_data_retention: Optional[bool] = None,
1438
+ **kwargs
1439
+ ) -> BatchScrapeResponse:
1440
+ """
1441
+ Initiate a batch scrape job asynchronously.
1442
+
1443
+ Args:
1444
+ urls (List[str]): URLs to scrape
1445
+ formats (Optional[List[Literal]]): Content formats to retrieve
1446
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1447
+ include_tags (Optional[List[str]]): HTML tags to include
1448
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1449
+ only_main_content (Optional[bool]): Extract main content only
1450
+ wait_for (Optional[int]): Wait time in milliseconds
1451
+ timeout (Optional[int]): Request timeout in milliseconds
1452
+ location (Optional[LocationConfig]): Location configuration
1453
+ mobile (Optional[bool]): Use mobile user agent
1454
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1455
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1456
+ block_ads (Optional[bool]): Block advertisements
1457
+ proxy (Optional[Literal]): Proxy type to use
1458
+ extract (Optional[JsonConfig]): Content extraction config
1459
+ json_options (Optional[JsonConfig]): JSON extraction config
1460
+ actions (Optional[List[Union]]): Actions to perform
1461
+ agent (Optional[AgentOptions]): Agent configuration
1462
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1463
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
1464
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1465
+ **kwargs: Additional parameters to pass to the API
1466
+
1467
+ Returns:
1468
+ BatchScrapeResponse with:
1469
+ * success - Whether job started successfully
1470
+ * id - Unique identifier for the job
1471
+ * url - Status check URL
1472
+ * error - Error message if start failed
1473
+
1474
+ Raises:
1475
+ Exception: If job initiation fails
1476
+ """
1477
+ # Validate any additional kwargs
1478
+ self._validate_kwargs(kwargs, "async_batch_scrape_urls")
1479
+
1480
+ scrape_params = {}
1481
+
1482
+ # Add individual parameters
1483
+ if formats is not None:
1484
+ scrape_params['formats'] = formats
1485
+ if headers is not None:
1486
+ scrape_params['headers'] = headers
1487
+ if include_tags is not None:
1488
+ scrape_params['includeTags'] = include_tags
1489
+ if exclude_tags is not None:
1490
+ scrape_params['excludeTags'] = exclude_tags
1491
+ if only_main_content is not None:
1492
+ scrape_params['onlyMainContent'] = only_main_content
1493
+ if wait_for is not None:
1494
+ scrape_params['waitFor'] = wait_for
1495
+ if timeout is not None:
1496
+ scrape_params['timeout'] = timeout
1497
+ if location is not None:
1498
+ scrape_params['location'] = location.dict(exclude_none=True)
1499
+ if mobile is not None:
1500
+ scrape_params['mobile'] = mobile
1501
+ if skip_tls_verification is not None:
1502
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1503
+ if remove_base64_images is not None:
1504
+ scrape_params['removeBase64Images'] = remove_base64_images
1505
+ if block_ads is not None:
1506
+ scrape_params['blockAds'] = block_ads
1507
+ if proxy is not None:
1508
+ scrape_params['proxy'] = proxy
1509
+ if extract is not None:
1510
+ extract = self._ensure_schema_dict(extract)
1511
+ if isinstance(extract, dict) and "schema" in extract:
1512
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1513
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1514
+ if json_options is not None:
1515
+ json_options = self._ensure_schema_dict(json_options)
1516
+ if isinstance(json_options, dict) and "schema" in json_options:
1517
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1518
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1519
+ if actions:
1520
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
1521
+ if agent is not None:
1522
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1523
+ if max_concurrency is not None:
1524
+ scrape_params['maxConcurrency'] = max_concurrency
1525
+ if zero_data_retention is not None:
1526
+ scrape_params['zeroDataRetention'] = zero_data_retention
1527
+
1528
+ # Add any additional kwargs
1529
+ scrape_params.update(kwargs)
1530
+
1531
+ # Create final params object
1532
+ final_params = ScrapeParams(**scrape_params)
1533
+ params_dict = final_params.dict(exclude_none=True)
1534
+ params_dict['urls'] = urls
1535
+ params_dict['origin'] = f"python-sdk@{version}"
1536
+
1537
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1538
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1539
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1540
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1541
+
1542
+ # Make request
1543
+ headers = self._prepare_headers(idempotency_key)
1544
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1545
+
1546
+ if response.status_code == 200:
1547
+ try:
1548
+ return BatchScrapeResponse(**response.json())
1549
+ except:
1550
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1551
+ else:
1552
+ self._handle_error(response, 'start batch scrape job')
1553
+
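The async variant only starts the job and returns immediately; a sketch of the fire-and-forget pattern it enables (client setup and URLs are placeholders):

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key

job = app.async_batch_scrape_urls(
    ["https://example.com/a", "https://example.com/b"],
    formats=["markdown", "links"],
)
# The job id can be stored and passed to check_batch_scrape_status() later.
print(job.id, job.url)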
1554
+ def batch_scrape_urls_and_watch(
1555
+ self,
1556
+ urls: List[str],
1557
+ *,
1558
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1559
+ headers: Optional[Dict[str, str]] = None,
1560
+ include_tags: Optional[List[str]] = None,
1561
+ exclude_tags: Optional[List[str]] = None,
1562
+ only_main_content: Optional[bool] = None,
1563
+ wait_for: Optional[int] = None,
1564
+ timeout: Optional[int] = None,
1565
+ location: Optional[LocationConfig] = None,
1566
+ mobile: Optional[bool] = None,
1567
+ skip_tls_verification: Optional[bool] = None,
1568
+ remove_base64_images: Optional[bool] = None,
1569
+ block_ads: Optional[bool] = None,
1570
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1571
+ extract: Optional[JsonConfig] = None,
1572
+ json_options: Optional[JsonConfig] = None,
1573
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1574
+ agent: Optional[AgentOptions] = None,
1575
+ max_concurrency: Optional[int] = None,
1576
+ zero_data_retention: Optional[bool] = None,
1577
+ idempotency_key: Optional[str] = None,
1578
+ **kwargs
1579
+ ) -> 'CrawlWatcher':
1580
+ """
1581
+ Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
1582
+
1583
+ Args:
1584
+ urls (List[str]): URLs to scrape
1585
+ formats (Optional[List[Literal]]): Content formats to retrieve
1586
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1587
+ include_tags (Optional[List[str]]): HTML tags to include
1588
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1589
+ only_main_content (Optional[bool]): Extract main content only
1590
+ wait_for (Optional[int]): Wait time in milliseconds
1591
+ timeout (Optional[int]): Request timeout in milliseconds
1592
+ location (Optional[LocationConfig]): Location configuration
1593
+ mobile (Optional[bool]): Use mobile user agent
1594
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1595
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1596
+ block_ads (Optional[bool]): Block advertisements
1597
+ proxy (Optional[Literal]): Proxy type to use
1598
+ extract (Optional[JsonConfig]): Content extraction config
1599
+ json_options (Optional[JsonConfig]): JSON extraction config
1600
+ actions (Optional[List[Union]]): Actions to perform
1601
+ agent (Optional[AgentOptions]): Agent configuration
1602
+ max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1603
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
1604
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1605
+ **kwargs: Additional parameters to pass to the API
1606
+
1607
+ Returns:
1608
+ CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
1609
+
1610
+ Raises:
1611
+ Exception: If batch scrape job fails to start
1612
+ """
1613
+ # Validate any additional kwargs
1614
+ self._validate_kwargs(kwargs, "batch_scrape_urls_and_watch")
1615
+
1616
+ scrape_params = {}
1617
+
1618
+ # Add individual parameters
1619
+ if formats is not None:
1620
+ scrape_params['formats'] = formats
1621
+ if headers is not None:
1622
+ scrape_params['headers'] = headers
1623
+ if include_tags is not None:
1624
+ scrape_params['includeTags'] = include_tags
1625
+ if exclude_tags is not None:
1626
+ scrape_params['excludeTags'] = exclude_tags
1627
+ if only_main_content is not None:
1628
+ scrape_params['onlyMainContent'] = only_main_content
1629
+ if wait_for is not None:
1630
+ scrape_params['waitFor'] = wait_for
1631
+ if timeout is not None:
1632
+ scrape_params['timeout'] = timeout
1633
+ if location is not None:
1634
+ scrape_params['location'] = location.dict(exclude_none=True)
1635
+ if mobile is not None:
1636
+ scrape_params['mobile'] = mobile
1637
+ if skip_tls_verification is not None:
1638
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1639
+ if remove_base64_images is not None:
1640
+ scrape_params['removeBase64Images'] = remove_base64_images
1641
+ if block_ads is not None:
1642
+ scrape_params['blockAds'] = block_ads
1643
+ if proxy is not None:
1644
+ scrape_params['proxy'] = proxy
1645
+ if extract is not None:
1646
+ extract = self._ensure_schema_dict(extract)
1647
+ if isinstance(extract, dict) and "schema" in extract:
1648
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1649
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1650
+ if json_options is not None:
1651
+ json_options = self._ensure_schema_dict(json_options)
1652
+ if isinstance(json_options, dict) and "schema" in json_options:
1653
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1654
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1655
+ if actions:
1656
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
1657
+ if agent is not None:
1658
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1659
+ if max_concurrency is not None:
1660
+ scrape_params['maxConcurrency'] = max_concurrency
1661
+ if zero_data_retention is not None:
1662
+ scrape_params['zeroDataRetention'] = zero_data_retention
1663
+
1664
+ # Add any additional kwargs
1665
+ scrape_params.update(kwargs)
1666
+
1667
+ # Create final params object
1668
+ final_params = ScrapeParams(**scrape_params)
1669
+ params_dict = final_params.dict(exclude_none=True)
1670
+ params_dict['urls'] = urls
1671
+ params_dict['origin'] = f"python-sdk@{version}"
1672
+
1673
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1674
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1675
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1676
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1677
+
1678
+ # Make request
1679
+ headers = self._prepare_headers(idempotency_key)
1680
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1681
+
1682
+ if response.status_code == 200:
1683
+ try:
1684
+ crawl_response = BatchScrapeResponse(**response.json())
1685
+ except (ValueError, pydantic.ValidationError):
1686
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1687
+ if crawl_response.success and crawl_response.id:
1688
+ return CrawlWatcher(crawl_response.id, self)
1689
+ else:
1690
+ raise Exception('Batch scrape job failed to start')
1691
+ else:
1692
+ self._handle_error(response, 'start batch scrape job')
1693
+
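Note that the returned CrawlWatcher does not open its WebSocket until connect() is awaited; a sketch of the intended flow, with a placeholder key, URL and handler:

import asyncio
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key

watcher = app.batch_scrape_urls_and_watch(
    ["https://example.com"], formats=["markdown"]
)
watcher.add_event_listener(
    "done", lambda detail: print("finished with", len(detail["data"]), "documents")
)

asyncio.run(watcher.connect())  # listens until the 'done' or 'error' event arrives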
1694
+ def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
1695
+ """
1696
+ Check the status of a batch scrape job using the Firecrawl API.
1697
+
1698
+ Args:
1699
+ id (str): The ID of the batch scrape job.
1700
+
1701
+ Returns:
1702
+ BatchScrapeStatusResponse: The status of the batch scrape job.
1703
+
1704
+ Raises:
1705
+ Exception: If the status check request fails.
1706
+ """
1707
+ endpoint = f'/v1/batch/scrape/{id}'
1708
+
1709
+ headers = self._prepare_headers()
1710
+ response = self._get_request(f'{self.api_url}{endpoint}', headers)
1711
+ if response.status_code == 200:
1712
+ try:
1713
+ status_data = response.json()
1714
+ except:
1715
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1716
+ if status_data['status'] == 'completed':
1717
+ if 'data' in status_data:
1718
+ data = status_data['data']
1719
+ while 'next' in status_data:
1720
+ if len(status_data['data']) == 0:
1721
+ break
1722
+ next_url = status_data.get('next')
1723
+ if not next_url:
1724
+ logger.warning("Expected 'next' URL is missing.")
1725
+ break
1726
+ try:
1727
+ status_response = self._get_request(next_url, headers)
1728
+ if status_response.status_code != 200:
1729
+ logger.error(f"Failed to fetch next page: {status_response.status_code}")
1730
+ break
1731
+ try:
1732
+ next_data = status_response.json()
1733
+ except:
1734
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1735
+ data.extend(next_data.get('data', []))
1736
+ status_data = next_data
1737
+ except Exception as e:
1738
+ logger.error(f"Error during pagination request: {e}")
1739
+ break
1740
+ status_data['data'] = data
1741
+
1742
+ return BatchScrapeStatusResponse(**{
1743
+ 'success': False if 'error' in status_data else True,
1744
+ 'status': status_data.get('status'),
1745
+ 'total': status_data.get('total'),
1746
+ 'completed': status_data.get('completed'),
1747
+ 'creditsUsed': status_data.get('creditsUsed'),
1748
+ 'expiresAt': status_data.get('expiresAt'),
1749
+ 'data': status_data.get('data'),
1750
+ 'next': status_data.get('next'),
1751
+ 'error': status_data.get('error')
1752
+ })
1753
+ else:
1754
+ self._handle_error(response, 'check batch scrape status')
1755
+
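Because the method above follows the paginated 'next' links itself, callers see the aggregated data in a single response; a polling sketch against a previously started job (key and job id are placeholders):

import time
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")        # placeholder key
job_id = "00000000-0000-0000-0000-000000000000"  # id returned by async_batch_scrape_urls

while True:
    status = app.check_batch_scrape_status(job_id)
    if status.status == "completed":
        break
    time.sleep(5)
print(f"{status.completed}/{status.total} pages scraped")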
1756
+ def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
1757
+ """
1758
+ Returns information about batch scrape errors.
1759
+
1760
+ Args:
1761
+ id (str): The ID of the batch scrape job.
1762
+
1763
+ Returns:
1764
+ CrawlErrorsResponse containing:
1765
+ * errors (List[Dict[str, str]]): List of errors with fields:
1766
+ * id (str): Error ID
1767
+ * timestamp (str): When the error occurred
1768
+ * url (str): URL that caused the error
1769
+ * error (str): Error message
1770
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1771
+
1772
+ Raises:
1773
+ Exception: If the error check request fails
1774
+ """
1775
+ headers = self._prepare_headers()
1776
+ response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
1777
+ if response.status_code == 200:
1778
+ try:
1779
+ return CrawlErrorsResponse(**response.json())
1780
+ except:
1781
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1782
+ else:
1783
+ self._handle_error(response, "check batch scrape errors")
1784
+
1785
+ def extract(
1786
+ self,
1787
+ urls: Optional[List[str]] = None,
1788
+ *,
1789
+ prompt: Optional[str] = None,
1790
+ schema: Optional[Any] = None,
1791
+ system_prompt: Optional[str] = None,
1792
+ allow_external_links: Optional[bool] = False,
1793
+ enable_web_search: Optional[bool] = False,
1794
+ show_sources: Optional[bool] = False,
1795
+ agent: Optional[Dict[str, Any]] = None,
1796
+ **kwargs) -> ExtractResponse[Any]:
1797
+ """
1798
+ Extract structured information from URLs.
1799
+
1800
+ Args:
1801
+ urls (Optional[List[str]]): URLs to extract from
1802
+ prompt (Optional[str]): Custom extraction prompt
1803
+ schema (Optional[Any]): JSON schema/Pydantic model
1804
+ system_prompt (Optional[str]): System context
1805
+ allow_external_links (Optional[bool]): Follow external links
1806
+ enable_web_search (Optional[bool]): Enable web search
1807
+ show_sources (Optional[bool]): Include source URLs
1808
+ agent (Optional[Dict[str, Any]]): Agent configuration
1809
+ **kwargs: Additional parameters to pass to the API
1810
+
1811
+ Returns:
1812
+ ExtractResponse[Any] with:
1813
+ * success (bool): Whether request succeeded
1814
+ * data (Optional[Any]): Extracted data matching schema
1815
+ * error (Optional[str]): Error message if any
1816
+
1817
+ Raises:
1818
+ ValueError: If prompt/schema missing or extraction fails
1819
+ """
1820
+ # Validate any additional kwargs
1821
+ self._validate_kwargs(kwargs, "extract")
1822
+
1823
+ headers = self._prepare_headers()
1824
+
1825
+ if not prompt and not schema:
1826
+ raise ValueError("Either prompt or schema is required")
1827
+
1828
+ if not urls and not prompt:
1829
+ raise ValueError("Either urls or prompt is required")
1830
+
1831
+ if schema:
1832
+ schema = self._ensure_schema_dict(schema)
1833
+
1834
+ request_data = {
1835
+ 'urls': urls or [],
1836
+ 'allowExternalLinks': allow_external_links,
1837
+ 'enableWebSearch': enable_web_search,
1838
+ 'showSources': show_sources,
1839
+ 'schema': schema,
1840
+ 'origin': f'python-sdk@{get_version()}'
1841
+ }
1842
+
1843
+ # Only add prompt and systemPrompt if they exist
1844
+ if prompt:
1845
+ request_data['prompt'] = prompt
1846
+ if system_prompt:
1847
+ request_data['systemPrompt'] = system_prompt
1848
+
1849
+ if agent:
1850
+ request_data['agent'] = agent
1851
+
1852
+ # Add any additional kwargs
1853
+ request_data.update(kwargs)
1854
+
1855
+ try:
1856
+ # Send the initial extract request
1857
+ response = self._post_request(
1858
+ f'{self.api_url}/v1/extract',
1859
+ request_data,
1860
+ headers
1861
+ )
1862
+ if response.status_code == 200:
1863
+ try:
1864
+ data = response.json()
1865
+ except:
1866
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1867
+ if data['success']:
1868
+ job_id = data.get('id')
1869
+ if not job_id:
1870
+ raise Exception('Job ID not returned from extract request.')
1871
+
1872
+ # Poll for the extract status
1873
+ while True:
1874
+ status_response = self._get_request(
1875
+ f'{self.api_url}/v1/extract/{job_id}',
1876
+ headers
1877
+ )
1878
+ if status_response.status_code == 200:
1879
+ try:
1880
+ status_data = status_response.json()
1881
+ except:
1882
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1883
+ if status_data['status'] == 'completed':
1884
+ return ExtractResponse(**status_data)
1885
+ elif status_data['status'] in ['failed', 'cancelled']:
1886
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data.get("error")}')
1887
+ else:
1888
+ self._handle_error(status_response, "extract-status")
1889
+
1890
+ time.sleep(2) # Polling interval
1891
+ else:
1892
+ raise Exception(f'Failed to extract. Error: {data["error"]}')
1893
+ else:
1894
+ self._handle_error(response, "extract")
1895
+ except Exception as e:
1896
+ raise ValueError(str(e), 500)
1897
+
1898
+ return ExtractResponse(success=False, error="Internal server error.")
1899
+
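A sketch of the schema path through extract(): a Pydantic model class is accepted directly and converted to a JSON schema dict by _ensure_schema_dict before the request is sent; the model, prompt and URL below are illustrative:

from pydantic import BaseModel
from firecrawl import FirecrawlApp

class ArticleInfo(BaseModel):
    title: str
    author: str

app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key

result = app.extract(
    ["https://example.com/blog/post"],
    prompt="Extract the article title and author.",
    schema=ArticleInfo,
)
if result.success:
    print(result.data)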
1900
+ def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
1901
+ """
1902
+ Retrieve the status of an extract job.
1903
+
1904
+ Args:
1905
+ job_id (str): The ID of the extract job.
1906
+
1907
+ Returns:
1908
+ ExtractResponse[Any]: The status of the extract job.
1909
+
1910
+ Raises:
1911
+ ValueError: If there is an error retrieving the status.
1912
+ """
1913
+ headers = self._prepare_headers()
1914
+ try:
1915
+ response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
1916
+ if response.status_code == 200:
1917
+ try:
1918
+ return ExtractResponse(**response.json())
1919
+ except:
1920
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1921
+ else:
1922
+ self._handle_error(response, "get extract status")
1923
+ except Exception as e:
1924
+ raise ValueError(str(e), 500)
1925
+
1926
+ def async_extract(
1927
+ self,
1928
+ urls: Optional[List[str]] = None,
1929
+ *,
1930
+ prompt: Optional[str] = None,
1931
+ schema: Optional[Any] = None,
1932
+ system_prompt: Optional[str] = None,
1933
+ allow_external_links: Optional[bool] = False,
1934
+ enable_web_search: Optional[bool] = False,
1935
+ show_sources: Optional[bool] = False,
1936
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1937
+ """
1938
+ Initiate an asynchronous extract job.
1939
+
1940
+ Args:
1941
+ urls (List[str]): URLs to extract information from
1942
+ prompt (Optional[str]): Custom extraction prompt
1943
+ schema (Optional[Any]): JSON schema/Pydantic model
1944
+ system_prompt (Optional[str]): System context
1945
+ allow_external_links (Optional[bool]): Follow external links
1946
+ enable_web_search (Optional[bool]): Enable web search
1947
+ show_sources (Optional[bool]): Include source URLs
1948
+ agent (Optional[Dict[str, Any]]): Agent configuration
1950
+
1951
+ Returns:
1952
+ ExtractResponse[Any] with:
1953
+ * success (bool): Whether request succeeded
1954
+ * data (Optional[Any]): Extracted data matching schema
1955
+ * error (Optional[str]): Error message if any
1956
+
1957
+ Raises:
1958
+ ValueError: If job initiation fails
1959
+ """
1960
+ headers = self._prepare_headers()
1961
+
1962
+ schema = schema
1963
+ if schema:
1964
+ schema = self._ensure_schema_dict(schema)
1965
+
1966
+ request_data = {
1967
+ 'urls': urls,
1968
+ 'allowExternalLinks': allow_external_links,
1969
+ 'enableWebSearch': enable_web_search,
1970
+ 'showSources': show_sources,
1971
+ 'schema': schema,
1972
+ 'origin': f'python-sdk@{version}'
1973
+ }
1974
+
1975
+ if prompt:
1976
+ request_data['prompt'] = prompt
1977
+ if system_prompt:
1978
+ request_data['systemPrompt'] = system_prompt
1979
+ if agent:
1980
+ request_data['agent'] = agent
1981
+
1982
+ try:
1983
+ response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
1984
+ if response.status_code == 200:
1985
+ try:
1986
+ return ExtractResponse(**response.json())
1987
+ except:
1988
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1989
+ else:
1990
+ self._handle_error(response, "async extract")
1991
+ except Exception as e:
1992
+ raise ValueError(str(e), 500)
1993
+
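async_extract pairs with get_extract_status for manual polling; a sketch, assuming the start and status responses expose the job id and status fields as the API returns them:

import time
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key

job = app.async_extract(
    ["https://example.com/pricing"],
    prompt="List the plan names and their monthly prices.",
)
while True:
    status = app.get_extract_status(job.id)  # assumes the response models carry id / status
    if status.status in ("completed", "failed", "cancelled"):
        break
    time.sleep(2)
print(status.status, status.data)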
1994
+ def generate_llms_text(
1995
+ self,
1996
+ url: str,
1997
+ *,
1998
+ max_urls: Optional[int] = None,
1999
+ show_full_text: Optional[bool] = None,
2000
+ cache: Optional[bool] = None,
2001
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
2002
+ """
2003
+ Generate LLMs.txt for a given URL and poll until completion.
2004
+
2005
+ Args:
2006
+ url (str): Target URL to generate LLMs.txt from
2007
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
2008
+ show_full_text (Optional[bool]): Include full text in output (default: False)
2009
+ cache (Optional[bool]): Whether to use cached content if available (default: True)
2010
+ experimental_stream (Optional[bool]): Enable experimental streaming
2011
+
2012
+ Returns:
2013
+ GenerateLLMsTextStatusResponse with:
2014
+ * Generated LLMs.txt content
2015
+ * Full version if requested
2016
+ * Generation status
2017
+ * Success/error information
2018
+
2019
+ Raises:
2020
+ Exception: If generation fails
2021
+ """
2022
+ params = GenerateLLMsTextParams(
2023
+ maxUrls=max_urls,
2024
+ showFullText=show_full_text,
2025
+ cache=cache,
2026
+ __experimental_stream=experimental_stream
2027
+ )
2028
+
2029
+ response = self.async_generate_llms_text(
2030
+ url,
2031
+ max_urls=max_urls,
2032
+ show_full_text=show_full_text,
2033
+ cache=cache,
2034
+ experimental_stream=experimental_stream
2035
+ )
2036
+
2037
+ if not response.success or not response.id:
2038
+ return GenerateLLMsTextStatusResponse(
2039
+ success=False,
2040
+ error='Failed to start LLMs.txt generation',
2041
+ status='failed',
2042
+ expiresAt=''
2043
+ )
2044
+
2045
+ job_id = response.id
2046
+ while True:
2047
+ status = self.check_generate_llms_text_status(job_id)
2048
+
2049
+ if status.status == 'completed':
2050
+ return status
2051
+ elif status.status == 'failed':
2052
+ return status
2053
+ elif status.status != 'processing':
2054
+ return GenerateLLMsTextStatusResponse(
2055
+ success=False,
2056
+ error='LLMs.txt generation job terminated unexpectedly',
2057
+ status='failed',
2058
+ expiresAt=''
2059
+ )
2060
+
2061
+ time.sleep(2) # Polling interval
2062
+
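A usage sketch of the blocking generator above; the data keys follow the docstring (llmstxt, llmsfulltxt), and the key and URL are placeholders:

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key

result = app.generate_llms_text(
    "https://example.com",
    max_urls=5,
    show_full_text=True,
)
if result.success and result.data:
    print(result.data.get("llmstxt", "")[:500])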
2063
+ def async_generate_llms_text(
2064
+ self,
2065
+ url: str,
2066
+ *,
2067
+ max_urls: Optional[int] = None,
2068
+ show_full_text: Optional[bool] = None,
2069
+ cache: Optional[bool] = None,
2070
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
2071
+ """
2072
+ Initiate an asynchronous LLMs.txt generation operation.
2073
+
2074
+ Args:
2075
+ url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
2076
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
2077
+ show_full_text (Optional[bool]): Include full text in output (default: False)
2078
+ cache (Optional[bool]): Whether to use cached content if available (default: True)
2079
+ experimental_stream (Optional[bool]): Enable experimental streaming
2080
+
2081
+ Returns:
2082
+ GenerateLLMsTextResponse: A response containing:
2083
+ * success (bool): Whether the generation initiation was successful
2084
+ * id (str): The unique identifier for the generation job
2085
+ * error (str, optional): Error message if initiation failed
2086
+
2087
+ Raises:
2088
+ Exception: If the generation job initiation fails.
2089
+ """
2090
+ params = GenerateLLMsTextParams(
2091
+ maxUrls=max_urls,
2092
+ showFullText=show_full_text,
2093
+ cache=cache,
2094
+ __experimental_stream=experimental_stream
2095
+ )
2096
+
2097
+ headers = self._prepare_headers()
2098
+ json_data = {'url': url, **params.dict(exclude_none=True)}
2099
+ json_data['origin'] = f"python-sdk@{version}"
2100
+
2101
+ try:
2102
+ req = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
2103
+ response = req.json()
2104
+ if response.get('success'):
2105
+ try:
2106
+ return GenerateLLMsTextResponse(**response)
2107
+ except:
2108
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2109
+ else:
2110
+ self._handle_error(req, 'start LLMs.txt generation')
2113
+ except Exception as e:
2114
+ raise ValueError(str(e))
2115
+
2116
+ return GenerateLLMsTextResponse(
2117
+ success=False,
2118
+ error='Internal server error'
2119
+ )
2120
+
2121
+ def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
2122
+ """
2123
+ Check the status of an LLMs.txt generation operation.
2124
+
2125
+ Args:
2126
+ id (str): The unique identifier of the LLMs.txt generation job to check status for.
2127
+
2128
+ Returns:
2129
+ GenerateLLMsTextStatusResponse: A response containing:
2130
+ * success (bool): Whether the generation was successful
2131
+ * status (str): Status of generation ("processing", "completed", "failed")
2132
+ * data (Dict[str, str], optional): Generated text with fields:
2133
+ * llmstxt (str): Generated LLMs.txt content
2134
+ * llmsfulltxt (str, optional): Full version if requested
2135
+ * error (str, optional): Error message if generation failed
2136
+ * expiresAt (str): When the generated data expires
2137
+
2138
+ Raises:
2139
+ Exception: If the status check fails.
2140
+ """
2141
+ headers = self._prepare_headers()
2142
+ try:
2143
+ response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers)
2144
+ if response.status_code == 200:
2145
+ try:
2146
+ json_data = response.json()
2147
+ return GenerateLLMsTextStatusResponse(**json_data)
2148
+ except Exception as e:
2149
+ raise Exception(f'Failed to parse Firecrawl response as GenerateLLMsTextStatusResponse: {str(e)}')
2150
+ elif response.status_code == 404:
2151
+ raise Exception('LLMs.txt generation job not found')
2152
+ else:
2153
+ self._handle_error(response, 'check LLMs.txt generation status')
2154
+ except Exception as e:
2155
+ raise ValueError(str(e))
2156
+
2157
+ return GenerateLLMsTextStatusResponse(success=False, error='Internal server error', status='failed', expiresAt='')
2158
+
2159
+ def _prepare_headers(
2160
+ self,
2161
+ idempotency_key: Optional[str] = None) -> Dict[str, str]:
2162
+ """
2163
+ Prepare the headers for API requests.
2164
+
2165
+ Args:
2166
+ idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
2167
+
2168
+ Returns:
2169
+ Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
2170
+ """
2171
+ if idempotency_key:
2172
+ return {
2173
+ 'Content-Type': 'application/json',
2174
+ 'Authorization': f'Bearer {self.api_key}',
2175
+ 'x-idempotency-key': idempotency_key
2176
+ }
2177
+
2178
+ return {
2179
+ 'Content-Type': 'application/json',
2180
+ 'Authorization': f'Bearer {self.api_key}',
2181
+ }
2182
+
2183
+ def _post_request(
2184
+ self,
2185
+ url: str,
2186
+ data: Dict[str, Any],
2187
+ headers: Dict[str, str],
2188
+ retries: int = 3,
2189
+ backoff_factor: float = 0.5) -> requests.Response:
2190
+ """
2191
+ Make a POST request with retries.
2192
+
2193
+ Args:
2194
+ url (str): The URL to send the POST request to.
2195
+ data (Dict[str, Any]): The JSON data to include in the POST request.
2196
+ headers (Dict[str, str]): The headers to include in the POST request.
2197
+ retries (int): Number of retries for the request.
2198
+ backoff_factor (float): Backoff factor for retries.
2199
+
2200
+ Returns:
2201
+ requests.Response: The response from the POST request.
2202
+
2203
+ Raises:
2204
+ requests.RequestException: If the request fails after the specified retries.
2205
+ """
2206
+ for attempt in range(retries):
2207
+ response = requests.post(url, headers=headers, json=data, timeout=((data["timeout"] + 5000) if "timeout" in data else None))
2208
+ if response.status_code == 502:
2209
+ time.sleep(backoff_factor * (2 ** attempt))
2210
+ else:
2211
+ return response
2212
+ return response
2213
+
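With the defaults above (retries=3, backoff_factor=0.5), only HTTP 502 responses are retried and the sleep before each retry grows geometrically; the schedule works out as follows:

# Delay schedule implied by the retry loop above (not part of the package).
retries, backoff_factor = 3, 0.5
delays = [backoff_factor * (2 ** attempt) for attempt in range(retries)]
print(delays)  # [0.5, 1.0, 2.0] seconds slept after each successive 502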
2214
+ def _get_request(
2215
+ self,
2216
+ url: str,
2217
+ headers: Dict[str, str],
2218
+ retries: int = 3,
2219
+ backoff_factor: float = 0.5) -> requests.Response:
2220
+ """
2221
+ Make a GET request with retries.
2222
+
2223
+ Args:
2224
+ url (str): The URL to send the GET request to.
2225
+ headers (Dict[str, str]): The headers to include in the GET request.
2226
+ retries (int): Number of retries for the request.
2227
+ backoff_factor (float): Backoff factor for retries.
2228
+
2229
+ Returns:
2230
+ requests.Response: The response from the GET request.
2231
+
2232
+ Raises:
2233
+ requests.RequestException: If the request fails after the specified retries.
2234
+ """
2235
+ for attempt in range(retries):
2236
+ response = requests.get(url, headers=headers)
2237
+ if response.status_code == 502:
2238
+ time.sleep(backoff_factor * (2 ** attempt))
2239
+ else:
2240
+ return response
2241
+ return response
2242
+
2243
+ def _delete_request(
2244
+ self,
2245
+ url: str,
2246
+ headers: Dict[str, str],
2247
+ retries: int = 3,
2248
+ backoff_factor: float = 0.5) -> requests.Response:
2249
+ """
2250
+ Make a DELETE request with retries.
2251
+
2252
+ Args:
2253
+ url (str): The URL to send the DELETE request to.
2254
+ headers (Dict[str, str]): The headers to include in the DELETE request.
2255
+ retries (int): Number of retries for the request.
2256
+ backoff_factor (float): Backoff factor for retries.
2257
+
2258
+ Returns:
2259
+ requests.Response: The response from the DELETE request.
2260
+
2261
+ Raises:
2262
+ requests.RequestException: If the request fails after the specified retries.
2263
+ """
2264
+ for attempt in range(retries):
2265
+ response = requests.delete(url, headers=headers)
2266
+ if response.status_code == 502:
2267
+ time.sleep(backoff_factor * (2 ** attempt))
2268
+ else:
2269
+ return response
2270
+ return response
2271
+
2272
+ def _monitor_job_status(
2273
+ self,
2274
+ id: str,
2275
+ headers: Dict[str, str],
2276
+ poll_interval: int) -> CrawlStatusResponse:
2277
+ """
2278
+ Monitor the status of a crawl job until completion.
2279
+
2280
+ Args:
2281
+ id (str): The ID of the crawl job.
2282
+ headers (Dict[str, str]): The headers to include in the status check requests.
2283
+ poll_interval (int): Seconds between status checks.
2284
+
2285
+ Returns:
2286
+ CrawlStatusResponse: The crawl results if the job is completed successfully.
2287
+
2288
+ Raises:
2289
+ Exception: If the job fails or an error occurs during status checks.
2290
+ """
2291
+ while True:
2292
+ api_url = f'{self.api_url}/v1/crawl/{id}'
2293
+
2294
+ status_response = self._get_request(api_url, headers)
2295
+ if status_response.status_code == 200:
2296
+ try:
2297
+ status_data = status_response.json()
2298
+ except:
2299
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
2300
+ if status_data['status'] == 'completed':
2301
+ if 'data' in status_data:
2302
+ data = status_data['data']
2303
+ while 'next' in status_data:
2304
+ if len(status_data['data']) == 0:
2305
+ break
2306
+ status_response = self._get_request(status_data['next'], headers)
2307
+ try:
2308
+ status_data = status_response.json()
2309
+ except:
2310
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
2311
+ data.extend(status_data.get('data', []))
2312
+ status_data['data'] = data
2313
+ return CrawlStatusResponse(**status_data)
2314
+ else:
2315
+ raise Exception('Crawl job completed but no data was returned')
2316
+ elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
2317
+ poll_interval=max(poll_interval,2)
2318
+ time.sleep(poll_interval) # Wait for the specified interval before checking again
2319
+ else:
2320
+ raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
2321
+ else:
2322
+ self._handle_error(status_response, 'check crawl status')
2323
+
2324
+ def _handle_error(
2325
+ self,
2326
+ response: requests.Response,
2327
+ action: str) -> None:
2328
+ """
2329
+ Handle errors from API responses.
2330
+
2331
+ Args:
2332
+ response (requests.Response): The response object from the API request.
2333
+ action (str): Description of the action that was being performed.
2334
+
2335
+ Raises:
2336
+ Exception: An exception with a message containing the status code and error details from the response.
2337
+ """
2338
+ try:
2339
+ error_message = response.json().get('error', 'No error message provided.')
2340
+ error_details = response.json().get('details', 'No additional error details provided.')
2341
+ except:
2342
+ raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
2343
+
2344
+ message = self._get_error_message(response.status_code, action, error_message, error_details)
2345
+
2346
+ # Raise an HTTPError with the custom message and attach the response
2347
+ raise requests.exceptions.HTTPError(message, response=response)
2348
+
2349
+ def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2350
+ """
2351
+ Generate a standardized error message based on HTTP status code.
2352
+
2353
+ Args:
2354
+ status_code (int): The HTTP status code from the response
2355
+ action (str): Description of the action that was being performed
2356
+ error_message (str): The error message from the API response
2357
+ error_details (str): Additional error details from the API response
2358
+
2359
+ Returns:
2360
+ str: A formatted error message
2361
+ """
2362
+ if status_code == 402:
2363
+ return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
2364
+ elif status_code == 403:
2365
+ return f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
2366
+ elif status_code == 408:
2367
+ return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
2368
+ elif status_code == 409:
2369
+ return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
2370
+ elif status_code == 500:
2371
+ return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
2372
+ else:
2373
+ return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
2374
+
2375
+ def deep_research(
2376
+ self,
2377
+ query: str,
2378
+ *,
2379
+ max_depth: Optional[int] = None,
2380
+ time_limit: Optional[int] = None,
2381
+ max_urls: Optional[int] = None,
2382
+ analysis_prompt: Optional[str] = None,
2383
+ system_prompt: Optional[str] = None,
2384
+ __experimental_stream_steps: Optional[bool] = None,
2385
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
2386
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
2387
+ """
2388
+ Initiates a deep research operation on a given query and polls until completion.
2389
+
2390
+ Args:
2391
+ query (str): Research query or topic to investigate
2392
+ max_depth (Optional[int]): Maximum depth of research exploration
2393
+ time_limit (Optional[int]): Time limit in seconds for research
2394
+ max_urls (Optional[int]): Maximum number of URLs to process
2395
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2396
+ system_prompt (Optional[str]): Custom system prompt
2397
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2398
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
2399
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
2400
+
2401
+ Returns:
2402
+ DeepResearchStatusResponse containing:
2403
+ * success (bool): Whether research completed successfully
2404
+ * status (str): Current state (processing/completed/failed)
2405
+ * error (Optional[str]): Error message if failed
2406
+ * id (str): Unique identifier for the research job
2407
+ * data (Any): Research findings and analysis
2408
+ * sources (List[Dict]): List of discovered sources
2409
+ * activities (List[Dict]): Research progress log
2410
+ * summaries (List[str]): Generated research summaries
2411
+
2412
+ Raises:
2413
+ Exception: If research fails
2414
+ """
2415
+ research_params = {}
2416
+ if max_depth is not None:
2417
+ research_params['maxDepth'] = max_depth
2418
+ if time_limit is not None:
2419
+ research_params['timeLimit'] = time_limit
2420
+ if max_urls is not None:
2421
+ research_params['maxUrls'] = max_urls
2422
+ if analysis_prompt is not None:
2423
+ research_params['analysisPrompt'] = analysis_prompt
2424
+ if system_prompt is not None:
2425
+ research_params['systemPrompt'] = system_prompt
2426
+ if __experimental_stream_steps is not None:
2427
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2428
+ research_params = DeepResearchParams(**research_params)
2429
+
2430
+ response = self.async_deep_research(
2431
+ query,
2432
+ max_depth=max_depth,
2433
+ time_limit=time_limit,
2434
+ max_urls=max_urls,
2435
+ analysis_prompt=analysis_prompt,
2436
+ system_prompt=system_prompt
2437
+ )
2438
+ if not response.get('success') or 'id' not in response:
2439
+ return response
2440
+
2441
+ job_id = response['id']
2442
+ last_activity_count = 0
2443
+ last_source_count = 0
2444
+
2445
+ while True:
2446
+ status = self.check_deep_research_status(job_id)
2447
+
2448
+ if on_activity and 'activities' in status:
2449
+ new_activities = status['activities'][last_activity_count:]
2450
+ for activity in new_activities:
2451
+ on_activity(activity)
2452
+ last_activity_count = len(status['activities'])
2453
+
2454
+ if on_source and 'sources' in status:
2455
+ new_sources = status['sources'][last_source_count:]
2456
+ for source in new_sources:
2457
+ on_source(source)
2458
+ last_source_count = len(status['sources'])
2459
+
2460
+ if status['status'] == 'completed':
2461
+ return status
2462
+ elif status['status'] == 'failed':
2463
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
2464
+ elif status['status'] != 'processing':
2465
+ break
2466
+
2467
+ time.sleep(2) # Polling interval
2468
+
2469
+ return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
2470
+
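A usage sketch of the blocking deep research flow with the activity callback; the polled status is the raw JSON payload, so results are read with dictionary access (key, query and limits are placeholders):

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key

def log_activity(activity):
    print(f"[{activity.get('type')}] {activity.get('message')}")

results = app.deep_research(
    "How is battery recycling regulated in the EU?",
    max_depth=3,
    time_limit=120,
    max_urls=10,
    on_activity=log_activity,
)
if results.get("success"):
    print(results.get("data"))  # research findings and analysis, shaped as returned by the API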
2471
+ def async_deep_research(
2472
+ self,
2473
+ query: str,
2474
+ *,
2475
+ max_depth: Optional[int] = None,
2476
+ time_limit: Optional[int] = None,
2477
+ max_urls: Optional[int] = None,
2478
+ analysis_prompt: Optional[str] = None,
2479
+ system_prompt: Optional[str] = None,
2480
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
2481
+ """
2482
+ Initiates an asynchronous deep research operation.
2483
+
2484
+ Args:
2485
+ query (str): Research query or topic to investigate
2486
+ max_depth (Optional[int]): Maximum depth of research exploration
2487
+ time_limit (Optional[int]): Time limit in seconds for research
2488
+ max_urls (Optional[int]): Maximum number of URLs to process
2489
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2490
+ system_prompt (Optional[str]): Custom system prompt
2491
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2492
+
2493
+ Returns:
2494
+ Dict[str, Any]: A response containing:
2495
+ * success (bool): Whether the research initiation was successful
2496
+ * id (str): The unique identifier for the research job
2497
+ * error (str, optional): Error message if initiation failed
2498
+
2499
+ Raises:
2500
+ Exception: If the research initiation fails.
2501
+ """
2502
+ research_params = {}
2503
+ if max_depth is not None:
2504
+ research_params['maxDepth'] = max_depth
2505
+ if time_limit is not None:
2506
+ research_params['timeLimit'] = time_limit
2507
+ if max_urls is not None:
2508
+ research_params['maxUrls'] = max_urls
2509
+ if analysis_prompt is not None:
2510
+ research_params['analysisPrompt'] = analysis_prompt
2511
+ if system_prompt is not None:
2512
+ research_params['systemPrompt'] = system_prompt
2513
+ if __experimental_stream_steps is not None:
2514
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2515
+ research_params = DeepResearchParams(**research_params)
2516
+
2517
+ headers = self._prepare_headers()
2518
+
2519
+ json_data = {'query': query, **research_params.dict(exclude_none=True)}
2520
+ json_data['origin'] = f"python-sdk@{version}"
2521
+
2522
+ # Handle json options schema if present
2523
+ if 'jsonOptions' in json_data:
2524
+ json_opts = json_data['jsonOptions']
2525
+ if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
2526
+ json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
2527
+
2528
+ try:
2529
+ response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
2530
+ if response.status_code == 200:
2531
+ try:
2532
+ return response.json()
2533
+ except:
2534
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2535
+ else:
2536
+ self._handle_error(response, 'start deep research')
2537
+ except Exception as e:
2538
+ raise ValueError(str(e))
2539
+
2540
+ return {'success': False, 'error': 'Internal server error'}
2541
+
2542
+ def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
2543
+ """
2544
+ Check the status of a deep research operation.
2545
+
2546
+ Args:
2547
+ id (str): The ID of the deep research operation.
2548
+
2549
+ Returns:
2550
+ DeepResearchStatusResponse containing:
2551
+
2552
+ Status:
2553
+ * success - Whether research completed successfully
2554
+ * status - Current state (processing/completed/failed)
2555
+ * error - Error message if failed
2556
+
2557
+ Results:
2558
+ * id - Unique identifier for the research job
2559
+ * data - Research findings and analysis
2560
+ * sources - List of discovered sources
2561
+ * activities - Research progress log
2562
+ * summaries - Generated research summaries
2563
+
2564
+ Raises:
2565
+ Exception: If the status check fails.
2566
+ """
2567
+ headers = self._prepare_headers()
2568
+ try:
2569
+ response = self._get_request(f'{self.api_url}/v1/deep-research/{id}', headers)
2570
+ if response.status_code == 200:
2571
+ try:
2572
+ return response.json()
2573
+ except:
2574
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2575
+ elif response.status_code == 404:
2576
+ raise Exception('Deep research job not found')
2577
+ else:
2578
+ self._handle_error(response, 'check deep research status')
2579
+ except Exception as e:
2580
+ raise ValueError(str(e))
2581
+
2582
+ return {'success': False, 'error': 'Internal server error'}
2583
+
2584
+ def _validate_kwargs(self, kwargs: Dict[str, Any], method_name: str) -> None:
2585
+ """
2586
+ Validate additional keyword arguments before they are passed to the API.
2587
+ This provides early validation before the Pydantic model validation.
2588
+
2589
+ Args:
2590
+ kwargs (Dict[str, Any]): Additional keyword arguments to validate
2591
+ method_name (str): Name of the method these kwargs are for
2592
+
2593
+ Raises:
2594
+ ValueError: If kwargs contain invalid or unsupported parameters
2595
+ """
2596
+ if not kwargs:
2597
+ return
2598
+
2599
+ # Known parameter mappings for each method
2600
+ method_params = {
2601
+ "scrape_url": {"formats", "include_tags", "exclude_tags", "only_main_content", "wait_for",
2602
+ "timeout", "location", "mobile", "skip_tls_verification", "remove_base64_images",
2603
+ "block_ads", "proxy", "extract", "json_options", "actions", "change_tracking_options", "max_age", "integration"},
2604
+ "search": {"limit", "tbs", "filter", "lang", "country", "location", "timeout", "scrape_options", "integration"},
2605
+ "crawl_url": {"include_paths", "exclude_paths", "max_depth", "max_discovery_depth", "limit",
2606
+ "allow_backward_links", "allow_external_links", "ignore_sitemap", "scrape_options",
2607
+ "webhook", "deduplicate_similar_urls", "ignore_query_parameters", "regex_on_full_url", "integration"},
2608
+ "map_url": {"search", "ignore_sitemap", "include_subdomains", "sitemap_only", "limit", "timeout", "integration"},
2609
+ "extract": {"prompt", "schema", "system_prompt", "allow_external_links", "enable_web_search", "show_sources", "agent", "integration"},
2610
+ "batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2611
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2612
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2613
+ "actions", "agent", "webhook"},
2614
+ "async_batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2615
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2616
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2617
+ "actions", "agent", "webhook"},
2618
+ "batch_scrape_urls_and_watch": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2619
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2620
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2621
+ "actions", "agent", "webhook"}
2622
+ }
2623
+
2624
+ # Get allowed parameters for this method
2625
+ allowed_params = method_params.get(method_name, set())
2626
+
2627
+ # Check for unknown parameters
2628
+ unknown_params = set(kwargs.keys()) - allowed_params
2629
+ if unknown_params:
2630
+ raise ValueError(f"Unsupported parameter(s) for {method_name}: {', '.join(unknown_params)}. Please refer to the API documentation for the correct parameters.")
2631
+
2632
+ # Additional type validation can be added here if needed
2633
+ # For now, we rely on Pydantic models for detailed type validation
2634
+
2635
+ def _ensure_schema_dict(self, schema):
2636
+ """
2637
+ Utility to ensure a schema is a dict, not a Pydantic model class. Recursively converts nested dicts and lists.
2638
+ """
2639
+ if schema is None:
2640
+ return schema
2641
+ if isinstance(schema, type):
2642
+ # Pydantic v1/v2 model class
2643
+ if hasattr(schema, 'model_json_schema'):
2644
+ return schema.model_json_schema()
2645
+ elif hasattr(schema, 'schema'):
2646
+ return schema.schema()
2647
+ if isinstance(schema, dict):
2648
+ return {k: self._ensure_schema_dict(v) for k, v in schema.items()}
2649
+ if isinstance(schema, (list, tuple)):
2650
+ return [self._ensure_schema_dict(v) for v in schema]
2651
+ return schema
2652
+
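For reference, the conversion _ensure_schema_dict applies to a Pydantic model class boils down to calling its schema export; a standalone sketch using Pydantic v2:

from pydantic import BaseModel

class Product(BaseModel):
    name: str
    price: float

schema = Product.model_json_schema()  # what _ensure_schema_dict returns for a v2 model class
print(sorted(schema["properties"]))   # ['name', 'price']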
2653
+ class CrawlWatcher:
2654
+ """
2655
+ A class to watch and handle crawl job events via WebSocket connection.
2656
+
2657
+ Attributes:
2658
+ id (str): The ID of the crawl job to watch
2659
+ app (FirecrawlApp): The FirecrawlApp instance
2660
+ data (List[Dict[str, Any]]): List of crawled documents/data
2661
+ status (str): Current status of the crawl job
2662
+ ws_url (str): WebSocket URL for the crawl job
2663
+ event_handlers (dict): Dictionary of event type to list of handler functions
2664
+ """
2665
+ def __init__(self, id: str, app: FirecrawlApp):
2666
+ self.id = id
2667
+ self.app = app
2668
+ self.data: List[Dict[str, Any]] = []
2669
+ self.status = "scraping"
2670
+ self.ws_url = f"{app.api_url.replace('http', 'ws')}/v1/crawl/{id}"
2671
+ self.event_handlers = {
2672
+ 'done': [],
2673
+ 'error': [],
2674
+ 'document': []
2675
+ }
2676
+
2677
+ async def connect(self) -> None:
2678
+ """
2679
+ Establishes WebSocket connection and starts listening for messages.
2680
+ """
2681
+ async with websockets.connect(
2682
+ self.ws_url,
2683
+ max_size=None,
2684
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
2685
+ ) as websocket:
2686
+ await self._listen(websocket)
2687
+
2688
+ async def _listen(self, websocket) -> None:
2689
+ """
2690
+ Listens for incoming WebSocket messages and handles them.
2691
+
2692
+ Args:
2693
+ websocket: The WebSocket connection object
2694
+ """
2695
+ async for message in websocket:
2696
+ msg = json.loads(message)
2697
+ await self._handle_message(msg)
2698
+
2699
+ def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
2700
+ """
2701
+ Adds an event handler function for a specific event type.
2702
+
2703
+ Args:
2704
+ event_type (str): Type of event to listen for ('done', 'error', or 'document')
2705
+ handler (Callable): Function to handle the event
2706
+ """
2707
+ if event_type in self.event_handlers:
2708
+ self.event_handlers[event_type].append(handler)
2709
+
2710
+ def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
2711
+ """
2712
+ Dispatches an event to all registered handlers for that event type.
2713
+
2714
+ Args:
2715
+ event_type (str): Type of event to dispatch
2716
+ detail (Dict[str, Any]): Event details/data to pass to handlers
2717
+ """
2718
+ if event_type in self.event_handlers:
2719
+ for handler in self.event_handlers[event_type]:
2720
+ handler(detail)
2721
+
2722
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
2723
+ """
2724
+ Handles incoming WebSocket messages based on their type.
2725
+
2726
+ Args:
2727
+ msg (Dict[str, Any]): The message to handle
2728
+ """
2729
+ if msg['type'] == 'done':
2730
+ self.status = 'completed'
2731
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
2732
+ elif msg['type'] == 'error':
2733
+ self.status = 'failed'
2734
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
2735
+ elif msg['type'] == 'catchup':
2736
+ self.status = msg['data']['status']
2737
+ self.data.extend(msg['data'].get('data', []))
2738
+ for doc in self.data:
2739
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
2740
+ elif msg['type'] == 'document':
2741
+ self.data.append(msg['data'])
2742
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
2743
+
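CrawlWatcher can also be attached to a job that was started separately; a sketch, assuming async_crawl_url (defined earlier in this module, outside this hunk) returns a response carrying the job id, and that CrawlWatcher is importable from the same module:

import asyncio
from firecrawl import FirecrawlApp
from firecrawl.firecrawl import CrawlWatcher  # import path is an assumption

app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
job = app.async_crawl_url("https://example.com")

watcher = CrawlWatcher(job.id, app)
watcher.add_event_listener("document", lambda detail: print("received a document"))
watcher.add_event_listener("error", lambda detail: print("crawl error:", detail.get("error")))
watcher.add_event_listener("done", lambda detail: print("finished with", len(detail["data"]), "documents"))

asyncio.run(watcher.connect())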
2744
+ class AsyncFirecrawlApp(FirecrawlApp):
2745
+ """
2746
+ Asynchronous version of FirecrawlApp that implements async methods using aiohttp.
2747
+ Provides non-blocking alternatives to all FirecrawlApp operations.
2748
+ """
2749
+
2750
+ async def _async_request(
2751
+ self,
2752
+ method: str,
2753
+ url: str,
2754
+ headers: Dict[str, str],
2755
+ data: Optional[Dict[str, Any]] = None,
2756
+ retries: int = 3,
2757
+ backoff_factor: float = 0.5) -> Dict[str, Any]:
2758
+ """
2759
+ Generic async request method with exponential backoff retry logic.
2760
+
2761
+ Args:
2762
+ method (str): The HTTP method to use (e.g., "GET" or "POST").
2763
+ url (str): The URL to send the request to.
2764
+ headers (Dict[str, str]): Headers to include in the request.
2765
+ data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
2766
+ retries (int): Maximum number of retry attempts (default: 3).
2767
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2768
+ Delay will be backoff_factor * (2 ** retry_count).
2769
+
2770
+ Returns:
2771
+ Dict[str, Any]: The parsed JSON response from the server.
2772
+
2773
+ Raises:
2774
+ aiohttp.ClientError: If the request fails after all retries.
2775
+ Exception: If max retries are exceeded or other errors occur.
2776
+ """
2777
+ async with aiohttp.ClientSession() as session:
2778
+ for attempt in range(retries):
2779
+ try:
2780
+ async with session.request(
2781
+ method=method, url=url, headers=headers, json=data
2782
+ ) as response:
2783
+ if response.status == 502:
2784
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2785
+ continue
2786
+ if response.status >= 300:
2787
+ await self._handle_error(response, f"make {method} request")
2788
+ return await response.json()
2789
+ except aiohttp.ClientError as e:
2790
+ if attempt == retries - 1:
2791
+ raise e
2792
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2793
+ raise Exception("Max retries exceeded")
2794
+
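+ # Editor's note (illustration only): the retry delay grows as
+ # backoff_factor * (2 ** attempt). With the defaults (retries=3,
+ # backoff_factor=0.5), a 502 response is retried after roughly:
+ #
+ #     delays = [0.5 * (2 ** attempt) for attempt in range(3)]  # [0.5, 1.0, 2.0] seconds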
2795
+ async def _async_post_request(
2796
+ self, url: str, data: Dict[str, Any], headers: Dict[str, str],
2797
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2798
+ """
2799
+ Make an async POST request with exponential backoff retry logic.
2800
+
2801
+ Args:
2802
+ url (str): The URL to send the POST request to.
2803
+ data (Dict[str, Any]): The JSON data to include in the request body.
2804
+ headers (Dict[str, str]): Headers to include in the request.
2805
+ retries (int): Maximum number of retry attempts (default: 3).
2806
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2807
+ Delay will be backoff_factor * (2 ** retry_count).
2808
+
2809
+ Returns:
2810
+ Dict[str, Any]: The parsed JSON response from the server.
2811
+
2812
+ Raises:
2813
+ aiohttp.ClientError: If the request fails after all retries.
2814
+ Exception: If max retries are exceeded or other errors occur.
2815
+ """
2816
+ return await self._async_request("POST", url, headers, data, retries, backoff_factor)
2817
+
2818
+ async def _async_get_request(
2819
+ self, url: str, headers: Dict[str, str],
2820
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2821
+ """
2822
+ Make an async GET request with exponential backoff retry logic.
2823
+
2824
+ Args:
2825
+ url (str): The URL to send the GET request to.
2826
+ headers (Dict[str, str]): Headers to include in the request.
2827
+ retries (int): Maximum number of retry attempts (default: 3).
2828
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2829
+ Delay will be backoff_factor * (2 ** retry_count).
2830
+
2831
+ Returns:
2832
+ Dict[str, Any]: The parsed JSON response from the server.
2833
+
2834
+ Raises:
2835
+ aiohttp.ClientError: If the request fails after all retries.
2836
+ Exception: If max retries are exceeded or other errors occur.
2837
+ """
2838
+ return await self._async_request("GET", url, headers, None, retries, backoff_factor)
2839
+
2840
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
2841
+ """
2842
+ Handle errors from async API responses with detailed error messages.
2843
+
2844
+ Args:
2845
+ response (aiohttp.ClientResponse): The response object from the failed request
2846
+ action (str): Description of the action that was being attempted
2847
+
2848
+ Raises:
2849
+ aiohttp.ClientError: With a detailed error message based on the response status:
2850
+ - 402: Payment Required
2851
+ - 408: Request Timeout
2852
+ - 409: Conflict
2853
+ - 500: Internal Server Error
2854
+ - Other: Unexpected error with status code
2855
+ """
2856
+ try:
2857
+ error_data = await response.json()
2858
+ error_message = error_data.get('error', 'No error message provided.')
2859
+ error_details = error_data.get('details', 'No additional error details provided.')
2860
+ except:
2861
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
2862
+
2863
+ message = await self._get_async_error_message(response.status, action, error_message, error_details)
2864
+
2865
+ raise aiohttp.ClientError(message)
2866
+
2867
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2868
+ """
2869
+ Generate a standardized error message based on HTTP status code for async operations.
2870
+
2871
+ Args:
2872
+ status_code (int): The HTTP status code from the response
2873
+ action (str): Description of the action that was being performed
2874
+ error_message (str): The error message from the API response
2875
+ error_details (str): Additional error details from the API response
2876
+
2877
+ Returns:
2878
+ str: A formatted error message
2879
+ """
2880
+ return self._get_error_message(status_code, action, error_message, error_details)
2881
+
2882
+ async def crawl_url_and_watch(
2883
+ self,
2884
+ url: str,
2885
+ params: Optional[CrawlParams] = None,
2886
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2887
+ """
2888
+ Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket.
2889
+
2890
+ Args:
2891
+ url (str): Target URL to start crawling from
2892
+ params (Optional[CrawlParams]): See CrawlParams model for configuration:
2893
+ URL Discovery:
2894
+ * includePaths - Patterns of URLs to include
2895
+ * excludePaths - Patterns of URLs to exclude
2896
+ * maxDepth - Maximum crawl depth
2897
+ * maxDiscoveryDepth - Maximum depth for finding new URLs
2898
+ * limit - Maximum pages to crawl
2899
+
2900
+ Link Following:
2901
+ * allowBackwardLinks - DEPRECATED: Use crawlEntireDomain instead
2902
+ * crawlEntireDomain - Follow parent directory links
2903
+ * allowExternalLinks - Follow external domain links
2904
+ * ignoreSitemap - Skip sitemap.xml processing
2905
+
2906
+ Advanced:
2907
+ * scrapeOptions - Page scraping configuration
2908
+ * webhook - Notification webhook settings
2909
+ * deduplicateSimilarURLs - Remove similar URLs
2910
+ * ignoreQueryParameters - Ignore URL parameters
2911
+ * regexOnFullURL - Apply regex to full URLs
2912
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2913
+
2914
+ Returns:
2915
+ AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket
2916
+
2917
+ Raises:
2918
+ Exception: If crawl job fails to start
2919
+ """
2920
+ crawl_response = await self.async_crawl_url(url, idempotency_key=idempotency_key, **(params.dict(exclude_none=True) if params else {}))
2921
+ if crawl_response.get('success') and 'id' in crawl_response:
2922
+ return AsyncCrawlWatcher(crawl_response['id'], self)
2923
+ else:
2924
+ raise Exception("Crawl job failed to start")
2925
+
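+ # --- Editor's usage sketch (illustration only; not shipped in this module) ---
+ # Assuming an AsyncFirecrawlApp instance `app` (the inherited FirecrawlApp
+ # constructor is expected to accept api_key), inside an async function:
+ #
+ #     watcher = await app.crawl_url_and_watch(
+ #         "https://example.com",
+ #         CrawlParams(limit=10),
+ #     )
+ #     # Attach 'document'/'done'/'error' handlers before starting the
+ #     # AsyncCrawlWatcher WebSocket loop (its connect/run API is defined elsewhere).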
2926
+ async def batch_scrape_urls_and_watch(
2927
+ self,
2928
+ urls: List[str],
2929
+ params: Optional[ScrapeParams] = None,
2930
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2931
+ """
2932
+ Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress.
2933
+
2934
+ Args:
2935
+ urls (List[str]): List of URLs to scrape
2936
+ params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
2937
+
2938
+ Content Options:
2939
+ * formats - Content formats to retrieve
2940
+ * includeTags - HTML tags to include
2941
+ * excludeTags - HTML tags to exclude
2942
+ * onlyMainContent - Extract main content only
2943
+
2944
+ Request Options:
2945
+ * headers - Custom HTTP headers
2946
+ * timeout - Request timeout (ms)
2947
+ * mobile - Use mobile user agent
2948
+ * proxy - Proxy type
2949
+
2950
+ Extraction Options:
2951
+ * extract - Content extraction config
2952
+ * jsonOptions - JSON extraction config
2953
+ * actions - Actions to perform
2954
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2955
+
2956
+ Returns:
2957
+ AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket
2958
+
2959
+ Raises:
2960
+ Exception: If batch scrape job fails to start
2961
+ """
2962
+ batch_response = await self.async_batch_scrape_urls(urls, idempotency_key=idempotency_key, **(params.dict(exclude_none=True) if params else {}))
2963
+ if batch_response.get('success') and 'id' in batch_response:
2964
+ return AsyncCrawlWatcher(batch_response['id'], self)
2965
+ else:
2966
+ raise Exception("Batch scrape job failed to start")
2967
+
2968
+ async def scrape_url(
2969
+ self,
2970
+ url: str,
2971
+ *,
2972
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
2973
+ headers: Optional[Dict[str, str]] = None,
2974
+ include_tags: Optional[List[str]] = None,
2975
+ exclude_tags: Optional[List[str]] = None,
2976
+ only_main_content: Optional[bool] = None,
2977
+ wait_for: Optional[int] = None,
2978
+ timeout: Optional[int] = None,
2979
+ location: Optional[LocationConfig] = None,
2980
+ mobile: Optional[bool] = None,
2981
+ skip_tls_verification: Optional[bool] = None,
2982
+ remove_base64_images: Optional[bool] = None,
2983
+ block_ads: Optional[bool] = None,
2984
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
2985
+ parse_pdf: Optional[bool] = None,
2986
+ extract: Optional[JsonConfig] = None,
2987
+ json_options: Optional[JsonConfig] = None,
2988
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
2989
+ **kwargs) -> ScrapeResponse[Any]:
2990
+ """
2991
+ Scrape a single URL asynchronously.
2992
+
2993
+ Args:
2994
+ url (str): Target URL to scrape
2995
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]]): Content types to retrieve (markdown/html/etc)
2996
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
2997
+ include_tags (Optional[List[str]]): HTML tags to include
2998
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
2999
+ only_main_content (Optional[bool]): Extract main content only
3000
+ wait_for (Optional[int]): Wait time in milliseconds before scraping
3001
+ timeout (Optional[int]): Request timeout (ms)
3002
+ location (Optional[LocationConfig]): Location configuration
3003
+ mobile (Optional[bool]): Use mobile user agent
3004
+ skip_tls_verification (Optional[bool]): Skip TLS verification
3005
+ remove_base64_images (Optional[bool]): Remove base64 images
3006
+ block_ads (Optional[bool]): Block ads
3007
+ proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth/auto)
+ parse_pdf (Optional[bool]): Whether to parse PDF content
3008
+ extract (Optional[JsonConfig]): Content extraction settings
3009
+ json_options (Optional[JsonConfig]): JSON extraction settings
3010
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
3011
+ **kwargs: Additional parameters to pass to the API
3012
+
3013
+ Returns:
3014
+ ScrapeResponse with:
3015
+ * success - Whether scrape was successful
3016
+ * markdown - Markdown content if requested
3017
+ * html - HTML content if requested
3018
+ * rawHtml - Raw HTML content if requested
3019
+ * links - Extracted links if requested
3020
+ * screenshot - Screenshot if requested
3021
+ * extract - Extracted data if requested
3022
+ * json - JSON data if requested
3023
+ * error - Error message if scrape failed
3024
+
3025
+ Raises:
3026
+ Exception: If scraping fails
3027
+ """
3028
+ # Validate any additional kwargs
3029
+ self._validate_kwargs(kwargs, "scrape_url")
3030
+
3031
+ _headers = self._prepare_headers()
3032
+
3033
+ # Build scrape parameters
3034
+ scrape_params = {
3035
+ 'url': url,
3036
+ 'origin': f"python-sdk@{version}"
3037
+ }
3038
+
3039
+ # Add optional parameters if provided and not None
3040
+ if formats:
3041
+ scrape_params['formats'] = formats
3042
+ if headers:
3043
+ scrape_params['headers'] = headers
3044
+ if include_tags:
3045
+ scrape_params['includeTags'] = include_tags
3046
+ if exclude_tags:
3047
+ scrape_params['excludeTags'] = exclude_tags
3048
+ if only_main_content is not None:
3049
+ scrape_params['onlyMainContent'] = only_main_content
3050
+ if wait_for:
3051
+ scrape_params['waitFor'] = wait_for
3052
+ if timeout:
3053
+ scrape_params['timeout'] = timeout
3054
+ if location:
3055
+ scrape_params['location'] = location.dict(exclude_none=True)
3056
+ if mobile is not None:
3057
+ scrape_params['mobile'] = mobile
3058
+ if skip_tls_verification is not None:
3059
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3060
+ if remove_base64_images is not None:
3061
+ scrape_params['removeBase64Images'] = remove_base64_images
3062
+ if block_ads is not None:
3063
+ scrape_params['blockAds'] = block_ads
3064
+ if proxy:
3065
+ scrape_params['proxy'] = proxy
3066
+ if parse_pdf is not None:
3067
+ scrape_params['parsePDF'] = parse_pdf
3068
+ if extract is not None:
3069
+ extract = self._ensure_schema_dict(extract)
3070
+ if isinstance(extract, dict) and "schema" in extract:
3071
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
3072
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3073
+ if json_options is not None:
3074
+ json_options = self._ensure_schema_dict(json_options)
3075
+ if isinstance(json_options, dict) and "schema" in json_options:
3076
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3077
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3078
+ if actions:
3079
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
3080
+ if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
3081
+ scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
3082
+ if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
3083
+ scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
3084
+
3085
+ # Make async request
3086
+ endpoint = f'/v1/scrape'
3087
+ response = await self._async_post_request(
3088
+ f'{self.api_url}{endpoint}',
3089
+ scrape_params,
3090
+ _headers
3091
+ )
3092
+
3093
+ if response.get('success') and 'data' in response:
3094
+ return ScrapeResponse(**response['data'])
3095
+ elif "error" in response:
3096
+ raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
3097
+ else:
3098
+ # Use the response content directly if possible, otherwise a generic message
3099
+ error_content = response.get('error', str(response))
3100
+ raise Exception(f'Failed to scrape URL. Error: {error_content}')
3101
+
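+ # --- Editor's usage sketch (illustration only; not shipped in this module) ---
+ # Given an AsyncFirecrawlApp instance `app` created with an API key, inside an
+ # async function:
+ #
+ #     doc = await app.scrape_url(
+ #         "https://example.com",
+ #         formats=["markdown", "links"],
+ #         only_main_content=True,
+ #     )
+ #     print(doc.markdown)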
3102
+ async def batch_scrape_urls(
3103
+ self,
3104
+ urls: List[str],
3105
+ *,
3106
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3107
+ headers: Optional[Dict[str, str]] = None,
3108
+ include_tags: Optional[List[str]] = None,
3109
+ exclude_tags: Optional[List[str]] = None,
3110
+ only_main_content: Optional[bool] = None,
3111
+ wait_for: Optional[int] = None,
3112
+ timeout: Optional[int] = None,
3113
+ location: Optional[LocationConfig] = None,
3114
+ mobile: Optional[bool] = None,
3115
+ skip_tls_verification: Optional[bool] = None,
3116
+ remove_base64_images: Optional[bool] = None,
3117
+ block_ads: Optional[bool] = None,
3118
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
3119
+ extract: Optional[JsonConfig] = None,
3120
+ json_options: Optional[JsonConfig] = None,
3121
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
3122
+ agent: Optional[AgentOptions] = None,
3123
+ poll_interval: Optional[int] = 2,
3124
+ idempotency_key: Optional[str] = None,
3125
+ **kwargs
3126
+ ) -> BatchScrapeStatusResponse:
3127
+ """
3128
+ Asynchronously scrape multiple URLs and monitor until completion.
3129
+
3130
+ Args:
3131
+ urls (List[str]): URLs to scrape
3132
+ formats (Optional[List[Literal]]): Content formats to retrieve
3133
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
3134
+ include_tags (Optional[List[str]]): HTML tags to include
3135
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
3136
+ only_main_content (Optional[bool]): Extract main content only
3137
+ wait_for (Optional[int]): Wait time in milliseconds
3138
+ timeout (Optional[int]): Request timeout in milliseconds
3139
+ location (Optional[LocationConfig]): Location configuration
3140
+ mobile (Optional[bool]): Use mobile user agent
3141
+ skip_tls_verification (Optional[bool]): Skip TLS verification
3142
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
3143
+ block_ads (Optional[bool]): Block advertisements
3144
+ proxy (Optional[Literal]): Proxy type to use
3145
+ extract (Optional[JsonConfig]): Content extraction config
3146
+ json_options (Optional[JsonConfig]): JSON extraction config
3147
+ actions (Optional[List[Union]]): Actions to perform
3148
+ agent (Optional[AgentOptions]): Agent configuration
3149
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
3150
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3151
+ **kwargs: Additional parameters to pass to the API
3152
+
3153
+ Returns:
3154
+ BatchScrapeStatusResponse with:
3155
+ * Scraping status and progress
3156
+ * Scraped content for each URL
3157
+ * Success/error information
3158
+
3159
+ Raises:
3160
+ Exception: If batch scrape fails
3161
+ """
3162
+ # Validate any additional kwargs
3163
+ self._validate_kwargs(kwargs, "batch_scrape_urls")
3164
+
3165
+ scrape_params = {}
3166
+
3167
+ # Add individual parameters
3168
+ if formats is not None:
3169
+ scrape_params['formats'] = formats
3170
+ if headers is not None:
3171
+ scrape_params['headers'] = headers
3172
+ if include_tags is not None:
3173
+ scrape_params['includeTags'] = include_tags
3174
+ if exclude_tags is not None:
3175
+ scrape_params['excludeTags'] = exclude_tags
3176
+ if only_main_content is not None:
3177
+ scrape_params['onlyMainContent'] = only_main_content
3178
+ if wait_for is not None:
3179
+ scrape_params['waitFor'] = wait_for
3180
+ if timeout is not None:
3181
+ scrape_params['timeout'] = timeout
3182
+ if location is not None:
3183
+ scrape_params['location'] = location.dict(exclude_none=True)
3184
+ if mobile is not None:
3185
+ scrape_params['mobile'] = mobile
3186
+ if skip_tls_verification is not None:
3187
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3188
+ if remove_base64_images is not None:
3189
+ scrape_params['removeBase64Images'] = remove_base64_images
3190
+ if block_ads is not None:
3191
+ scrape_params['blockAds'] = block_ads
3192
+ if proxy is not None:
3193
+ scrape_params['proxy'] = proxy
3194
+ if extract is not None:
3195
+ extract = self._ensure_schema_dict(extract)
3196
+ if isinstance(extract, dict) and "schema" in extract:
3197
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
3198
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3199
+ if json_options is not None:
3200
+ json_options = self._ensure_schema_dict(json_options)
3201
+ if isinstance(json_options, dict) and "schema" in json_options:
3202
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3203
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3204
+ if actions:
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
+
3205
+ if agent is not None:
3206
+ scrape_params['agent'] = agent.dict(exclude_none=True)
3207
+
3208
+ # Add any additional kwargs
3209
+ scrape_params.update(kwargs)
3210
+
3211
+ # Create final params object
3212
+ final_params = ScrapeParams(**scrape_params)
3213
+ params_dict = final_params.dict(exclude_none=True)
3214
+ params_dict['urls'] = urls
3215
+ params_dict['origin'] = f"python-sdk@{version}"
3216
+
3217
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3218
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3219
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3220
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3221
+
3222
+ # Make request
3223
+ headers = self._prepare_headers(idempotency_key)
3224
+ response = await self._async_post_request(
3225
+ f'{self.api_url}/v1/batch/scrape',
3226
+ params_dict,
3227
+ headers
3228
+ )
3229
+
3230
+ if response.get('success'):
3231
+ id = response.get('id')
3232
+ if not id:
3233
+ # API reported success but did not include a job id
3234
+ raise Exception('Firecrawl response did not include a batch scrape job id.')
3235
+ return await self._async_monitor_job_status(id, headers, poll_interval)
3236
+ else:
3237
+ await self._handle_error(response, 'start batch scrape job')
3238
+
3239
+
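+ # --- Editor's usage sketch (illustration only; not shipped in this module) ---
+ # Waits for the whole batch via the polling loop above; `app` is an
+ # AsyncFirecrawlApp instance, inside an async function:
+ #
+ #     job = await app.batch_scrape_urls(
+ #         ["https://example.com", "https://firecrawl.dev"],
+ #         formats=["markdown"],
+ #         poll_interval=5,
+ #     )
+ #     print(job.status, f"{job.completed}/{job.total} pages")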
3240
+ async def async_batch_scrape_urls(
3241
+ self,
3242
+ urls: List[str],
3243
+ *,
3244
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3245
+ headers: Optional[Dict[str, str]] = None,
3246
+ include_tags: Optional[List[str]] = None,
3247
+ exclude_tags: Optional[List[str]] = None,
3248
+ only_main_content: Optional[bool] = None,
3249
+ wait_for: Optional[int] = None,
3250
+ timeout: Optional[int] = None,
3251
+ location: Optional[LocationConfig] = None,
3252
+ mobile: Optional[bool] = None,
3253
+ skip_tls_verification: Optional[bool] = None,
3254
+ remove_base64_images: Optional[bool] = None,
3255
+ block_ads: Optional[bool] = None,
3256
+ proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
3257
+ extract: Optional[JsonConfig] = None,
3258
+ json_options: Optional[JsonConfig] = None,
3259
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
3260
+ agent: Optional[AgentOptions] = None,
3261
+ zero_data_retention: Optional[bool] = None,
3262
+ idempotency_key: Optional[str] = None,
3263
+ **kwargs
3264
+ ) -> BatchScrapeResponse:
3265
+ """
3266
+ Initiate a batch scrape job asynchronously.
3267
+
3268
+ Args:
3269
+ urls (List[str]): URLs to scrape
3270
+ formats (Optional[List[Literal]]): Content formats to retrieve
3271
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
3272
+ include_tags (Optional[List[str]]): HTML tags to include
3273
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
3274
+ only_main_content (Optional[bool]): Extract main content only
3275
+ wait_for (Optional[int]): Wait time in milliseconds
3276
+ timeout (Optional[int]): Request timeout in milliseconds
3277
+ location (Optional[LocationConfig]): Location configuration
3278
+ mobile (Optional[bool]): Use mobile user agent
3279
+ skip_tls_verification (Optional[bool]): Skip TLS verification
3280
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
3281
+ block_ads (Optional[bool]): Block advertisements
3282
+ proxy (Optional[Literal]): Proxy type to use
3283
+ extract (Optional[JsonConfig]): Content extraction config
3284
+ json_options (Optional[JsonConfig]): JSON extraction config
3285
+ actions (Optional[List[Union]]): Actions to perform
3286
+ agent (Optional[AgentOptions]): Agent configuration
3287
+ zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
3288
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3289
+ **kwargs: Additional parameters to pass to the API
3290
+
3291
+ Returns:
3292
+ BatchScrapeResponse with:
3293
+ * success - Whether job started successfully
3294
+ * id - Unique identifier for the job
3295
+ * url - Status check URL
3296
+ * error - Error message if start failed
3297
+
3298
+ Raises:
3299
+ Exception: If job initiation fails
3300
+ """
3301
+ # Validate any additional kwargs
3302
+ self._validate_kwargs(kwargs, "async_batch_scrape_urls")
3303
+
3304
+ scrape_params = {}
3305
+
3306
+ # Add individual parameters
3307
+ if formats is not None:
3308
+ scrape_params['formats'] = formats
3309
+ if headers is not None:
3310
+ scrape_params['headers'] = headers
3311
+ if include_tags is not None:
3312
+ scrape_params['includeTags'] = include_tags
3313
+ if exclude_tags is not None:
3314
+ scrape_params['excludeTags'] = exclude_tags
3315
+ if only_main_content is not None:
3316
+ scrape_params['onlyMainContent'] = only_main_content
3317
+ if wait_for is not None:
3318
+ scrape_params['waitFor'] = wait_for
3319
+ if timeout is not None:
3320
+ scrape_params['timeout'] = timeout
3321
+ if location is not None:
3322
+ scrape_params['location'] = location.dict(exclude_none=True)
3323
+ if mobile is not None:
3324
+ scrape_params['mobile'] = mobile
3325
+ if skip_tls_verification is not None:
3326
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3327
+ if remove_base64_images is not None:
3328
+ scrape_params['removeBase64Images'] = remove_base64_images
3329
+ if block_ads is not None:
3330
+ scrape_params['blockAds'] = block_ads
3331
+ if proxy is not None:
3332
+ scrape_params['proxy'] = proxy
3333
+ if extract is not None:
3334
+ extract = self._ensure_schema_dict(extract)
3335
+ if isinstance(extract, dict) and "schema" in extract:
3336
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
3337
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3338
+ if json_options is not None:
3339
+ json_options = self._ensure_schema_dict(json_options)
3340
+ if isinstance(json_options, dict) and "schema" in json_options:
3341
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3342
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3343
+ if actions:
3344
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
3345
+ if agent is not None:
3346
+ scrape_params['agent'] = agent.dict(exclude_none=True)
3347
+ if zero_data_retention is not None:
3348
+ scrape_params['zeroDataRetention'] = zero_data_retention
3349
+
3350
+ # Add any additional kwargs
3351
+ scrape_params.update(kwargs)
3352
+
3353
+ # Create final params object
3354
+ final_params = ScrapeParams(**scrape_params)
3355
+ params_dict = final_params.dict(exclude_none=True)
3356
+ params_dict['urls'] = urls
3357
+ params_dict['origin'] = f"python-sdk@{version}"
3358
+
3359
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3360
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3361
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3362
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3363
+
3364
+ # Make request
3365
+ headers = self._prepare_headers(idempotency_key)
3366
+ response = await self._async_post_request(
3367
+ f'{self.api_url}/v1/batch/scrape',
3368
+ params_dict,
3369
+ headers
3370
+ )
3371
+
3372
+ if response.get('success'):
3373
+ try:
3374
+ return BatchScrapeResponse(**response)
3375
+ except:
3376
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3377
+ else:
3378
+ await self._handle_error(response, 'start batch scrape job')
3379
+
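+ # --- Editor's usage sketch (illustration only; not shipped in this module) ---
+ # Fire-and-forget variant: start the batch job, then poll its status separately.
+ #
+ #     started = await app.async_batch_scrape_urls(["https://example.com"], formats=["html"])
+ #     status = await app.check_batch_scrape_status(started.id)
+ #     print(status.status, status.completed, status.total)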
3380
+ async def crawl_url(
3381
+ self,
3382
+ url: str,
3383
+ *,
3384
+ include_paths: Optional[List[str]] = None,
3385
+ exclude_paths: Optional[List[str]] = None,
3386
+ max_depth: Optional[int] = None,
3387
+ max_discovery_depth: Optional[int] = None,
3388
+ limit: Optional[int] = None,
3389
+ allow_backward_links: Optional[bool] = None,
3390
+ crawl_entire_domain: Optional[bool] = None,
3391
+ allow_external_links: Optional[bool] = None,
3392
+ ignore_sitemap: Optional[bool] = None,
3393
+ scrape_options: Optional[ScrapeOptions] = None,
3394
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3395
+ deduplicate_similar_urls: Optional[bool] = None,
3396
+ ignore_query_parameters: Optional[bool] = None,
3397
+ regex_on_full_url: Optional[bool] = None,
3398
+ delay: Optional[int] = None,
3399
+ allow_subdomains: Optional[bool] = None,
3400
+ poll_interval: Optional[int] = 2,
3401
+ idempotency_key: Optional[str] = None,
3402
+ **kwargs
3403
+ ) -> CrawlStatusResponse:
3404
+ """
3405
+ Crawl a website starting from a URL.
3406
+
3407
+ Args:
3408
+ url (str): Target URL to start crawling from
3409
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3410
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3411
+ max_depth (Optional[int]): Maximum crawl depth
3412
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3413
+ limit (Optional[int]): Maximum pages to crawl
3414
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
3415
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
3416
+ allow_external_links (Optional[bool]): Follow external domain links
3417
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3418
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3419
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3420
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3421
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3422
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3423
+ delay (Optional[int]): Delay in seconds between scrapes
3424
+ allow_subdomains (Optional[bool]): Follow subdomains
3425
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
3426
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3427
+ **kwargs: Additional parameters to pass to the API
3428
+
3429
+ Returns:
3430
+ CrawlStatusResponse with:
3431
+ * Crawling status and progress
3432
+ * Crawled page contents
3433
+ * Success/error information
3434
+
3435
+ Raises:
3436
+ Exception: If crawl fails
3437
+ """
3438
+ # Validate any additional kwargs
3439
+ self._validate_kwargs(kwargs, "crawl_url")
3440
+
3441
+ crawl_params = {}
3442
+
3443
+ # Add individual parameters
3444
+ if include_paths is not None:
3445
+ crawl_params['includePaths'] = include_paths
3446
+ if exclude_paths is not None:
3447
+ crawl_params['excludePaths'] = exclude_paths
3448
+ if max_depth is not None:
3449
+ crawl_params['maxDepth'] = max_depth
3450
+ if max_discovery_depth is not None:
3451
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3452
+ if limit is not None:
3453
+ crawl_params['limit'] = limit
3454
+ if crawl_entire_domain is not None:
3455
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
3456
+ elif allow_backward_links is not None:
3457
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3458
+ if allow_external_links is not None:
3459
+ crawl_params['allowExternalLinks'] = allow_external_links
3460
+ if ignore_sitemap is not None:
3461
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3462
+ if scrape_options is not None:
3463
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3464
+ if webhook is not None:
3465
+ crawl_params['webhook'] = webhook
3466
+ if deduplicate_similar_urls is not None:
3467
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3468
+ if ignore_query_parameters is not None:
3469
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3470
+ if regex_on_full_url is not None:
3471
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3472
+ if delay is not None:
3473
+ crawl_params['delay'] = delay
3474
+ if allow_subdomains is not None:
3475
+ crawl_params['allowSubdomains'] = allow_subdomains
3476
+
3477
+ # Add any additional kwargs
3478
+ crawl_params.update(kwargs)
3479
+
3480
+ # Create final params object
3481
+ final_params = CrawlParams(**crawl_params)
3482
+ params_dict = final_params.dict(exclude_none=True)
3483
+ params_dict['url'] = url
3484
+ params_dict['origin'] = f"python-sdk@{version}"
3485
+ # Make request
3486
+ headers = self._prepare_headers(idempotency_key)
3487
+ response = await self._async_post_request(
3488
+ f'{self.api_url}/v1/crawl', params_dict, headers)
3489
+
3490
+ if response.get('success'):
3491
+ id = response.get('id')
3492
+ if not id:
3493
+ # API reported success but did not include a job id
3494
+ raise Exception('Firecrawl response did not include a crawl job id.')
3495
+ return await self._async_monitor_job_status(id, headers, poll_interval)
3496
+ else:
3497
+ await self._handle_error(response, 'start crawl job')
3498
+
3499
+
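+ # --- Editor's usage sketch (illustration only; not shipped in this module) ---
+ # Blocking (polling) crawl with `app` an AsyncFirecrawlApp instance:
+ #
+ #     result = await app.crawl_url(
+ #         "https://example.com",
+ #         limit=25,
+ #         exclude_paths=["/blog/.*"],
+ #         poll_interval=3,
+ #     )
+ #     print(result.status, result.completed, "pages,", result.creditsUsed, "credits")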
3500
+ async def async_crawl_url(
3501
+ self,
3502
+ url: str,
3503
+ *,
3504
+ include_paths: Optional[List[str]] = None,
3505
+ exclude_paths: Optional[List[str]] = None,
3506
+ max_depth: Optional[int] = None,
3507
+ max_discovery_depth: Optional[int] = None,
3508
+ limit: Optional[int] = None,
3509
+ allow_backward_links: Optional[bool] = None,
3510
+ crawl_entire_domain: Optional[bool] = None,
3511
+ allow_external_links: Optional[bool] = None,
3512
+ ignore_sitemap: Optional[bool] = None,
3513
+ scrape_options: Optional[ScrapeOptions] = None,
3514
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3515
+ deduplicate_similar_urls: Optional[bool] = None,
3516
+ ignore_query_parameters: Optional[bool] = None,
3517
+ regex_on_full_url: Optional[bool] = None,
3518
+ delay: Optional[int] = None,
3519
+ allow_subdomains: Optional[bool] = None,
3520
+ poll_interval: Optional[int] = 2,
3521
+ idempotency_key: Optional[str] = None,
3522
+ **kwargs
3523
+ ) -> CrawlResponse:
3524
+ """
3525
+ Start an asynchronous crawl job.
3526
+
3527
+ Args:
3528
+ url (str): Target URL to start crawling from
3529
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3530
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3531
+ max_depth (Optional[int]): Maximum crawl depth
3532
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3533
+ limit (Optional[int]): Maximum pages to crawl
3534
+ allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
3535
+ crawl_entire_domain (Optional[bool]): Follow parent directory links
3536
+ allow_external_links (Optional[bool]): Follow external domain links
3537
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3538
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3539
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3540
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3541
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3542
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3543
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3544
+ **kwargs: Additional parameters to pass to the API
3545
+
3546
+ Returns:
3547
+ CrawlResponse with:
3548
+ * success - Whether crawl started successfully
3549
+ * id - Unique identifier for the crawl job
3550
+ * url - Status check URL for the crawl
3551
+ * error - Error message if start failed
3552
+
3553
+ Raises:
3554
+ Exception: If crawl initiation fails
3555
+ """
3556
+ crawl_params = {}
3557
+
3558
+ # Add individual parameters
3559
+ if include_paths is not None:
3560
+ crawl_params['includePaths'] = include_paths
3561
+ if exclude_paths is not None:
3562
+ crawl_params['excludePaths'] = exclude_paths
3563
+ if max_depth is not None:
3564
+ crawl_params['maxDepth'] = max_depth
3565
+ if max_discovery_depth is not None:
3566
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3567
+ if limit is not None:
3568
+ crawl_params['limit'] = limit
3569
+ if crawl_entire_domain is not None:
3570
+ crawl_params['crawlEntireDomain'] = crawl_entire_domain
3571
+ elif allow_backward_links is not None:
3572
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3573
+ if allow_external_links is not None:
3574
+ crawl_params['allowExternalLinks'] = allow_external_links
3575
+ if ignore_sitemap is not None:
3576
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3577
+ if scrape_options is not None:
3578
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3579
+ if webhook is not None:
3580
+ crawl_params['webhook'] = webhook
3581
+ if deduplicate_similar_urls is not None:
3582
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3583
+ if ignore_query_parameters is not None:
3584
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3585
+ if regex_on_full_url is not None:
3586
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3587
+ if delay is not None:
3588
+ crawl_params['delay'] = delay
3589
+ if allow_subdomains is not None:
3590
+ crawl_params['allowSubdomains'] = allow_subdomains
3591
+
3592
+ # Add any additional kwargs
3593
+ crawl_params.update(kwargs)
3594
+
3595
+ # Create final params object
3596
+ final_params = CrawlParams(**crawl_params)
3597
+ params_dict = final_params.dict(exclude_none=True)
3598
+ params_dict['url'] = url
3599
+ params_dict['origin'] = f"python-sdk@{version}"
3600
+
3601
+ # Make request
3602
+ headers = self._prepare_headers(idempotency_key)
3603
+ response = await self._async_post_request(
3604
+ f'{self.api_url}/v1/crawl',
3605
+ params_dict,
3606
+ headers
3607
+ )
3608
+
3609
+ if response.get('success'):
3610
+ try:
3611
+ return CrawlResponse(**response)
3612
+ except:
3613
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3614
+ else:
3615
+ await self._handle_error(response, 'start crawl job')
3616
+
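+ # --- Editor's usage sketch (illustration only; not shipped in this module) ---
+ # Start a crawl without waiting; the returned id is later passed to
+ # check_crawl_status (or cancel_crawl):
+ #
+ #     started = await app.async_crawl_url("https://example.com", limit=50, allow_subdomains=True)
+ #     print(started.id, started.url)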
3617
+ async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
3618
+ """
3619
+ Check the status and results of an asynchronous crawl job.
3620
+
3621
+ Args:
3622
+ id (str): Unique identifier for the crawl job
3623
+
3624
+ Returns:
3625
+ CrawlStatusResponse containing:
3626
+ Status Information:
3627
+ * status - Current state (scraping/completed/failed/cancelled)
3628
+ * completed - Number of pages crawled
3629
+ * total - Total pages to crawl
3630
+ * creditsUsed - API credits consumed
3631
+ * expiresAt - Data expiration timestamp
3632
+
3633
+ Results:
3634
+ * data - List of crawled documents
3635
+ * next - URL for next page of results (if paginated)
3636
+ * success - Whether status check succeeded
3637
+ * error - Error message if failed
3638
+
3639
+ Raises:
3640
+ Exception: If status check fails
3641
+ """
3642
+ headers = self._prepare_headers()
3643
+ endpoint = f'/v1/crawl/{id}'
3644
+
3645
+ status_data = await self._async_get_request(
3646
+ f'{self.api_url}{endpoint}',
3647
+ headers
3648
+ )
3649
+
3650
+ if status_data.get('status') == 'completed':
3651
+ if 'data' in status_data:
3652
+ data = status_data['data']
3653
+ while 'next' in status_data:
3654
+ if len(status_data['data']) == 0:
3655
+ break
3656
+ next_url = status_data.get('next')
3657
+ if not next_url:
3658
+ logger.warning("Expected 'next' URL is missing.")
3659
+ break
3660
+ next_data = await self._async_get_request(next_url, headers)
3661
+ data.extend(next_data.get('data', []))
3662
+ status_data = next_data
3663
+ status_data['data'] = data
3664
+ # Create CrawlStatusResponse object from status data
3665
+ response = CrawlStatusResponse(
3666
+ status=status_data.get('status'),
3667
+ total=status_data.get('total'),
3668
+ completed=status_data.get('completed'),
3669
+ creditsUsed=status_data.get('creditsUsed'),
3670
+ expiresAt=status_data.get('expiresAt'),
3671
+ data=status_data.get('data'),
3672
+ success=False if 'error' in status_data else True
3673
+ )
3674
+
3675
+ if 'error' in status_data:
3676
+ response.error = status_data.get('error')
3677
+
3678
+ if 'next' in status_data:
3679
+ response.next = status_data.get('next')
3680
+
3681
+ return response
3682
+
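+ # --- Editor's usage sketch (illustration only; not shipped in this module) ---
+ # Polling a previously started crawl; pagination via 'next' is handled above.
+ #
+ #     status = await app.check_crawl_status(started.id)
+ #     if status.status == "completed":
+ #         print(len(status.data or []), "documents crawled")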
3683
+ async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
3684
+ """
3685
+ Monitor the status of an asynchronous job until completion.
3686
+
3687
+ Args:
3688
+ id (str): The ID of the job to monitor
3689
+ headers (Dict[str, str]): Headers to include in status check requests
3690
+ poll_interval (int): Seconds between status checks (default: 2)
3691
+
3692
+ Returns:
3693
+ CrawlStatusResponse: The job results if completed successfully
3694
+
3695
+ Raises:
3696
+ Exception: If the job fails or an error occurs during status checks
3697
+ """
3698
+ while True:
3699
+ status_data = await self._async_get_request(
3700
+ f'{self.api_url}/v1/crawl/{id}',
3701
+ headers
3702
+ )
3703
+
3704
+ if status_data.get('status') == 'completed':
3705
+ if 'data' in status_data:
3706
+ data = status_data['data']
3707
+ while 'next' in status_data:
3708
+ if len(status_data['data']) == 0:
3709
+ break
3710
+ next_url = status_data.get('next')
3711
+ if not next_url:
3712
+ logger.warning("Expected 'next' URL is missing.")
3713
+ break
3714
+ next_data = await self._async_get_request(next_url, headers)
3715
+ data.extend(next_data.get('data', []))
3716
+ status_data = next_data
3717
+ status_data['data'] = data
3718
+ return CrawlStatusResponse(**status_data)
3719
+ else:
3720
+ raise Exception('Job completed but no data was returned')
3721
+ elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
3722
+ await asyncio.sleep(max(poll_interval, 2))
3723
+ else:
3724
+ raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
3725
+
3726
+ async def map_url(
3727
+ self,
3728
+ url: str,
3729
+ *,
3730
+ search: Optional[str] = None,
3731
+ ignore_sitemap: Optional[bool] = None,
3732
+ include_subdomains: Optional[bool] = None,
3733
+ sitemap_only: Optional[bool] = None,
3734
+ limit: Optional[int] = None,
3735
+ timeout: Optional[int] = None,
3736
+ params: Optional[MapParams] = None) -> MapResponse:
3737
+ """
3738
+ Asynchronously map and discover links from a URL.
3739
+
3740
+ Args:
3741
+ url (str): Target URL to map
3742
+ params (Optional[MapParams]): See MapParams model:
3743
+ Discovery Options:
3744
+ * search - Filter pattern for URLs
3745
+ * ignoreSitemap - Skip sitemap.xml
3746
+ * includeSubdomains - Include subdomain links
3747
+ * sitemapOnly - Only use sitemap.xml
3748
+
3749
+ Limits:
3750
+ * limit - Max URLs to return
3751
+ * timeout - Request timeout (ms)
3752
+
3753
+ Returns:
3754
+ MapResponse with:
3755
+ * Discovered URLs
3756
+ * Success/error status
3757
+
3758
+ Raises:
3759
+ Exception: If mapping fails
3760
+ """
3761
+ map_params = {}
3762
+ if params:
3763
+ map_params.update(params.dict(exclude_none=True))
3764
+
3765
+ # Add individual parameters
3766
+ if search is not None:
3767
+ map_params['search'] = search
3768
+ if ignore_sitemap is not None:
3769
+ map_params['ignoreSitemap'] = ignore_sitemap
3770
+ if include_subdomains is not None:
3771
+ map_params['includeSubdomains'] = include_subdomains
3772
+ if sitemap_only is not None:
3773
+ map_params['sitemapOnly'] = sitemap_only
3774
+ if limit is not None:
3775
+ map_params['limit'] = limit
3776
+ if timeout is not None:
3777
+ map_params['timeout'] = timeout
3778
+
3779
+ # Create final params object
3780
+ final_params = MapParams(**map_params)
3781
+ params_dict = final_params.dict(exclude_none=True)
3782
+ params_dict['url'] = url
3783
+ params_dict['origin'] = f"python-sdk@{version}"
3784
+
3785
+ # Make request
3786
+ endpoint = f'/v1/map'
3787
+ response = await self._async_post_request(
3788
+ f'{self.api_url}{endpoint}',
3789
+ params_dict,
3790
+ headers={"Authorization": f"Bearer {self.api_key}"}
3791
+ )
3792
+
3793
+ if response.get('success') and 'links' in response:
3794
+ return MapResponse(**response)
3795
+ elif 'error' in response:
3796
+ raise Exception(f'Failed to map URL. Error: {response["error"]}')
3797
+ else:
3798
+ raise Exception(f'Failed to map URL. Error: {response}')
3799
+
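+ # --- Editor's usage sketch (illustration only; not shipped in this module) ---
+ # Discover links on a site; `links` is the field checked on the response above.
+ #
+ #     mapped = await app.map_url("https://example.com", search="docs", limit=100)
+ #     print(mapped.links[:10])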
3800
+ async def extract(
3801
+ self,
3802
+ urls: Optional[List[str]] = None,
3803
+ *,
3804
+ prompt: Optional[str] = None,
3805
+ schema: Optional[Any] = None,
3806
+ system_prompt: Optional[str] = None,
3807
+ allow_external_links: Optional[bool] = False,
3808
+ enable_web_search: Optional[bool] = False,
3809
+ show_sources: Optional[bool] = False,
3810
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3811
+
3812
+ """
3813
+ Asynchronously extract structured information from URLs.
3814
+
3815
+ Args:
3816
+ urls (Optional[List[str]]): URLs to extract from
3817
+ prompt (Optional[str]): Custom extraction prompt
3818
+ schema (Optional[Any]): JSON schema/Pydantic model
3819
+ system_prompt (Optional[str]): System context
3820
+ allow_external_links (Optional[bool]): Follow external links
3821
+ enable_web_search (Optional[bool]): Enable web search
3822
+ show_sources (Optional[bool]): Include source URLs
3823
+ agent (Optional[Dict[str, Any]]): Agent configuration
3824
+
3825
+ Returns:
3826
+ ExtractResponse with:
3827
+ * Structured data matching schema
3828
+ * Source information if requested
3829
+ * Success/error status
3830
+
3831
+ Raises:
3832
+ ValueError: If prompt/schema missing or extraction fails
3833
+ """
3834
+ headers = self._prepare_headers()
3835
+
3836
+ if not prompt and not schema:
3837
+ raise ValueError("Either prompt or schema is required")
3838
+
3839
+ if not urls and not prompt:
3840
+ raise ValueError("Either urls or prompt is required")
3841
+
3842
+ if schema:
3843
+ schema = self._ensure_schema_dict(schema)
3844
+
3845
+ request_data = {
3846
+ 'urls': urls or [],
3847
+ 'allowExternalLinks': allow_external_links,
3848
+ 'enableWebSearch': enable_web_search,
3849
+ 'showSources': show_sources,
3850
+ 'schema': schema,
3851
+ 'origin': f'python-sdk@{get_version()}'
3852
+ }
3853
+
3854
+ # Only add prompt and systemPrompt if they exist
3855
+ if prompt:
3856
+ request_data['prompt'] = prompt
3857
+ if system_prompt:
3858
+ request_data['systemPrompt'] = system_prompt
3859
+
3860
+ if agent:
3861
+ request_data['agent'] = agent
3862
+
3863
+ response = await self._async_post_request(
3864
+ f'{self.api_url}/v1/extract',
3865
+ request_data,
3866
+ headers
3867
+ )
3868
+
3869
+ if response.get('success'):
3870
+ job_id = response.get('id')
3871
+ if not job_id:
3872
+ raise Exception('Job ID not returned from extract request.')
3873
+
3874
+ while True:
3875
+ status_data = await self._async_get_request(
3876
+ f'{self.api_url}/v1/extract/{job_id}',
3877
+ headers
3878
+ )
3879
+
3880
+ if status_data['status'] == 'completed':
3881
+ return ExtractResponse(**status_data)
3882
+ elif status_data['status'] in ['failed', 'cancelled']:
3883
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data.get("error")}')
3884
+
3885
+ await asyncio.sleep(2)
3886
+ else:
3887
+ raise Exception(f'Failed to extract. Error: {response.get("error")}')
3888
+
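+ # --- Editor's usage sketch (illustration only; not shipped in this module) ---
+ # A Pydantic model may be passed as the schema; _ensure_schema_dict converts it.
+ #
+ #     from pydantic import BaseModel
+ #
+ #     class Product(BaseModel):
+ #         name: str
+ #         price: str
+ #
+ #     result = await app.extract(
+ #         urls=["https://example.com/item"],
+ #         prompt="Extract the product name and price",
+ #         schema=Product,
+ #     )
+ #     print(result.data)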
3889
+ async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
3890
+ """
3891
+ Check the status of an asynchronous batch scrape job.
3892
+
3893
+ Args:
3894
+ id (str): The ID of the batch scrape job
3895
+
3896
+ Returns:
3897
+ BatchScrapeStatusResponse containing:
3898
+ Status Information:
3899
+ * status - Current state (scraping/completed/failed/cancelled)
3900
+ * completed - Number of URLs scraped
3901
+ * total - Total URLs to scrape
3902
+ * creditsUsed - API credits consumed
3903
+ * expiresAt - Data expiration timestamp
3904
+
3905
+ Results:
3906
+ * data - List of scraped documents
3907
+ * next - URL for next page of results (if paginated)
3908
+ * success - Whether status check succeeded
3909
+ * error - Error message if failed
3910
+
3911
+ Raises:
3912
+ Exception: If status check fails
3913
+ """
3914
+ headers = self._prepare_headers()
3915
+ endpoint = f'/v1/batch/scrape/{id}'
3916
+
3917
+ status_data = await self._async_get_request(
3918
+ f'{self.api_url}{endpoint}',
3919
+ headers
3920
+ )
3921
+
3922
+ if status_data['status'] == 'completed':
3923
+ if 'data' in status_data:
3924
+ data = status_data['data']
3925
+ while 'next' in status_data:
3926
+ if len(status_data['data']) == 0:
3927
+ break
3928
+ next_url = status_data.get('next')
3929
+ if not next_url:
3930
+ logger.warning("Expected 'next' URL is missing.")
3931
+ break
3932
+ next_data = await self._async_get_request(next_url, headers)
3933
+ data.extend(next_data.get('data', []))
3934
+ status_data = next_data
3935
+ status_data['data'] = data
3936
+
3937
+ response = BatchScrapeStatusResponse(
3938
+ status=status_data.get('status'),
3939
+ total=status_data.get('total'),
3940
+ completed=status_data.get('completed'),
3941
+ creditsUsed=status_data.get('creditsUsed'),
3942
+ expiresAt=status_data.get('expiresAt'),
3943
+ data=status_data.get('data')
3944
+ )
3945
+
3946
+ if 'error' in status_data:
3947
+ response.error = status_data['error']
3948
+
3949
+ if 'next' in status_data:
3950
+ response['next'] = status_data['next']
3951
+
3952
+ return {
3953
+ 'success': False if 'error' in status_data else True,
3954
+ **response
3955
+ }
3956
+
3957
+ async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
3958
+ """
3959
+ Get information about errors from an asynchronous batch scrape job.
3960
+
3961
+ Args:
3962
+ id (str): The ID of the batch scrape job
3963
+
3964
+ Returns:
3965
+ CrawlErrorsResponse containing:
3966
+ errors (List[Dict[str, str]]): List of errors with fields:
3967
+ * id (str): Error ID
3968
+ * timestamp (str): When the error occurred
3969
+ * url (str): URL that caused the error
3970
+ * error (str): Error message
3971
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3972
+
3973
+ Raises:
3974
+ Exception: If error check fails
3975
+ """
3976
+ headers = self._prepare_headers()
3977
+ return await self._async_get_request(
3978
+ f'{self.api_url}/v1/batch/scrape/{id}/errors',
3979
+ headers
3980
+ )
3981
+
3982
+ async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
3983
+ """
3984
+ Get information about errors from an asynchronous crawl job.
3985
+
3986
+ Args:
3987
+ id (str): The ID of the crawl job
3988
+
3989
+ Returns:
3990
+ CrawlErrorsResponse containing:
3991
+ * errors (List[Dict[str, str]]): List of errors with fields:
3992
+ - id (str): Error ID
3993
+ - timestamp (str): When the error occurred
3994
+ - url (str): URL that caused the error
3995
+ - error (str): Error message
3996
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3997
+
3998
+ Raises:
3999
+ Exception: If error check fails
4000
+ """
4001
+ headers = self._prepare_headers()
4002
+ return await self._async_get_request(
4003
+ f'{self.api_url}/v1/crawl/{id}/errors',
4004
+ headers
4005
+ )
4006
+
4007
+ async def cancel_crawl(self, id: str) -> Dict[str, Any]:
4008
+ """
4009
+ Cancel an asynchronous crawl job.
4010
+
4011
+ Args:
4012
+ id (str): The ID of the crawl job to cancel
4013
+
4014
+ Returns:
4015
+ Dict[str, Any] containing:
4016
+ * success (bool): Whether cancellation was successful
4017
+ * error (str, optional): Error message if cancellation failed
4018
+
4019
+ Raises:
4020
+ Exception: If cancellation fails
4021
+ """
4022
+ headers = self._prepare_headers()
4023
+ async with aiohttp.ClientSession() as session:
4024
+ async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
4025
+ return await response.json()
4026
+
4027
+ async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
4028
+ """
4029
+ Check the status of an asynchronous extraction job.
4030
+
4031
+ Args:
4032
+ job_id (str): The ID of the extraction job
4033
+
4034
+ Returns:
4035
+ ExtractResponse[Any] with:
4036
+ * success (bool): Whether request succeeded
4037
+ * data (Optional[Any]): Extracted data matching schema
4038
+ * error (Optional[str]): Error message if any
4039
+ * warning (Optional[str]): Warning message if any
4040
+ * sources (Optional[List[str]]): Source URLs if requested
4041
+
4042
+ Raises:
4043
+ ValueError: If status check fails
4044
+ """
4045
+ headers = self._prepare_headers()
4046
+ try:
4047
+ return await self._async_get_request(
4048
+ f'{self.api_url}/v1/extract/{job_id}',
4049
+ headers
4050
+ )
4051
+ except Exception as e:
4052
+ raise ValueError(str(e))
4053
+
4054
+ async def async_extract(
4055
+ self,
4056
+ urls: Optional[List[str]] = None,
4057
+ *,
4058
+ prompt: Optional[str] = None,
4059
+ schema: Optional[Any] = None,
4060
+ system_prompt: Optional[str] = None,
4061
+ allow_external_links: Optional[bool] = False,
4062
+ enable_web_search: Optional[bool] = False,
4063
+ show_sources: Optional[bool] = False,
4064
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
4065
+ """
4066
+ Initiate an asynchronous extraction job without waiting for completion.
4067
+
4068
+ Args:
4069
+ urls (Optional[List[str]]): URLs to extract from
4070
+ prompt (Optional[str]): Custom extraction prompt
4071
+ schema (Optional[Any]): JSON schema/Pydantic model
4072
+ system_prompt (Optional[str]): System context
4073
+ allow_external_links (Optional[bool]): Follow external links
4074
+ enable_web_search (Optional[bool]): Enable web search
4075
+ show_sources (Optional[bool]): Include source URLs
4076
+ agent (Optional[Dict[str, Any]]): Agent configuration
4078
+
4079
+ Returns:
4080
+ ExtractResponse[Any] with:
4081
+ * success (bool): Whether request succeeded
4082
+ * data (Optional[Any]): Extracted data matching schema
4083
+ * error (Optional[str]): Error message if any
4084
+
4085
+ Raises:
4086
+ ValueError: If job initiation fails
4087
+ """
4088
+ headers = self._prepare_headers()
4089
+
4090
+ if not prompt and not schema:
4091
+ raise ValueError("Either prompt or schema is required")
4092
+
4093
+ if not urls and not prompt:
4094
+ raise ValueError("Either urls or prompt is required")
4095
+
4096
+ if schema:
4097
+ schema = self._ensure_schema_dict(schema)
4098
+
4099
+ request_data = {
4100
+ 'urls': urls or [],
4101
+ 'allowExternalLinks': allow_external_links,
4102
+ 'enableWebSearch': enable_web_search,
4103
+ 'showSources': show_sources,
4104
+ 'schema': schema,
4105
+ 'origin': f'python-sdk@{version}'
4106
+ }
4107
+
4108
+ if prompt:
4109
+ request_data['prompt'] = prompt
4110
+ if system_prompt:
4111
+ request_data['systemPrompt'] = system_prompt
4112
+ if agent:
4113
+ request_data['agent'] = agent
4114
+
4115
+ try:
4116
+ return await self._async_post_request(
4117
+ f'{self.api_url}/v1/extract',
4118
+ request_data,
4119
+ headers
4120
+ )
4121
+ except Exception as e:
4122
+ raise ValueError(str(e))
4123
+
4124
+ async def generate_llms_text(
4125
+ self,
4126
+ url: str,
4127
+ *,
4128
+ max_urls: Optional[int] = None,
4129
+ show_full_text: Optional[bool] = None,
4130
+ cache: Optional[bool] = None,
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
4131
+ """
4132
+ Generate LLMs.txt for a given URL and monitor until completion.
4133
+
4134
+ Args:
4135
+ url (str): Target URL to generate LLMs.txt from
4136
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
4137
+ show_full_text (Optional[bool]): Include full text in output (default: False)
4138
+ cache (Optional[bool]): Whether to use cached content if available (default: True)
+ experimental_stream (Optional[bool]): Enable experimental streaming
4139
+
4140
+ Returns:
4141
+ GenerateLLMsTextStatusResponse containing:
4142
+ * success (bool): Whether generation completed successfully
4143
+ * status (str): Status of generation (processing/completed/failed)
4144
+ * data (Dict[str, str], optional): Generated text with fields:
4145
+ - llmstxt (str): Generated LLMs.txt content
4146
+ - llmsfulltxt (str, optional): Full version if requested
4147
+ * error (str, optional): Error message if generation failed
4148
+ * expiresAt (str): When the generated data expires
4149
+
4150
+ Raises:
4151
+ Exception: If generation fails
4152
+ """
4153
+ params = {}
4154
+ if max_urls is not None:
4155
+ params['maxUrls'] = max_urls
4156
+ if show_full_text is not None:
4157
+ params['showFullText'] = show_full_text
4158
+ if experimental_stream is not None:
4159
+ params['__experimental_stream'] = experimental_stream
4160
+
4161
+ response = await self.async_generate_llms_text(
4162
+ url,
4163
+ max_urls=max_urls,
4164
+ show_full_text=show_full_text,
4165
+ cache=cache,
4166
+ experimental_stream=experimental_stream
4167
+ )
4168
+ if not response.get('success') or 'id' not in response:
4169
+ return response
4170
+
4171
+ job_id = response['id']
4172
+ while True:
4173
+ status = await self.check_generate_llms_text_status(job_id)
4174
+
4175
+ if status['status'] == 'completed':
4176
+ return status
4177
+ elif status['status'] == 'failed':
4178
+ raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
4179
+ elif status['status'] != 'processing':
4180
+ break
4181
+
4182
+ await asyncio.sleep(2)
4183
+
4184
+ return GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly')
4185
+
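+ # --- Editor's usage sketch (illustration only; not shipped in this module) ---
+ # Generates LLMs.txt and polls until the job finishes; the result is the status
+ # payload described in the docstring above.
+ #
+ #     result = await app.generate_llms_text("https://example.com", max_urls=5, show_full_text=True)
+ #     # result carries status plus data.llmstxt (and data.llmsfulltxt when show_full_text=True)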
4186
+ async def async_generate_llms_text(
4187
+ self,
4188
+ url: str,
4189
+ *,
4190
+ max_urls: Optional[int] = None,
4191
+ show_full_text: Optional[bool] = None,
4192
+ cache: Optional[bool] = None,
4193
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
4194
+ """
4195
+ Initiate an asynchronous LLMs.txt generation job without waiting for completion.
4196
+
4197
+ Args:
4198
+ url (str): Target URL to generate LLMs.txt from
4199
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
4200
+ show_full_text (Optional[bool]): Include full text in output (default: False)
4201
+ cache (Optional[bool]): Whether to use cached content if available (default: True)
4202
+ experimental_stream (Optional[bool]): Enable experimental streaming
4203
+
4204
+ Returns:
4205
+ GenerateLLMsTextResponse containing:
4206
+ * success (bool): Whether job started successfully
4207
+ * id (str): Unique identifier for the job
4208
+ * error (str, optional): Error message if start failed
4209
+
4210
+ Raises:
4211
+ ValueError: If job initiation fails
4212
+ """
4213
4221
+ params = GenerateLLMsTextParams(
4222
+ maxUrls=max_urls,
4223
+ showFullText=show_full_text,
4224
+ cache=cache,
4225
+ __experimental_stream=experimental_stream
4226
+ )
4227
+
4228
+ headers = self._prepare_headers()
4229
+ json_data = {'url': url, **params.dict(exclude_none=True)}
4230
+ json_data['origin'] = f"python-sdk@{version}"
4231
+
4232
+ try:
4233
+ return await self._async_post_request(
4234
+ f'{self.api_url}/v1/llmstxt',
4235
+ json_data,
4236
+ headers
4237
+ )
4238
+ except Exception as e:
4239
+ raise ValueError(str(e))
4240
+
4241
+ async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
4242
+ """
4243
+ Check the status of an asynchronous LLMs.txt generation job.
4244
+
4245
+ Args:
4246
+ id (str): The ID of the generation job
4247
+
4248
+ Returns:
4249
+ GenerateLLMsTextStatusResponse containing:
4250
+ * success (bool): Whether generation completed successfully
4251
+ * status (str): Status of generation (processing/completed/failed)
4252
+ * data (Dict[str, str], optional): Generated text with fields:
4253
+ - llmstxt (str): Generated LLMs.txt content
4254
+ - llmsfulltxt (str, optional): Full version if requested
4255
+ * error (str, optional): Error message if generation failed
4256
+ * expiresAt (str): When the generated data expires
4257
+
4258
+ Raises:
4259
+ ValueError: If status check fails
4260
+ """
4261
+ headers = self._prepare_headers()
4262
+ try:
4263
+ return await self._async_get_request(
4264
+ f'{self.api_url}/v1/llmstxt/{id}',
4265
+ headers
4266
+ )
4267
+ except Exception as e:
4268
+ raise ValueError(str(e))
4269
+
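If you would rather control polling yourself, the async starter and status checker above compose as in this sketch; the 5-second interval is an arbitrary choice and `app` is assumed to be an AsyncFirecrawlApp instance created elsewhere.

import asyncio

async def wait_for_llmstxt(app, url: str) -> dict:
    # `app` is assumed to be an AsyncFirecrawlApp instance.
    started = await app.async_generate_llms_text(url, max_urls=10)
    if not started.get("success"):
        raise RuntimeError(started.get("error", "failed to start LLMs.txt job"))
    while True:
        status = await app.check_generate_llms_text_status(started["id"])
        if status.get("status") in ("completed", "failed"):
            return status
        await asyncio.sleep(5)  # arbitrary polling interval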
4270
+ async def deep_research(
4271
+ self,
4272
+ query: str,
4273
+ *,
4274
+ max_depth: Optional[int] = None,
4275
+ time_limit: Optional[int] = None,
4276
+ max_urls: Optional[int] = None,
4277
+ analysis_prompt: Optional[str] = None,
4278
+ system_prompt: Optional[str] = None,
4279
+ __experimental_stream_steps: Optional[bool] = None,
4280
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
4281
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
4282
+ """
4283
+ Initiates a deep research operation on a given query and polls until completion.
4284
+
4285
+ Args:
4286
+ query (str): Research query or topic to investigate
4287
+ max_depth (Optional[int]): Maximum depth of research exploration
4288
+ time_limit (Optional[int]): Time limit in seconds for research
4289
+ max_urls (Optional[int]): Maximum number of URLs to process
4290
+ analysis_prompt (Optional[str]): Custom prompt for analysis
4291
+ system_prompt (Optional[str]): Custom system prompt
4292
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4293
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
4294
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
4295
+
4296
+ Returns:
4297
+ DeepResearchStatusResponse containing:
4298
+ * success (bool): Whether research completed successfully
4299
+ * status (str): Current state (processing/completed/failed)
4300
+ * error (Optional[str]): Error message if failed
4301
+ * id (str): Unique identifier for the research job
4302
+ * data (Any): Research findings and analysis
4303
+ * sources (List[Dict]): List of discovered sources
4304
+ * activities (List[Dict]): Research progress log
4305
+ * summaries (List[str]): Generated research summaries
4306
+
4307
+ Raises:
4308
+ Exception: If research fails
4309
+ """
4310
+ research_params = {}
4311
+ if max_depth is not None:
4312
+ research_params['maxDepth'] = max_depth
4313
+ if time_limit is not None:
4314
+ research_params['timeLimit'] = time_limit
4315
+ if max_urls is not None:
4316
+ research_params['maxUrls'] = max_urls
4317
+ if analysis_prompt is not None:
4318
+ research_params['analysisPrompt'] = analysis_prompt
4319
+ if system_prompt is not None:
4320
+ research_params['systemPrompt'] = system_prompt
4321
+ if __experimental_stream_steps is not None:
4322
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
4323
+ research_params = DeepResearchParams(**research_params)
4324
+
4325
+ response = await self.async_deep_research(
4326
+ query,
4327
+ max_depth=max_depth,
4328
+ time_limit=time_limit,
4329
+ max_urls=max_urls,
4330
+ analysis_prompt=analysis_prompt,
4331
+ system_prompt=system_prompt
4332
+ )
4333
+ if not response.get('success') or 'id' not in response:
4334
+ return response
4335
+
4336
+ job_id = response['id']
4337
+ last_activity_count = 0
4338
+ last_source_count = 0
4339
+
4340
+ while True:
4341
+ status = await self.check_deep_research_status(job_id)
4342
+
4343
+ if on_activity and 'activities' in status:
4344
+ new_activities = status['activities'][last_activity_count:]
4345
+ for activity in new_activities:
4346
+ on_activity(activity)
4347
+ last_activity_count = len(status['activities'])
4348
+
4349
+ if on_source and 'sources' in status:
4350
+ new_sources = status['sources'][last_source_count:]
4351
+ for source in new_sources:
4352
+ on_source(source)
4353
+ last_source_count = len(status['sources'])
4354
+
4355
+ if status['status'] == 'completed':
4356
+ return status
4357
+ elif status['status'] == 'failed':
4358
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
4359
+ elif status['status'] != 'processing':
4360
+ break
4361
+
4362
+ await asyncio.sleep(2)
4363
+
4364
+ return DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')
4365
+
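A hedged sketch of how the waiting deep_research helper above might be called with an activity callback; AsyncFirecrawlApp as the client class and dict-style access to the result are assumptions, and the key, query, and limits are placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

    def on_activity(activity):
        # Each activity dict carries type/status/message/timestamp/depth.
        print(f"depth {activity.get('depth')}: {activity.get('message')}")

    result = await app.deep_research(
        "How do llms.txt files affect AI crawlers?",  # placeholder query
        max_depth=3,
        time_limit=120,
        on_activity=on_activity,
    )
    if result.get("status") == "completed":
        print(result.get("data"))

asyncio.run(main())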
4366
+ async def async_deep_research(
4367
+ self,
4368
+ query: str,
4369
+ *,
4370
+ max_depth: Optional[int] = None,
4371
+ time_limit: Optional[int] = None,
4372
+ max_urls: Optional[int] = None,
4373
+ analysis_prompt: Optional[str] = None,
4374
+ system_prompt: Optional[str] = None,
4375
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
4376
+ """
4377
+ Initiates an asynchronous deep research operation.
4378
+
4379
+ Args:
4380
+ query (str): Research query or topic to investigate
4381
+ max_depth (Optional[int]): Maximum depth of research exploration
4382
+ time_limit (Optional[int]): Time limit in seconds for research
4383
+ max_urls (Optional[int]): Maximum number of URLs to process
4384
+ analysis_prompt (Optional[str]): Custom prompt for analysis
4385
+ system_prompt (Optional[str]): Custom system prompt
4386
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4387
+
4388
+ Returns:
4389
+ Dict[str, Any]: A response containing:
4390
+ * success (bool): Whether the research initiation was successful
4391
+ * id (str): The unique identifier for the research job
4392
+ * error (str, optional): Error message if initiation failed
4393
+
4394
+ Raises:
4395
+ Exception: If the research initiation fails.
4396
+ """
4397
+ research_params = {}
4398
+ if max_depth is not None:
4399
+ research_params['maxDepth'] = max_depth
4400
+ if time_limit is not None:
4401
+ research_params['timeLimit'] = time_limit
4402
+ if max_urls is not None:
4403
+ research_params['maxUrls'] = max_urls
4404
+ if analysis_prompt is not None:
4405
+ research_params['analysisPrompt'] = analysis_prompt
4406
+ if system_prompt is not None:
4407
+ research_params['systemPrompt'] = system_prompt
4408
+ if __experimental_stream_steps is not None:
4409
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
4410
+ research_params = DeepResearchParams(**research_params)
4411
+
4412
+ headers = self._prepare_headers()
4413
+
4414
+ json_data = {'query': query, **research_params.dict(exclude_none=True)}
4415
+ json_data['origin'] = f"python-sdk@{version}"
4416
+
4417
+ try:
4418
+ return await self._async_post_request(
4419
+ f'{self.api_url}/v1/deep-research',
4420
+ json_data,
4421
+ headers
4422
+ )
4423
+ except Exception as e:
4424
+ raise ValueError(str(e))
4425
+
4426
+ async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
4427
+ """
4428
+ Check the status of a deep research operation.
4429
+
4430
+ Args:
4431
+ id (str): The ID of the deep research operation.
4432
+
4433
+ Returns:
4434
+ DeepResearchStatusResponse containing:
4435
+
4436
+ Status:
4437
+ * success - Whether research completed successfully
4438
+ * status - Current state (processing/completed/failed)
4439
+ * error - Error message if failed
4440
+
4441
+ Results:
4442
+ * id - Unique identifier for the research job
4443
+ * data - Research findings and analysis
4444
+ * sources - List of discovered sources
4445
+ * activities - Research progress log
4446
+ * summaries - Generated research summaries
4447
+
4448
+ Raises:
4449
+ Exception: If the status check fails.
4450
+ """
4451
+ headers = self._prepare_headers()
4452
+ try:
4453
+ return await self._async_get_request(
4454
+ f'{self.api_url}/v1/deep-research/{id}',
4455
+ headers
4456
+ )
4457
+ except Exception as e:
4458
+ raise ValueError(str(e))
4459
+
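For manual control, the async starter and status checker above can be split into two small helpers, sketched here under the same dict-style access assumption; `app` is assumed to be an AsyncFirecrawlApp instance.

async def start_research(app, query: str) -> str:
    # Returns the job id so the caller can check progress later.
    started = await app.async_deep_research(query, max_urls=20)
    if not started.get("success"):
        raise RuntimeError(started.get("error", "failed to start deep research"))
    return started["id"]

async def peek_research(app, job_id: str) -> None:
    status = await app.check_deep_research_status(job_id)
    print(status.get("status"), "-", len(status.get("activities", [])), "activities so far")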
4460
+ async def search(
4461
+ self,
4462
+ query: str,
4463
+ *,
4464
+ limit: Optional[int] = None,
4465
+ tbs: Optional[str] = None,
4466
+ filter: Optional[str] = None,
4467
+ lang: Optional[str] = None,
4468
+ country: Optional[str] = None,
4469
+ location: Optional[str] = None,
4470
+ timeout: Optional[int] = None,
4471
+ scrape_options: Optional[ScrapeOptions] = None,
4472
+ params: Optional[Union[Dict[str, Any], SearchParams]] = None,
4473
+ **kwargs) -> SearchResponse:
4474
+ """
4475
+ Asynchronously search for content using Firecrawl.
4476
+
4477
+ Args:
4478
+ query (str): Search query string
4479
+ limit (Optional[int]): Max results (default: 5)
4480
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
4481
+ filter (Optional[str]): Custom result filter
4482
+ lang (Optional[str]): Language code (default: "en")
4483
+ country (Optional[str]): Country code (default: "us")
4484
+ location (Optional[str]): Geo-targeting
4485
+ timeout (Optional[int]): Request timeout in milliseconds
4486
+ scrape_options (Optional[ScrapeOptions]): Result scraping configuration
4487
+ params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
4488
+ **kwargs: Additional keyword arguments for future compatibility
4489
+
4490
+ Returns:
4491
+ SearchResponse: Response containing:
4492
+ * success (bool): Whether request succeeded
4493
+ * data (List[FirecrawlDocument]): Search results
4494
+ * warning (Optional[str]): Warning message if any
4495
+ * error (Optional[str]): Error message if any
4496
+
4497
+ Raises:
4498
+ Exception: If search fails or response cannot be parsed
4499
+ """
4500
+ # Build search parameters
4501
+ search_params = {}
4502
+ if params:
4503
+ if isinstance(params, dict):
4504
+ search_params.update(params)
4505
+ else:
4506
+ search_params.update(params.dict(exclude_none=True))
4507
+
4508
+ # Add individual parameters
4509
+ if limit is not None:
4510
+ search_params['limit'] = limit
4511
+ if tbs is not None:
4512
+ search_params['tbs'] = tbs
4513
+ if filter is not None:
4514
+ search_params['filter'] = filter
4515
+ if lang is not None:
4516
+ search_params['lang'] = lang
4517
+ if country is not None:
4518
+ search_params['country'] = country
4519
+ if location is not None:
4520
+ search_params['location'] = location
4521
+ if timeout is not None:
4522
+ search_params['timeout'] = timeout
4523
+ if scrape_options is not None:
4524
+ search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
4525
+
4526
+ # Add any additional kwargs
4527
+ search_params.update(kwargs)
4528
+
4529
+ # Create final params object
4530
+ final_params = SearchParams(query=query, **search_params)
4531
+ params_dict = final_params.dict(exclude_none=True)
4532
+ params_dict['origin'] = f"python-sdk@{version}"
4533
+
4534
+ return await self._async_post_request(
4535
+ f"{self.api_url}/v1/search",
4536
+ params_dict,
4537
+ {"Authorization": f"Bearer {self.api_key}"}
4538
+ )
4539
+
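A minimal sketch of the async search call above; the AsyncFirecrawlApp and ScrapeOptions import names, the `formats` field, and dict-style access to the results are assumptions, and the key and query are placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp, ScrapeOptions  # assumed export names

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
    results = await app.search(
        "llms.txt generators",  # placeholder query
        limit=3,
        tbs="qdr:w",  # restrict to the past week
        scrape_options=ScrapeOptions(formats=["markdown"]),  # `formats` field assumed
    )
    for doc in results.get("data", []):
        print(doc.get("url"), "-", doc.get("title"))

asyncio.run(main())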
4540
+ class AsyncCrawlWatcher(CrawlWatcher):
4541
+ """
4542
+ Async variant of CrawlWatcher that connects over a WebSocket and dispatches crawl events from an asyncio context.
4543
+ """
4544
+ def __init__(self, id: str, app: AsyncFirecrawlApp):
4545
+ super().__init__(id, app)
4546
+
4547
+ async def connect(self) -> None:
4548
+ """
4549
+ Establishes async WebSocket connection and starts listening for messages.
4550
+ """
4551
+ async with websockets.connect(
4552
+ self.ws_url,
4553
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
4554
+ ) as websocket:
4555
+ await self._listen(websocket)
4556
+
4557
+ async def _listen(self, websocket) -> None:
4558
+ """
4559
+ Listens for incoming WebSocket messages and handles them asynchronously.
4560
+
4561
+ Args:
4562
+ websocket: The WebSocket connection object
4563
+ """
4564
+ async for message in websocket:
4565
+ msg = json.loads(message)
4566
+ await self._handle_message(msg)
4567
+
4568
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
4569
+ """
4570
+ Handles incoming WebSocket messages based on their type asynchronously.
4571
+
4572
+ Args:
4573
+ msg (Dict[str, Any]): The message to handle
4574
+ """
4575
+ if msg['type'] == 'done':
4576
+ self.status = 'completed'
4577
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
4578
+ elif msg['type'] == 'error':
4579
+ self.status = 'failed'
4580
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
4581
+ elif msg['type'] == 'catchup':
4582
+ self.status = msg['data']['status']
4583
+ self.data.extend(msg['data'].get('data', []))
4584
+ for doc in self.data:
4585
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
4586
+ elif msg['type'] == 'document':
4587
+ self.data.append(msg['data'])
4588
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
4589
+
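A sketch of how the AsyncCrawlWatcher above might be driven, assuming both classes are importable from the firecrawl package, that the base CrawlWatcher provides add_event_listener, and that `crawl_id` was obtained from a previously started crawl job.

import asyncio
from firecrawl import AsyncCrawlWatcher, AsyncFirecrawlApp  # assumed export names

async def watch_crawl(crawl_id: str) -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
    watcher = AsyncCrawlWatcher(crawl_id, app)
    # add_event_listener is assumed to come from the CrawlWatcher base class.
    watcher.add_event_listener("document", lambda detail: print("document for crawl", detail["id"]))
    watcher.add_event_listener("done", lambda detail: print("finished with status", detail["status"]))
    await watcher.connect()  # listens until a done or error message arrives

asyncio.run(watch_crawl("your-crawl-id"))  # placeholder job id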
4590
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
4591
+ """
4592
+ Handle errors from async API responses.
4593
+ """
4594
+ try:
4595
+ error_data = await response.json()
4596
+ error_message = error_data.get('error', 'No error message provided.')
4597
+ error_details = error_data.get('details', 'No additional error details provided.')
4598
+ except Exception:
4599
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
4600
+
4601
+ # Use the app's method to get the error message
4602
+ message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
4603
+
4604
+ raise aiohttp.ClientError(message)
4605
+
4606
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
4607
+ """
4608
+ Generate a standardized error message based on HTTP status code for async operations.
4609
+
4610
+ Args:
4611
+ status_code (int): The HTTP status code from the response
4612
+ action (str): Description of the action that was being performed
4613
+ error_message (str): The error message from the API response
4614
+ error_details (str): Additional error details from the API response
4615
+
4616
+ Returns:
4617
+ str: A formatted error message
4618
+ """
4619
+ return self._get_error_message(status_code, action, error_message, error_details)