firecrawl-py 2.4.0__py3-none-any.whl → 2.4.2__py3-none-any.whl

This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.

Potentially problematic release.



@@ -0,0 +1,4372 @@
1
+ """
2
+ FirecrawlApp Module
3
+
4
+ This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
5
+ It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
6
+ and check the status of these jobs. The module uses requests for HTTP communication
7
+ and handles retries for certain HTTP status codes.
8
+
9
+ Classes:
10
+ - FirecrawlApp: Main class for interacting with the Firecrawl API.
11
+ """
12
+ import logging
13
+ import os
14
+ import time
15
+ from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
16
+ import json
17
+ from datetime import datetime
18
+ import re
19
+ import warnings
20
+ import requests
21
+ import pydantic
22
+ import websockets
23
+ import aiohttp
24
+ import asyncio
25
+ from pydantic import Field
26
+
27
+ # Suppress Pydantic warnings about attribute shadowing
28
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
29
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
30
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
31
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
32
+
33
+
34
+ def get_version():
35
+ try:
36
+ from pathlib import Path
37
+ package_path = os.path.dirname(__file__)
38
+ version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
39
+ version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
40
+ if version_match:
41
+ return version_match.group(1).strip()
42
+ except Exception:
43
+ print("Failed to get version from __init__.py")
44
+ return None
45
+
46
+ version = get_version()
47
+
48
+ logger : logging.Logger = logging.getLogger("firecrawl")
49
+
50
+ T = TypeVar('T')
51
+
52
+ # class FirecrawlDocumentMetadata(pydantic.BaseModel):
53
+ # """Metadata for a Firecrawl document."""
54
+ # title: Optional[str] = None
55
+ # description: Optional[str] = None
56
+ # language: Optional[str] = None
57
+ # keywords: Optional[str] = None
58
+ # robots: Optional[str] = None
59
+ # ogTitle: Optional[str] = None
60
+ # ogDescription: Optional[str] = None
61
+ # ogUrl: Optional[str] = None
62
+ # ogImage: Optional[str] = None
63
+ # ogAudio: Optional[str] = None
64
+ # ogDeterminer: Optional[str] = None
65
+ # ogLocale: Optional[str] = None
66
+ # ogLocaleAlternate: Optional[List[str]] = None
67
+ # ogSiteName: Optional[str] = None
68
+ # ogVideo: Optional[str] = None
69
+ # dctermsCreated: Optional[str] = None
70
+ # dcDateCreated: Optional[str] = None
71
+ # dcDate: Optional[str] = None
72
+ # dctermsType: Optional[str] = None
73
+ # dcType: Optional[str] = None
74
+ # dctermsAudience: Optional[str] = None
75
+ # dctermsSubject: Optional[str] = None
76
+ # dcSubject: Optional[str] = None
77
+ # dcDescription: Optional[str] = None
78
+ # dctermsKeywords: Optional[str] = None
79
+ # modifiedTime: Optional[str] = None
80
+ # publishedTime: Optional[str] = None
81
+ # articleTag: Optional[str] = None
82
+ # articleSection: Optional[str] = None
83
+ # sourceURL: Optional[str] = None
84
+ # statusCode: Optional[int] = None
85
+ # error: Optional[str] = None
86
+
87
+ class AgentOptions(pydantic.BaseModel):
88
+ """Configuration for the agent."""
89
+ model: Literal["FIRE-1"] = "FIRE-1"
90
+ prompt: Optional[str] = None
91
+
92
+ class AgentOptionsExtract(pydantic.BaseModel):
93
+ """Configuration for the agent in extract operations."""
94
+ model: Literal["FIRE-1"] = "FIRE-1"
95
+
96
+ class ActionsResult(pydantic.BaseModel):
97
+ """Result of actions performed during scraping."""
98
+ screenshots: List[str]
99
+
100
+ class ChangeTrackingData(pydantic.BaseModel):
101
+ """
102
+ Data for the change tracking format.
103
+ """
104
+ previousScrapeAt: Optional[str] = None
105
+ changeStatus: str # "new" | "same" | "changed" | "removed"
106
+ visibility: str # "visible" | "hidden"
107
+ diff: Optional[Dict[str, Any]] = None
108
+ json: Optional[Any] = None
109
+
110
+ class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
111
+ """Document retrieved or processed by Firecrawl."""
112
+ url: Optional[str] = None
113
+ markdown: Optional[str] = None
114
+ html: Optional[str] = None
115
+ rawHtml: Optional[str] = None
116
+ links: Optional[List[str]] = None
117
+ extract: Optional[T] = None
118
+ json: Optional[T] = None
119
+ screenshot: Optional[str] = None
120
+ metadata: Optional[Any] = None
121
+ actions: Optional[ActionsResult] = None
122
+ title: Optional[str] = None # v1 search only
123
+ description: Optional[str] = None # v1 search only
124
+ changeTracking: Optional[ChangeTrackingData] = None
125
+
126
+ class LocationConfig(pydantic.BaseModel):
127
+ """Location configuration for scraping."""
128
+ country: Optional[str] = None
129
+ languages: Optional[List[str]] = None
130
+
131
+ class WebhookConfig(pydantic.BaseModel):
132
+ """Configuration for webhooks."""
133
+ url: str
134
+ headers: Optional[Dict[str, str]] = None
135
+ metadata: Optional[Dict[str, str]] = None
136
+ events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
137
+
138
+ class ScrapeOptions(pydantic.BaseModel):
139
+ """Parameters for scraping operations."""
140
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
141
+ headers: Optional[Dict[str, str]] = None
142
+ includeTags: Optional[List[str]] = None
143
+ excludeTags: Optional[List[str]] = None
144
+ onlyMainContent: Optional[bool] = None
145
+ waitFor: Optional[int] = None
146
+ timeout: Optional[int] = None
147
+ location: Optional[LocationConfig] = None
148
+ mobile: Optional[bool] = None
149
+ skipTlsVerification: Optional[bool] = None
150
+ removeBase64Images: Optional[bool] = None
151
+ blockAds: Optional[bool] = None
152
+ proxy: Optional[Literal["basic", "stealth"]] = None
153
+
154
+ class WaitAction(pydantic.BaseModel):
155
+ """Wait action to perform during scraping."""
156
+ type: Literal["wait"]
157
+ milliseconds: int
158
+ selector: Optional[str] = None
159
+
160
+ class ScreenshotAction(pydantic.BaseModel):
161
+ """Screenshot action to perform during scraping."""
162
+ type: Literal["screenshot"]
163
+ fullPage: Optional[bool] = None
164
+
165
+ class ClickAction(pydantic.BaseModel):
166
+ """Click action to perform during scraping."""
167
+ type: Literal["click"]
168
+ selector: str
169
+
170
+ class WriteAction(pydantic.BaseModel):
171
+ """Write action to perform during scraping."""
172
+ type: Literal["write"]
173
+ text: str
174
+
175
+ class PressAction(pydantic.BaseModel):
176
+ """Press action to perform during scraping."""
177
+ type: Literal["press"]
178
+ key: str
179
+
180
+ class ScrollAction(pydantic.BaseModel):
181
+ """Scroll action to perform during scraping."""
182
+ type: Literal["scroll"]
183
+ direction: Literal["up", "down"]
184
+ selector: Optional[str] = None
185
+
186
+ class ScrapeAction(pydantic.BaseModel):
187
+ """Scrape action to perform during scraping."""
188
+ type: Literal["scrape"]
189
+
190
+ class ExecuteJavascriptAction(pydantic.BaseModel):
191
+ """Execute javascript action to perform during scraping."""
192
+ type: Literal["executeJavascript"]
193
+ script: str
194
+
195
+
196
+ class ExtractAgent(pydantic.BaseModel):
197
+ """Configuration for the agent in extract operations."""
198
+ model: Literal["FIRE-1"] = "FIRE-1"
199
+
200
+ class JsonConfig(pydantic.BaseModel):
201
+ """Configuration for extraction."""
202
+ prompt: Optional[str] = None
203
+ schema: Optional[Any] = None
204
+ systemPrompt: Optional[str] = None
205
+ agent: Optional[ExtractAgent] = None
206
+
207
+ class ScrapeParams(ScrapeOptions):
208
+ """Parameters for scraping operations."""
209
+ extract: Optional[JsonConfig] = None
210
+ jsonOptions: Optional[JsonConfig] = None
211
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
212
+ agent: Optional[AgentOptions] = None
213
+ webhook: Optional[WebhookConfig] = None
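
The request models above use camelCase field names (onlyMainContent, waitFor, jsonOptions) that mirror the API payload, while the FirecrawlApp methods further down accept snake_case keyword arguments and build these models internally. A sketch of constructing ScrapeParams directly, with illustrative values only:

params = ScrapeParams(
    formats=["markdown", "links"],
    onlyMainContent=True,
    waitFor=1000,
    actions=[
        WaitAction(type="wait", milliseconds=2000),
        ScreenshotAction(type="screenshot", fullPage=True),
    ],
    jsonOptions=JsonConfig(prompt="Summarise the page as JSON"),
)
payload = params.dict(exclude_none=True)  # dict form used for the request body
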
214
+
215
+ class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
216
+ """Response from scraping operations."""
217
+ success: bool = True
218
+ warning: Optional[str] = None
219
+ error: Optional[str] = None
220
+
221
+ class BatchScrapeResponse(pydantic.BaseModel):
222
+ """Response from batch scrape operations."""
223
+ id: Optional[str] = None
224
+ url: Optional[str] = None
225
+ success: bool = True
226
+ error: Optional[str] = None
227
+ invalidURLs: Optional[List[str]] = None
228
+
229
+ class BatchScrapeStatusResponse(pydantic.BaseModel):
230
+ """Response from batch scrape status checks."""
231
+ success: bool = True
232
+ status: Literal["scraping", "completed", "failed", "cancelled"]
233
+ completed: int
234
+ total: int
235
+ creditsUsed: int
236
+ expiresAt: datetime
237
+ next: Optional[str] = None
238
+ data: List[FirecrawlDocument]
239
+
240
+ class CrawlParams(pydantic.BaseModel):
241
+ """Parameters for crawling operations."""
242
+ includePaths: Optional[List[str]] = None
243
+ excludePaths: Optional[List[str]] = None
244
+ maxDepth: Optional[int] = None
245
+ maxDiscoveryDepth: Optional[int] = None
246
+ limit: Optional[int] = None
247
+ allowBackwardLinks: Optional[bool] = None
248
+ allowExternalLinks: Optional[bool] = None
249
+ ignoreSitemap: Optional[bool] = None
250
+ scrapeOptions: Optional[ScrapeOptions] = None
251
+ webhook: Optional[Union[str, WebhookConfig]] = None
252
+ deduplicateSimilarURLs: Optional[bool] = None
253
+ ignoreQueryParameters: Optional[bool] = None
254
+ regexOnFullURL: Optional[bool] = None
255
+
256
+ class CrawlResponse(pydantic.BaseModel):
257
+ """Response from crawling operations."""
258
+ id: Optional[str] = None
259
+ url: Optional[str] = None
260
+ success: bool = True
261
+ error: Optional[str] = None
262
+
263
+ class CrawlStatusResponse(pydantic.BaseModel):
264
+ """Response from crawl status checks."""
265
+ success: bool = True
266
+ status: Literal["scraping", "completed", "failed", "cancelled"]
267
+ completed: int
268
+ total: int
269
+ creditsUsed: int
270
+ expiresAt: datetime
271
+ next: Optional[str] = None
272
+ data: List[FirecrawlDocument]
273
+
274
+ class CrawlErrorsResponse(pydantic.BaseModel):
275
+ """Response from crawl/batch scrape error monitoring."""
276
+ errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str}
277
+ robotsBlocked: List[str]
278
+
279
+ class MapParams(pydantic.BaseModel):
280
+ """Parameters for mapping operations."""
281
+ search: Optional[str] = None
282
+ ignoreSitemap: Optional[bool] = None
283
+ includeSubdomains: Optional[bool] = None
284
+ sitemapOnly: Optional[bool] = None
285
+ limit: Optional[int] = None
286
+ timeout: Optional[int] = None
287
+
288
+ class MapResponse(pydantic.BaseModel):
289
+ """Response from mapping operations."""
290
+ success: bool = True
291
+ links: Optional[List[str]] = None
292
+ error: Optional[str] = None
293
+
294
+ class ExtractParams(pydantic.BaseModel):
295
+ """Parameters for extracting information from URLs."""
296
+ prompt: Optional[str] = None
297
+ schema: Optional[Any] = None
298
+ systemPrompt: Optional[str] = None
299
+ allowExternalLinks: Optional[bool] = None
300
+ enableWebSearch: Optional[bool] = None
301
+ includeSubdomains: Optional[bool] = None
302
+ origin: Optional[str] = None
303
+ showSources: Optional[bool] = None
304
+ scrapeOptions: Optional[ScrapeOptions] = None
305
+
306
+ class ExtractResponse(pydantic.BaseModel, Generic[T]):
307
+ """Response from extract operations."""
308
+ id: Optional[str] = None
309
+ status: Optional[Literal["processing", "completed", "failed"]] = None
310
+ expiresAt: Optional[datetime] = None
311
+ success: bool = True
312
+ data: Optional[T] = None
313
+ error: Optional[str] = None
314
+ warning: Optional[str] = None
315
+ sources: Optional[List[str]] = None
316
+
317
+ class SearchParams(pydantic.BaseModel):
318
+ query: str
319
+ limit: Optional[int] = 5
320
+ tbs: Optional[str] = None
321
+ filter: Optional[str] = None
322
+ lang: Optional[str] = "en"
323
+ country: Optional[str] = "us"
324
+ location: Optional[str] = None
325
+ origin: Optional[str] = "api"
326
+ timeout: Optional[int] = 60000
327
+ scrapeOptions: Optional[ScrapeOptions] = None
328
+
329
+ class SearchResponse(pydantic.BaseModel):
330
+ """Response from search operations."""
331
+ success: bool = True
332
+ data: List[FirecrawlDocument]
333
+ warning: Optional[str] = None
334
+ error: Optional[str] = None
335
+
336
+ class GenerateLLMsTextParams(pydantic.BaseModel):
337
+ """
338
+ Parameters for the LLMs.txt generation operation.
339
+ """
340
+ maxUrls: Optional[int] = 10
341
+ showFullText: Optional[bool] = False
342
+ __experimental_stream: Optional[bool] = None
343
+
344
+ class DeepResearchParams(pydantic.BaseModel):
345
+ """
346
+ Parameters for the deep research operation.
347
+ """
348
+ maxDepth: Optional[int] = 7
349
+ timeLimit: Optional[int] = 270
350
+ maxUrls: Optional[int] = 20
351
+ analysisPrompt: Optional[str] = None
352
+ systemPrompt: Optional[str] = None
353
+ __experimental_streamSteps: Optional[bool] = None
354
+
355
+ class DeepResearchResponse(pydantic.BaseModel):
356
+ """
357
+ Response from the deep research operation.
358
+ """
359
+ success: bool
360
+ id: str
361
+ error: Optional[str] = None
362
+
363
+ class DeepResearchStatusResponse(pydantic.BaseModel):
364
+ """
365
+ Status response from the deep research operation.
366
+ """
367
+ success: bool
368
+ data: Optional[Dict[str, Any]] = None
369
+ status: str
370
+ error: Optional[str] = None
371
+ expiresAt: str
372
+ currentDepth: int
373
+ maxDepth: int
374
+ activities: List[Dict[str, Any]]
375
+ sources: List[Dict[str, Any]]
376
+ summaries: List[str]
377
+
378
+ class GenerateLLMsTextResponse(pydantic.BaseModel):
379
+ """Response from LLMs.txt generation operations."""
380
+ success: bool = True
381
+ id: str
382
+ error: Optional[str] = None
383
+
384
+ class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
385
+ llmstxt: str
386
+ llmsfulltxt: Optional[str] = None
387
+
388
+ class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
389
+ """Status response from LLMs.txt generation operations."""
390
+ success: bool = True
391
+ data: Optional[GenerateLLMsTextStatusResponseData] = None
392
+ status: Literal["processing", "completed", "failed"]
393
+ error: Optional[str] = None
394
+ expiresAt: str
395
+
396
+ class SearchResponse(pydantic.BaseModel):
397
+ """
398
+ Response from the search operation.
399
+ """
400
+ success: bool
401
+ data: List[Dict[str, Any]]
402
+ warning: Optional[str] = None
403
+ error: Optional[str] = None
404
+
405
+ class ExtractParams(pydantic.BaseModel):
406
+ """
407
+ Parameters for the extract operation.
408
+ """
409
+ prompt: Optional[str] = None
410
+ schema: Optional[Any] = pydantic.Field(None, alias='schema')
411
+ system_prompt: Optional[str] = None
412
+ allow_external_links: Optional[bool] = False
413
+ enable_web_search: Optional[bool] = False
414
+ # Just for backwards compatibility
415
+ enableWebSearch: Optional[bool] = False
416
+ show_sources: Optional[bool] = False
417
+ agent: Optional[Dict[str, Any]] = None
418
+
419
+ class FirecrawlApp:
420
+ def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
421
+ """
422
+ Initialize the FirecrawlApp instance with API key, API URL.
423
+
424
+ Args:
425
+ api_key (Optional[str]): API key for authenticating with the Firecrawl API.
426
+ api_url (Optional[str]): Base URL for the Firecrawl API.
427
+ """
428
+ self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
429
+ self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
430
+
431
+ # Only require API key when using cloud service
432
+ if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
433
+ logger.warning("No API key provided for cloud service")
434
+ raise ValueError('No API key provided')
435
+
436
+ logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
437
+
438
+ def scrape_url(
439
+ self,
440
+ url: str,
441
+ *,
442
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
443
+ include_tags: Optional[List[str]] = None,
444
+ exclude_tags: Optional[List[str]] = None,
445
+ only_main_content: Optional[bool] = None,
446
+ wait_for: Optional[int] = None,
447
+ timeout: Optional[int] = None,
448
+ location: Optional[LocationConfig] = None,
449
+ mobile: Optional[bool] = None,
450
+ skip_tls_verification: Optional[bool] = None,
451
+ remove_base64_images: Optional[bool] = None,
452
+ block_ads: Optional[bool] = None,
453
+ proxy: Optional[Literal["basic", "stealth"]] = None,
454
+ extract: Optional[JsonConfig] = None,
455
+ json_options: Optional[JsonConfig] = None,
456
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
457
+ **kwargs) -> ScrapeResponse[Any]:
458
+ """
459
+ Scrape and extract content from a URL.
460
+
461
+ Args:
462
+ url (str): Target URL to scrape
463
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
464
+ include_tags (Optional[List[str]]): HTML tags to include
465
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
466
+ only_main_content (Optional[bool]): Extract main content only
467
+ wait_for (Optional[int]): Wait for a specific element to appear
468
+ timeout (Optional[int]): Request timeout (ms)
469
+ location (Optional[LocationConfig]): Location configuration
470
+ mobile (Optional[bool]): Use mobile user agent
471
+ skip_tls_verification (Optional[bool]): Skip TLS verification
472
+ remove_base64_images (Optional[bool]): Remove base64 images
473
+ block_ads (Optional[bool]): Block ads
474
+ proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
475
+ extract (Optional[JsonConfig]): Content extraction settings
476
+ json_options (Optional[JsonConfig]): JSON extraction settings
477
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
478
+
479
+
480
+ Returns:
481
+ ScrapeResponse with:
482
+ * Requested content formats
483
+ * Page metadata
484
+ * Extraction results
485
+ * Success/error status
486
+
487
+ Raises:
488
+ Exception: If scraping fails
489
+ """
490
+ headers = self._prepare_headers()
491
+
492
+ # Build scrape parameters
493
+ scrape_params = {
494
+ 'url': url,
495
+ 'origin': f"python-sdk@{version}"
496
+ }
497
+
498
+ # Add optional parameters if provided
499
+ if formats:
500
+ scrape_params['formats'] = formats
501
+ if include_tags:
502
+ scrape_params['includeTags'] = include_tags
503
+ if exclude_tags:
504
+ scrape_params['excludeTags'] = exclude_tags
505
+ if only_main_content is not None:
506
+ scrape_params['onlyMainContent'] = only_main_content
507
+ if wait_for:
508
+ scrape_params['waitFor'] = wait_for
509
+ if timeout:
510
+ scrape_params['timeout'] = timeout
511
+ if location:
512
+ scrape_params['location'] = location.dict(exclude_none=True)
513
+ if mobile is not None:
514
+ scrape_params['mobile'] = mobile
515
+ if skip_tls_verification is not None:
516
+ scrape_params['skipTlsVerification'] = skip_tls_verification
517
+ if remove_base64_images is not None:
518
+ scrape_params['removeBase64Images'] = remove_base64_images
519
+ if block_ads is not None:
520
+ scrape_params['blockAds'] = block_ads
521
+ if proxy:
522
+ scrape_params['proxy'] = proxy
523
+ if extract:
524
+ if hasattr(extract.schema, 'schema'):
525
+ extract.schema = extract.schema.schema()
526
+ scrape_params['extract'] = extract.dict(exclude_none=True)
527
+ if json_options:
528
+ if hasattr(json_options.schema, 'schema'):
529
+ json_options.schema = json_options.schema.schema()
530
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
531
+ if actions:
532
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
533
+ scrape_params.update(kwargs)
534
+
535
+ # Make request
536
+ response = requests.post(
537
+ f'{self.api_url}/v1/scrape',
538
+ headers=headers,
539
+ json=scrape_params,
540
+ timeout=(timeout + 5000 if timeout else None)
541
+ )
542
+
543
+ if response.status_code == 200:
544
+ try:
545
+ response_json = response.json()
546
+ if response_json.get('success') and 'data' in response_json:
547
+ return ScrapeResponse(**response_json['data'])
548
+ elif "error" in response_json:
549
+ raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
550
+ else:
551
+ raise Exception(f'Failed to scrape URL. Error: {response_json}')
552
+ except ValueError:
553
+ raise Exception('Failed to parse Firecrawl response as JSON.')
554
+ else:
555
+ self._handle_error(response, 'scrape URL')
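
A sketch of scrape_url with structured extraction, assuming app is a FirecrawlApp instance as above; Article is a hypothetical Pydantic model and the URL is a placeholder. Passing the model class works because the method converts it with .schema() before sending, as shown above.

from pydantic import BaseModel

class Article(BaseModel):
    title: str
    author: str

doc = app.scrape_url(
    "https://example.com/post",
    formats=["markdown", "json"],
    only_main_content=True,
    json_options=JsonConfig(prompt="Extract the title and author", schema=Article),
)
print(doc.json)      # structured extraction result
print(doc.markdown)  # markdown rendering of the page
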
556
+
557
+ def search(
558
+ self,
559
+ query: str,
560
+ *,
561
+ limit: Optional[int] = None,
562
+ tbs: Optional[str] = None,
563
+ filter: Optional[str] = None,
564
+ lang: Optional[str] = None,
565
+ country: Optional[str] = None,
566
+ location: Optional[str] = None,
567
+ timeout: Optional[int] = None,
568
+ scrape_options: Optional[ScrapeOptions] = None,
569
+ **kwargs) -> SearchResponse:
570
+ """
571
+ Search for content using Firecrawl.
572
+
573
+ Args:
574
+ query (str): Search query string
575
+ limit (Optional[int]): Max results (default: 5)
576
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
577
+ filter (Optional[str]): Custom result filter
578
+ lang (Optional[str]): Language code (default: "en")
579
+ country (Optional[str]): Country code (default: "us")
580
+ location (Optional[str]): Geo-targeting
581
+ timeout (Optional[int]): Request timeout in milliseconds
582
+ scrape_options (Optional[ScrapeOptions]): Result scraping configuration
583
+ **kwargs: Additional keyword arguments for future compatibility
584
+
585
+ Returns:
586
+ SearchResponse: Response containing:
587
+ * success (bool): Whether request succeeded
588
+ * data (List[FirecrawlDocument]): Search results
589
+ * warning (Optional[str]): Warning message if any
590
+ * error (Optional[str]): Error message if any
591
+
592
+ Raises:
593
+ Exception: If search fails or response cannot be parsed
594
+ """
595
+ # Validate any additional kwargs
596
+ self._validate_kwargs(kwargs, "search")
597
+
598
+ # Build search parameters
599
+ search_params = {}
600
+
601
+ # Add individual parameters
602
+ if limit is not None:
603
+ search_params['limit'] = limit
604
+ if tbs is not None:
605
+ search_params['tbs'] = tbs
606
+ if filter is not None:
607
+ search_params['filter'] = filter
608
+ if lang is not None:
609
+ search_params['lang'] = lang
610
+ if country is not None:
611
+ search_params['country'] = country
612
+ if location is not None:
613
+ search_params['location'] = location
614
+ if timeout is not None:
615
+ search_params['timeout'] = timeout
616
+ if scrape_options is not None:
617
+ search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
618
+
619
+ # Add any additional kwargs
620
+ search_params.update(kwargs)
621
+
622
+ # Create final params object
623
+ final_params = SearchParams(query=query, **search_params)
624
+ params_dict = final_params.dict(exclude_none=True)
625
+ params_dict['origin'] = f"python-sdk@{version}"
626
+
627
+ # Make request
628
+ response = requests.post(
629
+ f"{self.api_url}/v1/search",
630
+ headers={"Authorization": f"Bearer {self.api_key}"},
631
+ json=params_dict
632
+ )
633
+
634
+ if response.status_code == 200:
635
+ try:
636
+ response_json = response.json()
637
+ if response_json.get('success') and 'data' in response_json:
638
+ return SearchResponse(**response_json)
639
+ elif "error" in response_json:
640
+ raise Exception(f'Search failed. Error: {response_json["error"]}')
641
+ else:
642
+ raise Exception(f'Search failed. Error: {response_json}')
643
+ except ValueError:
644
+ raise Exception('Failed to parse Firecrawl response as JSON.')
645
+ else:
646
+ self._handle_error(response, 'search')
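
A search sketch with per-result scraping enabled, assuming app as above; the query and limit are illustrative.

results = app.search(
    "firecrawl python sdk",
    limit=3,
    scrape_options=ScrapeOptions(formats=["markdown"]),
)
if results.success:
    for item in results.data:
        print(item)  # data entries are plain dicts (see SearchResponse above)
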
647
+
648
+ def crawl_url(
649
+ self,
650
+ url: str,
651
+ *,
652
+ include_paths: Optional[List[str]] = None,
653
+ exclude_paths: Optional[List[str]] = None,
654
+ max_depth: Optional[int] = None,
655
+ max_discovery_depth: Optional[int] = None,
656
+ limit: Optional[int] = None,
657
+ allow_backward_links: Optional[bool] = None,
658
+ allow_external_links: Optional[bool] = None,
659
+ ignore_sitemap: Optional[bool] = None,
660
+ scrape_options: Optional[ScrapeOptions] = None,
661
+ webhook: Optional[Union[str, WebhookConfig]] = None,
662
+ deduplicate_similar_urls: Optional[bool] = None,
663
+ ignore_query_parameters: Optional[bool] = None,
664
+ regex_on_full_url: Optional[bool] = None,
665
+ poll_interval: Optional[int] = 2,
666
+ idempotency_key: Optional[str] = None,
667
+ **kwargs
668
+ ) -> CrawlStatusResponse:
669
+ """
670
+ Crawl a website starting from a URL.
671
+
672
+ Args:
673
+ url (str): Target URL to start crawling from
674
+ include_paths (Optional[List[str]]): Patterns of URLs to include
675
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
676
+ max_depth (Optional[int]): Maximum crawl depth
677
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
678
+ limit (Optional[int]): Maximum pages to crawl
679
+ allow_backward_links (Optional[bool]): Follow parent directory links
680
+ allow_external_links (Optional[bool]): Follow external domain links
681
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
682
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
683
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
684
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
685
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
686
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
687
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
688
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
689
+ **kwargs: Additional parameters to pass to the API
690
+
691
+ Returns:
692
+ CrawlStatusResponse with:
693
+ * Crawling status and progress
694
+ * Crawled page contents
695
+ * Success/error information
696
+
697
+ Raises:
698
+ Exception: If crawl fails
699
+ """
700
+ # Validate any additional kwargs
701
+ self._validate_kwargs(kwargs, "crawl_url")
702
+
703
+ crawl_params = {}
704
+
705
+ # Add individual parameters
706
+ if include_paths is not None:
707
+ crawl_params['includePaths'] = include_paths
708
+ if exclude_paths is not None:
709
+ crawl_params['excludePaths'] = exclude_paths
710
+ if max_depth is not None:
711
+ crawl_params['maxDepth'] = max_depth
712
+ if max_discovery_depth is not None:
713
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
714
+ if limit is not None:
715
+ crawl_params['limit'] = limit
716
+ if allow_backward_links is not None:
717
+ crawl_params['allowBackwardLinks'] = allow_backward_links
718
+ if allow_external_links is not None:
719
+ crawl_params['allowExternalLinks'] = allow_external_links
720
+ if ignore_sitemap is not None:
721
+ crawl_params['ignoreSitemap'] = ignore_sitemap
722
+ if scrape_options is not None:
723
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
724
+ if webhook is not None:
725
+ crawl_params['webhook'] = webhook
726
+ if deduplicate_similar_urls is not None:
727
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
728
+ if ignore_query_parameters is not None:
729
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
730
+ if regex_on_full_url is not None:
731
+ crawl_params['regexOnFullURL'] = regex_on_full_url
732
+
733
+ # Add any additional kwargs
734
+ crawl_params.update(kwargs)
735
+
736
+ # Create final params object
737
+ final_params = CrawlParams(**crawl_params)
738
+ params_dict = final_params.dict(exclude_none=True)
739
+ params_dict['url'] = url
740
+ params_dict['origin'] = f"python-sdk@{version}"
741
+
742
+ # Make request
743
+ headers = self._prepare_headers(idempotency_key)
744
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
745
+
746
+ if response.status_code == 200:
747
+ try:
748
+ id = response.json().get('id')
749
+ except:
750
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
751
+ return self._monitor_job_status(id, headers, poll_interval)
752
+ else:
753
+ self._handle_error(response, 'start crawl job')
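
A synchronous crawl sketch, assuming app as above; crawl_url blocks and polls internally (poll_interval seconds between checks) until the job finishes. The URL and path pattern are placeholders.

status = app.crawl_url(
    "https://docs.example.com",
    include_paths=["/blog/.*"],
    limit=10,
    scrape_options=ScrapeOptions(formats=["markdown"]),
    poll_interval=2,
)
print(status.status, status.completed, "/", status.total)
for document in status.data:
    print(document.metadata)
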
754
+
755
+ def async_crawl_url(
756
+ self,
757
+ url: str,
758
+ *,
759
+ include_paths: Optional[List[str]] = None,
760
+ exclude_paths: Optional[List[str]] = None,
761
+ max_depth: Optional[int] = None,
762
+ max_discovery_depth: Optional[int] = None,
763
+ limit: Optional[int] = None,
764
+ allow_backward_links: Optional[bool] = None,
765
+ allow_external_links: Optional[bool] = None,
766
+ ignore_sitemap: Optional[bool] = None,
767
+ scrape_options: Optional[ScrapeOptions] = None,
768
+ webhook: Optional[Union[str, WebhookConfig]] = None,
769
+ deduplicate_similar_urls: Optional[bool] = None,
770
+ ignore_query_parameters: Optional[bool] = None,
771
+ regex_on_full_url: Optional[bool] = None,
772
+ idempotency_key: Optional[str] = None,
773
+ **kwargs
774
+ ) -> CrawlResponse:
775
+ """
776
+ Start an asynchronous crawl job.
777
+
778
+ Args:
779
+ url (str): Target URL to start crawling from
780
+ include_paths (Optional[List[str]]): Patterns of URLs to include
781
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
782
+ max_depth (Optional[int]): Maximum crawl depth
783
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
784
+ limit (Optional[int]): Maximum pages to crawl
785
+ allow_backward_links (Optional[bool]): Follow parent directory links
786
+ allow_external_links (Optional[bool]): Follow external domain links
787
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
788
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
789
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
790
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
791
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
792
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
793
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
794
+ **kwargs: Additional parameters to pass to the API
795
+
796
+ Returns:
797
+ CrawlResponse with:
798
+ * success - Whether crawl started successfully
799
+ * id - Unique identifier for the crawl job
800
+ * url - Status check URL for the crawl
801
+ * error - Error message if start failed
802
+
803
+ Raises:
804
+ Exception: If crawl initiation fails
805
+ """
806
+ # Validate any additional kwargs
807
+ self._validate_kwargs(kwargs, "async_crawl_url")
808
+
809
+ crawl_params = {}
810
+
811
+ # Add individual parameters
812
+ if include_paths is not None:
813
+ crawl_params['includePaths'] = include_paths
814
+ if exclude_paths is not None:
815
+ crawl_params['excludePaths'] = exclude_paths
816
+ if max_depth is not None:
817
+ crawl_params['maxDepth'] = max_depth
818
+ if max_discovery_depth is not None:
819
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
820
+ if limit is not None:
821
+ crawl_params['limit'] = limit
822
+ if allow_backward_links is not None:
823
+ crawl_params['allowBackwardLinks'] = allow_backward_links
824
+ if allow_external_links is not None:
825
+ crawl_params['allowExternalLinks'] = allow_external_links
826
+ if ignore_sitemap is not None:
827
+ crawl_params['ignoreSitemap'] = ignore_sitemap
828
+ if scrape_options is not None:
829
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
830
+ if webhook is not None:
831
+ crawl_params['webhook'] = webhook
832
+ if deduplicate_similar_urls is not None:
833
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
834
+ if ignore_query_parameters is not None:
835
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
836
+ if regex_on_full_url is not None:
837
+ crawl_params['regexOnFullURL'] = regex_on_full_url
838
+
839
+ # Add any additional kwargs
840
+ crawl_params.update(kwargs)
841
+
842
+ # Create final params object
843
+ final_params = CrawlParams(**crawl_params)
844
+ params_dict = final_params.dict(exclude_none=True)
845
+ params_dict['url'] = url
846
+ params_dict['origin'] = f"python-sdk@{version}"
847
+
848
+ # Make request
849
+ headers = self._prepare_headers(idempotency_key)
850
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
851
+
852
+ if response.status_code == 200:
853
+ try:
854
+ return CrawlResponse(**response.json())
855
+ except:
856
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
857
+ else:
858
+ self._handle_error(response, 'start crawl job')
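
For long crawls, the asynchronous variant returns immediately with a job id that can be polled via check_crawl_status (defined below); a sketch, assuming app as above:

import time

job = app.async_crawl_url("https://example.com", limit=25)
while True:
    status = app.check_crawl_status(job.id)
    if status.status in ("completed", "failed", "cancelled"):
        break
    time.sleep(2)
print(status.completed, "pages crawled")
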
859
+
860
+ def check_crawl_status(self, id: str) -> CrawlStatusResponse:
861
+ """
862
+ Check the status and results of a crawl job.
863
+
864
+ Args:
865
+ id: Unique identifier for the crawl job
866
+
867
+ Returns:
868
+ CrawlStatusResponse containing:
869
+
870
+ Status Information:
871
+ * status - Current state (scraping/completed/failed/cancelled)
872
+ * completed - Number of pages crawled
873
+ * total - Total pages to crawl
874
+ * creditsUsed - API credits consumed
875
+ * expiresAt - Data expiration timestamp
876
+
877
+ Results:
878
+ * data - List of crawled documents
879
+ * next - URL for next page of results (if paginated)
880
+ * success - Whether status check succeeded
881
+ * error - Error message if failed
882
+
883
+ Raises:
884
+ Exception: If status check fails
885
+ """
886
+ endpoint = f'/v1/crawl/{id}'
887
+
888
+ headers = self._prepare_headers()
889
+ response = self._get_request(f'{self.api_url}{endpoint}', headers)
890
+ if response.status_code == 200:
891
+ try:
892
+ status_data = response.json()
893
+ except:
894
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
895
+ if status_data['status'] == 'completed':
896
+ if 'data' in status_data:
897
+ data = status_data['data']
898
+ while 'next' in status_data:
899
+ if len(status_data['data']) == 0:
900
+ break
901
+ next_url = status_data.get('next')
902
+ if not next_url:
903
+ logger.warning("Expected 'next' URL is missing.")
904
+ break
905
+ try:
906
+ status_response = self._get_request(next_url, headers)
907
+ if status_response.status_code != 200:
908
+ logger.error(f"Failed to fetch next page: {status_response.status_code}")
909
+ break
910
+ try:
911
+ next_data = status_response.json()
912
+ except:
913
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
914
+ data.extend(next_data.get('data', []))
915
+ status_data = next_data
916
+ except Exception as e:
917
+ logger.error(f"Error during pagination request: {e}")
918
+ break
919
+ status_data['data'] = data
920
+
921
+ response = {
922
+ 'status': status_data.get('status'),
923
+ 'total': status_data.get('total'),
924
+ 'completed': status_data.get('completed'),
925
+ 'creditsUsed': status_data.get('creditsUsed'),
926
+ 'expiresAt': status_data.get('expiresAt'),
927
+ 'data': status_data.get('data')
928
+ }
929
+
930
+ if 'error' in status_data:
931
+ response['error'] = status_data['error']
932
+
933
+ if 'next' in status_data:
934
+ response['next'] = status_data['next']
935
+
936
+ return CrawlStatusResponse(
937
+ success=False if 'error' in status_data else True,
938
+ **response
939
+ )
940
+ else:
941
+ self._handle_error(response, 'check crawl status')
942
+
943
+ def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
944
+ """
945
+ Returns information about crawl errors.
946
+
947
+ Args:
948
+ id (str): The ID of the crawl job
949
+
950
+ Returns:
951
+ CrawlErrorsResponse containing:
952
+ * errors (List[Dict[str, str]]): List of errors with fields:
953
+ - id (str): Error ID
954
+ - timestamp (str): When the error occurred
955
+ - url (str): URL that caused the error
956
+ - error (str): Error message
957
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
958
+
959
+ Raises:
960
+ Exception: If error check fails
961
+ """
962
+ headers = self._prepare_headers()
963
+ response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
964
+ if response.status_code == 200:
965
+ try:
966
+ return CrawlErrorsResponse(**response.json())
967
+ except:
968
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
969
+ else:
970
+ self._handle_error(response, "check crawl errors")
971
+
972
+ def cancel_crawl(self, id: str) -> Dict[str, Any]:
973
+ """
974
+ Cancel an asynchronous crawl job.
975
+
976
+ Args:
977
+ id (str): The ID of the crawl job to cancel
978
+
979
+ Returns:
980
+ Dict[str, Any] containing:
981
+ * success (bool): Whether cancellation was successful
982
+ * error (str, optional): Error message if cancellation failed
983
+
984
+ Raises:
985
+ Exception: If cancellation fails
986
+ """
987
+ headers = self._prepare_headers()
988
+ response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
989
+ if response.status_code == 200:
990
+ try:
991
+ return response.json()
992
+ except:
993
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
994
+ else:
995
+ self._handle_error(response, "cancel crawl job")
996
+
997
+ def crawl_url_and_watch(
998
+ self,
999
+ url: str,
1000
+ *,
1001
+ include_paths: Optional[List[str]] = None,
1002
+ exclude_paths: Optional[List[str]] = None,
1003
+ max_depth: Optional[int] = None,
1004
+ max_discovery_depth: Optional[int] = None,
1005
+ limit: Optional[int] = None,
1006
+ allow_backward_links: Optional[bool] = None,
1007
+ allow_external_links: Optional[bool] = None,
1008
+ ignore_sitemap: Optional[bool] = None,
1009
+ scrape_options: Optional[ScrapeOptions] = None,
1010
+ webhook: Optional[Union[str, WebhookConfig]] = None,
1011
+ deduplicate_similar_urls: Optional[bool] = None,
1012
+ ignore_query_parameters: Optional[bool] = None,
1013
+ regex_on_full_url: Optional[bool] = None,
1014
+ idempotency_key: Optional[str] = None,
1015
+ **kwargs
1016
+ ) -> 'CrawlWatcher':
1017
+ """
1018
+ Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
1019
+
1020
+ Args:
1021
+ url (str): Target URL to start crawling from
1022
+ include_paths (Optional[List[str]]): Patterns of URLs to include
1023
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
1024
+ max_depth (Optional[int]): Maximum crawl depth
1025
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
1026
+ limit (Optional[int]): Maximum pages to crawl
1027
+ allow_backward_links (Optional[bool]): Follow parent directory links
1028
+ allow_external_links (Optional[bool]): Follow external domain links
1029
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1030
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
1031
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
1032
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
1033
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
1034
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
1035
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1036
+ **kwargs: Additional parameters to pass to the API
1037
+
1038
+ Returns:
1039
+ CrawlWatcher: An instance to monitor the crawl job via WebSocket
1040
+
1041
+ Raises:
1042
+ Exception: If crawl job fails to start
1043
+ """
1044
+ crawl_response = self.async_crawl_url(
1045
+ url,
1046
+ include_paths=include_paths,
1047
+ exclude_paths=exclude_paths,
1048
+ max_depth=max_depth,
1049
+ max_discovery_depth=max_discovery_depth,
1050
+ limit=limit,
1051
+ allow_backward_links=allow_backward_links,
1052
+ allow_external_links=allow_external_links,
1053
+ ignore_sitemap=ignore_sitemap,
1054
+ scrape_options=scrape_options,
1055
+ webhook=webhook,
1056
+ deduplicate_similar_urls=deduplicate_similar_urls,
1057
+ ignore_query_parameters=ignore_query_parameters,
1058
+ regex_on_full_url=regex_on_full_url,
1059
+ idempotency_key=idempotency_key,
1060
+ **kwargs
1061
+ )
1062
+ if crawl_response.success and crawl_response.id:
1063
+ return CrawlWatcher(crawl_response.id, self)
1064
+ else:
1065
+ raise Exception("Crawl job failed to start")
1066
+
1067
+ def map_url(
1068
+ self,
1069
+ url: str,
1070
+ *,
1071
+ search: Optional[str] = None,
1072
+ ignore_sitemap: Optional[bool] = None,
1073
+ include_subdomains: Optional[bool] = None,
1074
+ sitemap_only: Optional[bool] = None,
1075
+ limit: Optional[int] = None,
1076
+ timeout: Optional[int] = None,
1077
+ **kwargs) -> MapResponse:
1078
+ """
1079
+ Map and discover links from a URL.
1080
+
1081
+ Args:
1082
+ url (str): Target URL to map
1083
+ search (Optional[str]): Filter pattern for URLs
1084
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1085
+ include_subdomains (Optional[bool]): Include subdomain links
1086
+ sitemap_only (Optional[bool]): Only use sitemap.xml
1087
+ limit (Optional[int]): Maximum URLs to return
1088
+ timeout (Optional[int]): Request timeout in milliseconds
1089
+ **kwargs: Additional parameters to pass to the API
1090
+
1091
+ Returns:
1092
+ MapResponse: Response containing:
1093
+ * success (bool): Whether request succeeded
1094
+ * links (List[str]): Discovered URLs
1095
+ * error (Optional[str]): Error message if any
1096
+
1097
+ Raises:
1098
+ Exception: If mapping fails or response cannot be parsed
1099
+ """
1100
+ # Validate any additional kwargs
1101
+ self._validate_kwargs(kwargs, "map_url")
1102
+
1103
+ # Build map parameters
1104
+ map_params = {}
1105
+
1106
+ # Add individual parameters
1107
+ if search is not None:
1108
+ map_params['search'] = search
1109
+ if ignore_sitemap is not None:
1110
+ map_params['ignoreSitemap'] = ignore_sitemap
1111
+ if include_subdomains is not None:
1112
+ map_params['includeSubdomains'] = include_subdomains
1113
+ if sitemap_only is not None:
1114
+ map_params['sitemapOnly'] = sitemap_only
1115
+ if limit is not None:
1116
+ map_params['limit'] = limit
1117
+ if timeout is not None:
1118
+ map_params['timeout'] = timeout
1119
+
1120
+ # Add any additional kwargs
1121
+ map_params.update(kwargs)
1122
+
1123
+ # Create final params object
1124
+ final_params = MapParams(**map_params)
1125
+ params_dict = final_params.dict(exclude_none=True)
1126
+ params_dict['url'] = url
1127
+ params_dict['origin'] = f"python-sdk@{version}"
1128
+
1129
+ # Make request
1130
+ response = requests.post(
1131
+ f"{self.api_url}/v1/map",
1132
+ headers={"Authorization": f"Bearer {self.api_key}"},
1133
+ json=params_dict
1134
+ )
1135
+
1136
+ if response.status_code == 200:
1137
+ try:
1138
+ response_json = response.json()
1139
+ if response_json.get('success') and 'links' in response_json:
1140
+ return MapResponse(**response_json)
1141
+ elif "error" in response_json:
1142
+ raise Exception(f'Map failed. Error: {response_json["error"]}')
1143
+ else:
1144
+ raise Exception(f'Map failed. Error: {response_json}')
1145
+ except ValueError:
1146
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1147
+ else:
1148
+ self._handle_error(response, 'map')
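
A link-discovery sketch, assuming app as above; the URL and search filter are placeholders.

mapped = app.map_url("https://example.com", search="docs", limit=100)
for link in (mapped.links or []):
    print(link)
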
1149
+
1150
+ def batch_scrape_urls(
1151
+ self,
1152
+ urls: List[str],
1153
+ *,
1154
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1155
+ headers: Optional[Dict[str, str]] = None,
1156
+ include_tags: Optional[List[str]] = None,
1157
+ exclude_tags: Optional[List[str]] = None,
1158
+ only_main_content: Optional[bool] = None,
1159
+ wait_for: Optional[int] = None,
1160
+ timeout: Optional[int] = None,
1161
+ location: Optional[LocationConfig] = None,
1162
+ mobile: Optional[bool] = None,
1163
+ skip_tls_verification: Optional[bool] = None,
1164
+ remove_base64_images: Optional[bool] = None,
1165
+ block_ads: Optional[bool] = None,
1166
+ proxy: Optional[Literal["basic", "stealth"]] = None,
1167
+ extract: Optional[JsonConfig] = None,
1168
+ json_options: Optional[JsonConfig] = None,
1169
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1170
+ agent: Optional[AgentOptions] = None,
1171
+ poll_interval: Optional[int] = 2,
1172
+ idempotency_key: Optional[str] = None,
1173
+ **kwargs
1174
+ ) -> BatchScrapeStatusResponse:
1175
+ """
1176
+ Batch scrape multiple URLs and monitor until completion.
1177
+
1178
+ Args:
1179
+ urls (List[str]): URLs to scrape
1180
+ formats (Optional[List[Literal]]): Content formats to retrieve
1181
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1182
+ include_tags (Optional[List[str]]): HTML tags to include
1183
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1184
+ only_main_content (Optional[bool]): Extract main content only
1185
+ wait_for (Optional[int]): Wait time in milliseconds
1186
+ timeout (Optional[int]): Request timeout in milliseconds
1187
+ location (Optional[LocationConfig]): Location configuration
1188
+ mobile (Optional[bool]): Use mobile user agent
1189
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1190
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1191
+ block_ads (Optional[bool]): Block advertisements
1192
+ proxy (Optional[Literal]): Proxy type to use
1193
+ extract (Optional[JsonConfig]): Content extraction config
1194
+ json_options (Optional[JsonConfig]): JSON extraction config
1195
+ actions (Optional[List[Union]]): Actions to perform
1196
+ agent (Optional[AgentOptions]): Agent configuration
1197
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
1198
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1199
+ **kwargs: Additional parameters to pass to the API
1200
+
1201
+ Returns:
1202
+ BatchScrapeStatusResponse with:
1203
+ * Scraping status and progress
1204
+ * Scraped content for each URL
1205
+ * Success/error information
1206
+
1207
+ Raises:
1208
+ Exception: If batch scrape fails
1209
+ """
1210
+ # Validate any additional kwargs
1211
+ self._validate_kwargs(kwargs, "batch_scrape_urls")
1212
+
1213
+ scrape_params = {}
1214
+
1215
+ # Add individual parameters
1216
+ if formats is not None:
1217
+ scrape_params['formats'] = formats
1218
+ if headers is not None:
1219
+ scrape_params['headers'] = headers
1220
+ if include_tags is not None:
1221
+ scrape_params['includeTags'] = include_tags
1222
+ if exclude_tags is not None:
1223
+ scrape_params['excludeTags'] = exclude_tags
1224
+ if only_main_content is not None:
1225
+ scrape_params['onlyMainContent'] = only_main_content
1226
+ if wait_for is not None:
1227
+ scrape_params['waitFor'] = wait_for
1228
+ if timeout is not None:
1229
+ scrape_params['timeout'] = timeout
1230
+ if location is not None:
1231
+ scrape_params['location'] = location.dict(exclude_none=True)
1232
+ if mobile is not None:
1233
+ scrape_params['mobile'] = mobile
1234
+ if skip_tls_verification is not None:
1235
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1236
+ if remove_base64_images is not None:
1237
+ scrape_params['removeBase64Images'] = remove_base64_images
1238
+ if block_ads is not None:
1239
+ scrape_params['blockAds'] = block_ads
1240
+ if proxy is not None:
1241
+ scrape_params['proxy'] = proxy
1242
+ if extract is not None:
1243
+ if hasattr(extract.schema, 'schema'):
1244
+ extract.schema = extract.schema.schema()
1245
+ scrape_params['extract'] = extract.dict(exclude_none=True)
1246
+ if json_options is not None:
1247
+ if hasattr(json_options.schema, 'schema'):
1248
+ json_options.schema = json_options.schema.schema()
1249
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
1250
+ if actions is not None:
1251
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1252
+ if agent is not None:
1253
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1254
+
1255
+ # Add any additional kwargs
1256
+ scrape_params.update(kwargs)
1257
+
1258
+ # Create final params object
1259
+ final_params = ScrapeParams(**scrape_params)
1260
+ params_dict = final_params.dict(exclude_none=True)
1261
+ params_dict['urls'] = urls
1262
+ params_dict['origin'] = f"python-sdk@{version}"
1263
+
1264
+ # Make request
1265
+ headers = self._prepare_headers(idempotency_key)
1266
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1267
+
1268
+ if response.status_code == 200:
1269
+ try:
1270
+ id = response.json().get('id')
1271
+ except:
1272
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1273
+ return self._monitor_job_status(id, headers, poll_interval)
1274
+ else:
1275
+ self._handle_error(response, 'start batch scrape job')
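
A batch sketch over placeholder URLs, assuming app as above; like crawl_url, this call blocks until the whole batch finishes and returns the scraped documents.

batch = app.batch_scrape_urls(
    ["https://example.com/a", "https://example.com/b"],
    formats=["markdown"],
    only_main_content=True,
)
print(batch.status, batch.completed, "/", batch.total)
for document in batch.data:
    print(document.markdown[:80] if document.markdown else None)
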
1276
+
1277
+ def async_batch_scrape_urls(
1278
+ self,
1279
+ urls: List[str],
1280
+ *,
1281
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1282
+ headers: Optional[Dict[str, str]] = None,
1283
+ include_tags: Optional[List[str]] = None,
1284
+ exclude_tags: Optional[List[str]] = None,
1285
+ only_main_content: Optional[bool] = None,
1286
+ wait_for: Optional[int] = None,
1287
+ timeout: Optional[int] = None,
1288
+ location: Optional[LocationConfig] = None,
1289
+ mobile: Optional[bool] = None,
1290
+ skip_tls_verification: Optional[bool] = None,
1291
+ remove_base64_images: Optional[bool] = None,
1292
+ block_ads: Optional[bool] = None,
1293
+ proxy: Optional[Literal["basic", "stealth"]] = None,
1294
+ extract: Optional[JsonConfig] = None,
1295
+ json_options: Optional[JsonConfig] = None,
1296
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1297
+ agent: Optional[AgentOptions] = None,
1298
+ idempotency_key: Optional[str] = None,
1299
+ **kwargs
1300
+ ) -> BatchScrapeResponse:
1301
+ """
1302
+ Initiate a batch scrape job asynchronously.
1303
+
1304
+ Args:
1305
+ urls (List[str]): URLs to scrape
1306
+ formats (Optional[List[Literal]]): Content formats to retrieve
1307
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1308
+ include_tags (Optional[List[str]]): HTML tags to include
1309
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1310
+ only_main_content (Optional[bool]): Extract main content only
1311
+ wait_for (Optional[int]): Wait time in milliseconds
1312
+ timeout (Optional[int]): Request timeout in milliseconds
1313
+ location (Optional[LocationConfig]): Location configuration
1314
+ mobile (Optional[bool]): Use mobile user agent
1315
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1316
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1317
+ block_ads (Optional[bool]): Block advertisements
1318
+ proxy (Optional[Literal]): Proxy type to use
1319
+ extract (Optional[JsonConfig]): Content extraction config
1320
+ json_options (Optional[JsonConfig]): JSON extraction config
1321
+ actions (Optional[List[Union]]): Actions to perform
1322
+ agent (Optional[AgentOptions]): Agent configuration
1323
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1324
+ **kwargs: Additional parameters to pass to the API
1325
+
1326
+ Returns:
1327
+ BatchScrapeResponse with:
1328
+ * success - Whether job started successfully
1329
+ * id - Unique identifier for the job
1330
+ * url - Status check URL
1331
+ * error - Error message if start failed
1332
+
1333
+ Raises:
1334
+ Exception: If job initiation fails
1335
+ """
1336
+ # Validate any additional kwargs
1337
+ self._validate_kwargs(kwargs, "async_batch_scrape_urls")
1338
+
1339
+ scrape_params = {}
1340
+
1341
+ # Add individual parameters
1342
+ if formats is not None:
1343
+ scrape_params['formats'] = formats
1344
+ if headers is not None:
1345
+ scrape_params['headers'] = headers
1346
+ if include_tags is not None:
1347
+ scrape_params['includeTags'] = include_tags
1348
+ if exclude_tags is not None:
1349
+ scrape_params['excludeTags'] = exclude_tags
1350
+ if only_main_content is not None:
1351
+ scrape_params['onlyMainContent'] = only_main_content
1352
+ if wait_for is not None:
1353
+ scrape_params['waitFor'] = wait_for
1354
+ if timeout is not None:
1355
+ scrape_params['timeout'] = timeout
1356
+ if location is not None:
1357
+ scrape_params['location'] = location.dict(exclude_none=True)
1358
+ if mobile is not None:
1359
+ scrape_params['mobile'] = mobile
1360
+ if skip_tls_verification is not None:
1361
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1362
+ if remove_base64_images is not None:
1363
+ scrape_params['removeBase64Images'] = remove_base64_images
1364
+ if block_ads is not None:
1365
+ scrape_params['blockAds'] = block_ads
1366
+ if proxy is not None:
1367
+ scrape_params['proxy'] = proxy
1368
+ if extract is not None:
1369
+ if hasattr(extract.schema, 'schema'):
1370
+ extract.schema = extract.schema.schema()
1371
+ scrape_params['extract'] = extract.dict(exclude_none=True)
1372
+ if json_options is not None:
1373
+ if hasattr(json_options.schema, 'schema'):
1374
+ json_options.schema = json_options.schema.schema()
1375
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
1376
+ if actions is not None:
1377
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1378
+ if agent is not None:
1379
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1380
+
1381
+ # Add any additional kwargs
1382
+ scrape_params.update(kwargs)
1383
+
1384
+ # Create final params object
1385
+ final_params = ScrapeParams(**scrape_params)
1386
+ params_dict = final_params.dict(exclude_none=True)
1387
+ params_dict['urls'] = urls
1388
+ params_dict['origin'] = f"python-sdk@{version}"
1389
+
1390
+ # Make request
1391
+ headers = self._prepare_headers(idempotency_key)
1392
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1393
+
1394
+ if response.status_code == 200:
1395
+ try:
1396
+ return BatchScrapeResponse(**response.json())
1397
+ except:
1398
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1399
+ else:
1400
+ self._handle_error(response, 'start batch scrape job')
1401
+
1402
+ def batch_scrape_urls_and_watch(
1403
+ self,
1404
+ urls: List[str],
1405
+ *,
1406
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1407
+ headers: Optional[Dict[str, str]] = None,
1408
+ include_tags: Optional[List[str]] = None,
1409
+ exclude_tags: Optional[List[str]] = None,
1410
+ only_main_content: Optional[bool] = None,
1411
+ wait_for: Optional[int] = None,
1412
+ timeout: Optional[int] = None,
1413
+ location: Optional[LocationConfig] = None,
1414
+ mobile: Optional[bool] = None,
1415
+ skip_tls_verification: Optional[bool] = None,
1416
+ remove_base64_images: Optional[bool] = None,
1417
+ block_ads: Optional[bool] = None,
1418
+ proxy: Optional[Literal["basic", "stealth"]] = None,
1419
+ extract: Optional[JsonConfig] = None,
1420
+ json_options: Optional[JsonConfig] = None,
1421
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1422
+ agent: Optional[AgentOptions] = None,
1423
+ idempotency_key: Optional[str] = None,
1424
+ **kwargs
1425
+ ) -> 'CrawlWatcher':
1426
+ """
1427
+ Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
1428
+
1429
+ Args:
1430
+ urls (List[str]): URLs to scrape
1431
+ formats (Optional[List[Literal]]): Content formats to retrieve
1432
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1433
+ include_tags (Optional[List[str]]): HTML tags to include
1434
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1435
+ only_main_content (Optional[bool]): Extract main content only
1436
+ wait_for (Optional[int]): Wait time in milliseconds
1437
+ timeout (Optional[int]): Request timeout in milliseconds
1438
+ location (Optional[LocationConfig]): Location configuration
1439
+ mobile (Optional[bool]): Use mobile user agent
1440
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1441
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1442
+ block_ads (Optional[bool]): Block advertisements
1443
+ proxy (Optional[Literal]): Proxy type to use
1444
+ extract (Optional[JsonConfig]): Content extraction config
1445
+ json_options (Optional[JsonConfig]): JSON extraction config
1446
+ actions (Optional[List[Union]]): Actions to perform
1447
+ agent (Optional[AgentOptions]): Agent configuration
1448
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1449
+ **kwargs: Additional parameters to pass to the API
1450
+
1451
+ Returns:
1452
+ CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
1453
+
1454
+ Raises:
1455
+ Exception: If batch scrape job fails to start
1456
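+
+ Example (illustrative sketch, not part of the SDK itself; assumes the
+ FIRECRAWL_API_KEY environment variable is set for the default constructor):
+
+ import asyncio
+
+ app = FirecrawlApp()
+ watcher = app.batch_scrape_urls_and_watch(["https://example.com"], formats=["markdown"])
+ watcher.add_event_listener("document", lambda event: print(event["data"].get("markdown", "")[:80]))
+ watcher.add_event_listener("done", lambda event: print("finished:", len(event["data"]), "pages"))
+ asyncio.run(watcher.connect())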
+ """
1457
+ # Validate any additional kwargs
1458
+ self._validate_kwargs(kwargs, "batch_scrape_urls_and_watch")
1459
+
1460
+ scrape_params = {}
1461
+
1462
+ # Add individual parameters
1463
+ if formats is not None:
1464
+ scrape_params['formats'] = formats
1465
+ if headers is not None:
1466
+ scrape_params['headers'] = headers
1467
+ if include_tags is not None:
1468
+ scrape_params['includeTags'] = include_tags
1469
+ if exclude_tags is not None:
1470
+ scrape_params['excludeTags'] = exclude_tags
1471
+ if only_main_content is not None:
1472
+ scrape_params['onlyMainContent'] = only_main_content
1473
+ if wait_for is not None:
1474
+ scrape_params['waitFor'] = wait_for
1475
+ if timeout is not None:
1476
+ scrape_params['timeout'] = timeout
1477
+ if location is not None:
1478
+ scrape_params['location'] = location.dict(exclude_none=True)
1479
+ if mobile is not None:
1480
+ scrape_params['mobile'] = mobile
1481
+ if skip_tls_verification is not None:
1482
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1483
+ if remove_base64_images is not None:
1484
+ scrape_params['removeBase64Images'] = remove_base64_images
1485
+ if block_ads is not None:
1486
+ scrape_params['blockAds'] = block_ads
1487
+ if proxy is not None:
1488
+ scrape_params['proxy'] = proxy
1489
+ if extract is not None:
1490
+ if hasattr(extract.schema, 'schema'):
1491
+ extract.schema = extract.schema.schema()
1492
+ scrape_params['extract'] = extract.dict(exclude_none=True)
1493
+ if json_options is not None:
1494
+ if hasattr(json_options.schema, 'schema'):
1495
+ json_options.schema = json_options.schema.schema()
1496
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
1497
+ if actions is not None:
1498
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1499
+ if agent is not None:
1500
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1501
+
1502
+ # Add any additional kwargs
1503
+ scrape_params.update(kwargs)
1504
+
1505
+ # Create final params object
1506
+ final_params = ScrapeParams(**scrape_params)
1507
+ params_dict = final_params.dict(exclude_none=True)
1508
+ params_dict['urls'] = urls
1509
+ params_dict['origin'] = f"python-sdk@{version}"
1510
+
1511
+ # Make request
1512
+ headers = self._prepare_headers(idempotency_key)
1513
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1514
+
1515
+ if response.status_code == 200:
1516
+ try:
1517
+ crawl_response = BatchScrapeResponse(**response.json())
1518
+ if crawl_response.success and crawl_response.id:
1519
+ return CrawlWatcher(crawl_response.id, self)
1520
+ else:
1521
+ raise Exception("Batch scrape job failed to start")
1522
+ except (ValueError, pydantic.ValidationError): # narrowed so the explicit "failed to start" error above is not swallowed
1523
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1524
+ else:
1525
+ self._handle_error(response, 'start batch scrape job')
1526
+
1527
+ def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
1528
+ """
1529
+ Check the status of a batch scrape job using the Firecrawl API.
1530
+
1531
+ Args:
1532
+ id (str): The ID of the batch scrape job.
1533
+
1534
+ Returns:
1535
+ BatchScrapeStatusResponse: The status of the batch scrape job.
1536
+
1537
+ Raises:
1538
+ Exception: If the status check request fails.
1539
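+
+ Example (illustrative sketch; assumes `app` is a configured FirecrawlApp and the
+ job id is a placeholder):
+
+ status = app.check_batch_scrape_status("batch-job-id")
+ print(f"{status.status}: {status.completed}/{status.total} pages scraped")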
+ """
1540
+ endpoint = f'/v1/batch/scrape/{id}'
1541
+
1542
+ headers = self._prepare_headers()
1543
+ response = self._get_request(f'{self.api_url}{endpoint}', headers)
1544
+ if response.status_code == 200:
1545
+ try:
1546
+ status_data = response.json()
1547
+ except:
1548
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1549
+ if status_data['status'] == 'completed':
1550
+ if 'data' in status_data:
1551
+ data = status_data['data']
1552
+ while 'next' in status_data:
1553
+ if len(status_data['data']) == 0:
1554
+ break
1555
+ next_url = status_data.get('next')
1556
+ if not next_url:
1557
+ logger.warning("Expected 'next' URL is missing.")
1558
+ break
1559
+ try:
1560
+ status_response = self._get_request(next_url, headers)
1561
+ if status_response.status_code != 200:
1562
+ logger.error(f"Failed to fetch next page: {status_response.status_code}")
1563
+ break
1564
+ try:
1565
+ next_data = status_response.json()
1566
+ except:
1567
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1568
+ data.extend(next_data.get('data', []))
1569
+ status_data = next_data
1570
+ except Exception as e:
1571
+ logger.error(f"Error during pagination request: {e}")
1572
+ break
1573
+ status_data['data'] = data
1574
+
1575
+ return BatchScrapeStatusResponse(**{
1576
+ 'success': False if 'error' in status_data else True,
1577
+ 'status': status_data.get('status'),
1578
+ 'total': status_data.get('total'),
1579
+ 'completed': status_data.get('completed'),
1580
+ 'creditsUsed': status_data.get('creditsUsed'),
1581
+ 'expiresAt': status_data.get('expiresAt'),
1582
+ 'data': status_data.get('data'),
1583
+ 'next': status_data.get('next'),
1584
+ 'error': status_data.get('error')
1585
+ })
1586
+ else:
1587
+ self._handle_error(response, 'check batch scrape status')
1588
+
1589
+ def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
1590
+ """
1591
+ Returns information about batch scrape errors.
1592
+
1593
+ Args:
1594
+ id (str): The ID of the crawl job.
1595
+
1596
+ Returns:
1597
+ CrawlErrorsResponse: A response containing:
1598
+ * errors (List[Dict[str, str]]): List of errors with fields:
1599
+ * id (str): Error ID
1600
+ * timestamp (str): When the error occurred
1601
+ * url (str): URL that caused the error
1602
+ * error (str): Error message
1603
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1604
+
1605
+ Raises:
1606
+ Exception: If the error check request fails
1607
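+
+ Example (illustrative sketch; field access follows the response layout described above):
+
+ errors = app.check_batch_scrape_errors("batch-job-id")
+ for err in errors.errors:
+     print(err.get("url"), "->", err.get("error"))
+ print("Blocked by robots.txt:", errors.robotsBlocked)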
+ """
1608
+ headers = self._prepare_headers()
1609
+ response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
1610
+ if response.status_code == 200:
1611
+ try:
1612
+ return CrawlErrorsResponse(**response.json())
1613
+ except:
1614
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1615
+ else:
1616
+ self._handle_error(response, "check batch scrape errors")
1617
+
1618
+ def extract(
1619
+ self,
1620
+ urls: Optional[List[str]] = None,
1621
+ *,
1622
+ prompt: Optional[str] = None,
1623
+ schema: Optional[Any] = None,
1624
+ system_prompt: Optional[str] = None,
1625
+ allow_external_links: Optional[bool] = False,
1626
+ enable_web_search: Optional[bool] = False,
1627
+ show_sources: Optional[bool] = False,
1628
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1629
+ """
1630
+ Extract structured information from URLs.
1631
+
1632
+ Args:
1633
+ urls (Optional[List[str]]): URLs to extract from
1634
+ prompt (Optional[str]): Custom extraction prompt
1635
+ schema (Optional[Any]): JSON schema/Pydantic model
1636
+ system_prompt (Optional[str]): System context
1637
+ allow_external_links (Optional[bool]): Follow external links
1638
+ enable_web_search (Optional[bool]): Enable web search
1639
+ show_sources (Optional[bool]): Include source URLs
1640
+ agent (Optional[Dict[str, Any]]): Agent configuration
1641
+
1642
+ Returns:
1643
+ ExtractResponse[Any] with:
1644
+ * success (bool): Whether request succeeded
1645
+ * data (Optional[Any]): Extracted data matching schema
1646
+ * error (Optional[str]): Error message if any
1647
+
1648
+ Raises:
1649
+ ValueError: If prompt/schema missing or extraction fails
1650
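+
+ Example (illustrative sketch; assumes `app` is a configured FirecrawlApp, and the
+ URL and schema are placeholders):
+
+ from pydantic import BaseModel
+
+ class ArticleInfo(BaseModel):
+     title: str
+     summary: str
+
+ result = app.extract(
+     urls=["https://example.com/blog/post"],
+     prompt="Extract the article title and a one-sentence summary.",
+     schema=ArticleInfo,
+ )
+ if result.success:
+     print(result.data)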
+ """
1651
+ headers = self._prepare_headers()
1652
+
1653
+ if not prompt and not schema:
1654
+ raise ValueError("Either prompt or schema is required")
1655
+
1656
+ if not urls and not prompt:
1657
+ raise ValueError("Either urls or prompt is required")
1658
+
1659
+ if schema:
1660
+ if hasattr(schema, 'model_json_schema'):
1661
+ # Convert Pydantic model to JSON schema
1662
+ schema = schema.model_json_schema()
1663
+ # Otherwise assume it's already a JSON schema dict
1664
+
1665
+ request_data = {
1666
+ 'urls': urls or [],
1667
+ 'allowExternalLinks': allow_external_links,
1668
+ 'enableWebSearch': enable_web_search,
1669
+ 'showSources': show_sources,
1670
+ 'schema': schema,
1671
+ 'origin': f'python-sdk@{get_version()}'
1672
+ }
1673
+
1674
+ # Only add prompt and systemPrompt if they exist
1675
+ if prompt:
1676
+ request_data['prompt'] = prompt
1677
+ if system_prompt:
1678
+ request_data['systemPrompt'] = system_prompt
1679
+
1680
+ if agent:
1681
+ request_data['agent'] = agent
1682
+
1683
+ try:
1684
+ # Send the initial extract request
1685
+ response = self._post_request(
1686
+ f'{self.api_url}/v1/extract',
1687
+ request_data,
1688
+ headers
1689
+ )
1690
+ if response.status_code == 200:
1691
+ try:
1692
+ data = response.json()
1693
+ except:
1694
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1695
+ if data['success']:
1696
+ job_id = data.get('id')
1697
+ if not job_id:
1698
+ raise Exception('Job ID not returned from extract request.')
1699
+
1700
+ # Poll for the extract status
1701
+ while True:
1702
+ status_response = self._get_request(
1703
+ f'{self.api_url}/v1/extract/{job_id}',
1704
+ headers
1705
+ )
1706
+ if status_response.status_code == 200:
1707
+ try:
1708
+ status_data = status_response.json()
1709
+ except:
1710
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1711
+ if status_data['status'] == 'completed':
1712
+ return ExtractResponse(**status_data)
1713
+ elif status_data['status'] in ['failed', 'cancelled']:
1714
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
1715
+ else:
1716
+ self._handle_error(status_response, "extract-status")
1717
+
1718
+ time.sleep(2) # Polling interval
1719
+ else:
1720
+ raise Exception(f'Failed to extract. Error: {data["error"]}')
1721
+ else:
1722
+ self._handle_error(response, "extract")
1723
+ except Exception as e:
1724
+ raise ValueError(str(e), 500)
1725
+
1726
+ return ExtractResponse(success=False, error="Internal server error.")
1727
+
1728
+ def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
1729
+ """
1730
+ Retrieve the status of an extract job.
1731
+
1732
+ Args:
1733
+ job_id (str): The ID of the extract job.
1734
+
1735
+ Returns:
1736
+ ExtractResponse[Any]: The status of the extract job.
1737
+
1738
+ Raises:
1739
+ ValueError: If there is an error retrieving the status.
1740
+ """
1741
+ headers = self._prepare_headers()
1742
+ try:
1743
+ response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
1744
+ if response.status_code == 200:
1745
+ try:
1746
+ return ExtractResponse(**response.json())
1747
+ except:
1748
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1749
+ else:
1750
+ self._handle_error(response, "get extract status")
1751
+ except Exception as e:
1752
+ raise ValueError(str(e), 500)
1753
+
1754
+ def async_extract(
1755
+ self,
1756
+ urls: Optional[List[str]] = None,
1757
+ *,
1758
+ prompt: Optional[str] = None,
1759
+ schema: Optional[Any] = None,
1760
+ system_prompt: Optional[str] = None,
1761
+ allow_external_links: Optional[bool] = False,
1762
+ enable_web_search: Optional[bool] = False,
1763
+ show_sources: Optional[bool] = False,
1764
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1765
+ """
1766
+ Initiate an asynchronous extract job.
1767
+
1768
+ Args:
1769
+ urls (Optional[List[str]]): URLs to extract information from
1770
+ prompt (Optional[str]): Custom extraction prompt
1771
+ schema (Optional[Any]): JSON schema/Pydantic model
1772
+ system_prompt (Optional[str]): System context
1773
+ allow_external_links (Optional[bool]): Follow external links
1774
+ enable_web_search (Optional[bool]): Enable web search
1775
+ show_sources (Optional[bool]): Include source URLs
1776
+ agent (Optional[Dict[str, Any]]): Agent configuration
1777
1778
+
1779
+ Returns:
1780
+ ExtractResponse[Any] with:
1781
+ * success (bool): Whether request succeeded
1782
+ * data (Optional[Any]): Extracted data matching schema
1783
+ * error (Optional[str]): Error message if any
1784
+
1785
+ Raises:
1786
+ ValueError: If job initiation fails
1787
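+
+ Example (illustrative sketch; assumes `app` is a configured FirecrawlApp and that the
+ returned ExtractResponse exposes the job id as `.id`):
+
+ started = app.async_extract(
+     urls=["https://example.com"],
+     prompt="List the page's main navigation links.",
+ )
+ if started.success and started.id:
+     print(app.get_extract_status(started.id))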
+ """
1788
+ headers = self._prepare_headers()
1789
+
1790
1791
+ if schema:
1792
+ if hasattr(schema, 'model_json_schema'):
1793
+ # Convert Pydantic model to JSON schema
1794
+ schema = schema.model_json_schema()
1795
+ # Otherwise assume it's already a JSON schema dict
1796
+
1797
+ request_data = {
1798
+ 'urls': urls,
1799
+ 'allowExternalLinks': allow_external_links,
1800
+ 'enableWebSearch': enable_web_search,
1801
+ 'showSources': show_sources,
1802
+ 'schema': schema,
1803
+ 'origin': f'python-sdk@{version}'
1804
+ }
1805
+
1806
+ if prompt:
1807
+ request_data['prompt'] = prompt
1808
+ if system_prompt:
1809
+ request_data['systemPrompt'] = system_prompt
1810
+ if agent:
1811
+ request_data['agent'] = agent
1812
+
1813
+ try:
1814
+ response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
1815
+ if response.status_code == 200:
1816
+ try:
1817
+ return ExtractResponse(**response.json())
1818
+ except:
1819
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1820
+ else:
1821
+ self._handle_error(response, "async extract")
1822
+ except Exception as e:
1823
+ raise ValueError(str(e), 500)
1824
+
1825
+ def generate_llms_text(
1826
+ self,
1827
+ url: str,
1828
+ *,
1829
+ max_urls: Optional[int] = None,
1830
+ show_full_text: Optional[bool] = None,
1831
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
1832
+ """
1833
+ Generate LLMs.txt for a given URL and poll until completion.
1834
+
1835
+ Args:
1836
+ url (str): Target URL to generate LLMs.txt from
1837
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
1838
+ show_full_text (Optional[bool]): Include full text in output (default: False)
1839
+ experimental_stream (Optional[bool]): Enable experimental streaming
1840
+
1841
+ Returns:
1842
+ GenerateLLMsTextStatusResponse with:
1843
+ * Generated LLMs.txt content
1844
+ * Full version if requested
1845
+ * Generation status
1846
+ * Success/error information
1847
+
1848
+ Raises:
1849
+ Exception: If generation fails
1850
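+
+ Example (illustrative sketch; assumes `app` is a configured FirecrawlApp and that the
+ completed response carries the generated text under data['llmstxt'], as documented for
+ check_generate_llms_text_status below):
+
+ result = app.generate_llms_text("https://example.com", max_urls=5, show_full_text=True)
+ if result.success and result.data:
+     print(result.data.get("llmstxt", "")[:200])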
+ """
1851
+ params = GenerateLLMsTextParams(
1852
+ maxUrls=max_urls,
1853
+ showFullText=show_full_text,
1854
+ __experimental_stream=experimental_stream
1855
+ )
1856
+
1857
+ response = self.async_generate_llms_text(
1858
+ url,
1859
+ max_urls=max_urls,
1860
+ show_full_text=show_full_text,
1861
+ experimental_stream=experimental_stream
1862
+ )
1863
+
1864
+ if not response.success or not response.id:
1865
+ return GenerateLLMsTextStatusResponse(
1866
+ success=False,
1867
+ error='Failed to start LLMs.txt generation',
1868
+ status='failed',
1869
+ expiresAt=''
1870
+ )
1871
+
1872
+ job_id = response.id
1873
+ while True:
1874
+ status = self.check_generate_llms_text_status(job_id)
1875
+
1876
+ if status.status == 'completed':
1877
+ return status
1878
+ elif status.status == 'failed':
1879
+ return status
1880
+ elif status.status != 'processing':
1881
+ return GenerateLLMsTextStatusResponse(
1882
+ success=False,
1883
+ error='LLMs.txt generation job terminated unexpectedly',
1884
+ status='failed',
1885
+ expiresAt=''
1886
+ )
1887
+
1888
+ time.sleep(2) # Polling interval
1889
+
1890
+ def async_generate_llms_text(
1891
+ self,
1892
+ url: str,
1893
+ *,
1894
+ max_urls: Optional[int] = None,
1895
+ show_full_text: Optional[bool] = None,
1896
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
1897
+ """
1898
+ Initiate an asynchronous LLMs.txt generation operation.
1899
+
1900
+ Args:
1901
+ url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
1902
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
1903
+ show_full_text (Optional[bool]): Include full text in output (default: False)
1904
+ experimental_stream (Optional[bool]): Enable experimental streaming
1905
+
1906
+ Returns:
1907
+ GenerateLLMsTextResponse: A response containing:
1908
+ * success (bool): Whether the generation initiation was successful
1909
+ * id (str): The unique identifier for the generation job
1910
+ * error (str, optional): Error message if initiation failed
1911
+
1912
+ Raises:
1913
+ Exception: If the generation job initiation fails.
1914
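+
+ Example (illustrative sketch; assumes `app` is a configured FirecrawlApp):
+
+ job = app.async_generate_llms_text("https://example.com", max_urls=5)
+ if job.success and job.id:
+     status = app.check_generate_llms_text_status(job.id)
+     print(status.status)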
+ """
1915
+ params = GenerateLLMsTextParams(
1916
+ maxUrls=max_urls,
1917
+ showFullText=show_full_text,
1918
+ __experimental_stream=experimental_stream
1919
+ )
1920
+
1921
+ headers = self._prepare_headers()
1922
+ json_data = {'url': url, **params.dict(exclude_none=True)}
1923
+ json_data['origin'] = f"python-sdk@{version}"
1924
+
1925
+ try:
1926
+ req = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
1927
+ response = req.json()
1928
+ print("json_data", json_data)
1929
+ print("response", response)
1930
+ if response.get('success'):
1931
+ try:
1932
+ return GenerateLLMsTextResponse(**response)
1933
+ except:
1934
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1935
+ else:
1936
+ self._handle_error(req, 'start LLMs.txt generation')
1937
+ except Exception as e:
1938
+ raise ValueError(str(e))
1939
+
1940
+ return GenerateLLMsTextResponse(
1941
+ success=False,
1942
+ error='Internal server error'
1943
+ )
1944
+
1945
+ def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
1946
+ """
1947
+ Check the status of a LLMs.txt generation operation.
1948
+
1949
+ Args:
1950
+ id (str): The unique identifier of the LLMs.txt generation job to check status for.
1951
+
1952
+ Returns:
1953
+ GenerateLLMsTextStatusResponse: A response containing:
1954
+ * success (bool): Whether the generation was successful
1955
+ * status (str): Status of generation ("processing", "completed", "failed")
1956
+ * data (Dict[str, str], optional): Generated text with fields:
1957
+ * llmstxt (str): Generated LLMs.txt content
1958
+ * llmsfulltxt (str, optional): Full version if requested
1959
+ * error (str, optional): Error message if generation failed
1960
+ * expiresAt (str): When the generated data expires
1961
+
1962
+ Raises:
1963
+ Exception: If the status check fails.
1964
+ """
1965
+ headers = self._prepare_headers()
1966
+ try:
1967
+ response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers)
1968
+ if response.status_code == 200:
1969
+ try:
1970
+ json_data = response.json()
1971
+ return GenerateLLMsTextStatusResponse(**json_data)
1972
+ except Exception as e:
1973
+ raise Exception(f'Failed to parse Firecrawl response as GenerateLLMsTextStatusResponse: {str(e)}')
1974
+ elif response.status_code == 404:
1975
+ raise Exception('LLMs.txt generation job not found')
1976
+ else:
1977
+ self._handle_error(response, 'check LLMs.txt generation status')
1978
+ except Exception as e:
1979
+ raise ValueError(str(e))
1980
+
1981
+ return GenerateLLMsTextStatusResponse(success=False, error='Internal server error', status='failed', expiresAt='')
1982
+
1983
+ def _prepare_headers(
1984
+ self,
1985
+ idempotency_key: Optional[str] = None) -> Dict[str, str]:
1986
+ """
1987
+ Prepare the headers for API requests.
1988
+
1989
+ Args:
1990
+ idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
1991
+
1992
+ Returns:
1993
+ Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
1994
+ """
1995
+ if idempotency_key:
1996
+ return {
1997
+ 'Content-Type': 'application/json',
1998
+ 'Authorization': f'Bearer {self.api_key}',
1999
+ 'x-idempotency-key': idempotency_key
2000
+ }
2001
+
2002
+ return {
2003
+ 'Content-Type': 'application/json',
2004
+ 'Authorization': f'Bearer {self.api_key}',
2005
+ }
2006
+
2007
+ def _post_request(
2008
+ self,
2009
+ url: str,
2010
+ data: Dict[str, Any],
2011
+ headers: Dict[str, str],
2012
+ retries: int = 3,
2013
+ backoff_factor: float = 0.5) -> requests.Response:
2014
+ """
2015
+ Make a POST request with retries.
2016
+
2017
+ Args:
2018
+ url (str): The URL to send the POST request to.
2019
+ data (Dict[str, Any]): The JSON data to include in the POST request.
2020
+ headers (Dict[str, str]): The headers to include in the POST request.
2021
+ retries (int): Number of retries for the request.
2022
+ backoff_factor (float): Backoff factor for retries.
2023
+
2024
+ Returns:
2025
+ requests.Response: The response from the POST request.
2026
+
2027
+ Raises:
2028
+ requests.RequestException: If the request fails after the specified retries.
2029
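+
+ Note (illustrative): only HTTP 502 responses are retried. With the defaults
+ retries=3 and backoff_factor=0.5, the delays are backoff_factor * 2**attempt,
+ i.e. 0.5s, 1.0s and 2.0s; if every attempt returns 502, the last response is
+ returned rather than an exception being raised.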
+ """
2030
+ for attempt in range(retries):
2031
+ response = requests.post(url, headers=headers, json=data, timeout=(((data["timeout"] + 5000) / 1000) if "timeout" in data else None)) # "timeout" is in milliseconds; requests expects seconds
2032
+ if response.status_code == 502:
2033
+ time.sleep(backoff_factor * (2 ** attempt))
2034
+ else:
2035
+ return response
2036
+ return response
2037
+
2038
+ def _get_request(
2039
+ self,
2040
+ url: str,
2041
+ headers: Dict[str, str],
2042
+ retries: int = 3,
2043
+ backoff_factor: float = 0.5) -> requests.Response:
2044
+ """
2045
+ Make a GET request with retries.
2046
+
2047
+ Args:
2048
+ url (str): The URL to send the GET request to.
2049
+ headers (Dict[str, str]): The headers to include in the GET request.
2050
+ retries (int): Number of retries for the request.
2051
+ backoff_factor (float): Backoff factor for retries.
2052
+
2053
+ Returns:
2054
+ requests.Response: The response from the GET request.
2055
+
2056
+ Raises:
2057
+ requests.RequestException: If the request fails after the specified retries.
2058
+ """
2059
+ for attempt in range(retries):
2060
+ response = requests.get(url, headers=headers)
2061
+ if response.status_code == 502:
2062
+ time.sleep(backoff_factor * (2 ** attempt))
2063
+ else:
2064
+ return response
2065
+ return response
2066
+
2067
+ def _delete_request(
2068
+ self,
2069
+ url: str,
2070
+ headers: Dict[str, str],
2071
+ retries: int = 3,
2072
+ backoff_factor: float = 0.5) -> requests.Response:
2073
+ """
2074
+ Make a DELETE request with retries.
2075
+
2076
+ Args:
2077
+ url (str): The URL to send the DELETE request to.
2078
+ headers (Dict[str, str]): The headers to include in the DELETE request.
2079
+ retries (int): Number of retries for the request.
2080
+ backoff_factor (float): Backoff factor for retries.
2081
+
2082
+ Returns:
2083
+ requests.Response: The response from the DELETE request.
2084
+
2085
+ Raises:
2086
+ requests.RequestException: If the request fails after the specified retries.
2087
+ """
2088
+ for attempt in range(retries):
2089
+ response = requests.delete(url, headers=headers)
2090
+ if response.status_code == 502:
2091
+ time.sleep(backoff_factor * (2 ** attempt))
2092
+ else:
2093
+ return response
2094
+ return response
2095
+
2096
+ def _monitor_job_status(
2097
+ self,
2098
+ id: str,
2099
+ headers: Dict[str, str],
2100
+ poll_interval: int) -> CrawlStatusResponse:
2101
+ """
2102
+ Monitor the status of a crawl job until completion.
2103
+
2104
+ Args:
2105
+ id (str): The ID of the crawl job.
2106
+ headers (Dict[str, str]): The headers to include in the status check requests.
2107
+ poll_interval (int): Seconds between status checks.
2108
+
2109
+ Returns:
2110
+ CrawlStatusResponse: The crawl results if the job is completed successfully.
2111
+
2112
+ Raises:
2113
+ Exception: If the job fails or an error occurs during status checks.
2114
+ """
2115
+ while True:
2116
+ api_url = f'{self.api_url}/v1/crawl/{id}'
2117
+
2118
+ status_response = self._get_request(api_url, headers)
2119
+ if status_response.status_code == 200:
2120
+ try:
2121
+ status_data = status_response.json()
2122
+ except:
2123
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
2124
+ if status_data['status'] == 'completed':
2125
+ if 'data' in status_data:
2126
+ data = status_data['data']
2127
+ while 'next' in status_data:
2128
+ if len(status_data['data']) == 0:
2129
+ break
2130
+ status_response = self._get_request(status_data['next'], headers)
2131
+ try:
2132
+ status_data = status_response.json()
2133
+ except:
2134
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
2135
+ data.extend(status_data.get('data', []))
2136
+ status_data['data'] = data
2137
+ return CrawlStatusResponse(**status_data)
2138
+ else:
2139
+ raise Exception('Crawl job completed but no data was returned')
2140
+ elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
2141
+ poll_interval = max(poll_interval, 2)
2142
+ time.sleep(poll_interval) # Wait for the specified interval before checking again
2143
+ else:
2144
+ raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
2145
+ else:
2146
+ self._handle_error(status_response, 'check crawl status')
2147
+
2148
+ def _handle_error(
2149
+ self,
2150
+ response: requests.Response,
2151
+ action: str) -> None:
2152
+ """
2153
+ Handle errors from API responses.
2154
+
2155
+ Args:
2156
+ response (requests.Response): The response object from the API request.
2157
+ action (str): Description of the action that was being performed.
2158
+
2159
+ Raises:
2160
+ Exception: An exception with a message containing the status code and error details from the response.
2161
+ """
2162
+ try:
2163
+ error_message = response.json().get('error', 'No error message provided.')
2164
+ error_details = response.json().get('details', 'No additional error details provided.')
2165
+ except:
2166
+ raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
2167
+
2168
+ message = self._get_error_message(response.status_code, action, error_message, error_details)
2169
+
2170
+ # Raise an HTTPError with the custom message and attach the response
2171
+ raise requests.exceptions.HTTPError(message, response=response)
2172
+
2173
+ def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2174
+ """
2175
+ Generate a standardized error message based on HTTP status code.
2176
+
2177
+ Args:
2178
+ status_code (int): The HTTP status code from the response
2179
+ action (str): Description of the action that was being performed
2180
+ error_message (str): The error message from the API response
2181
+ error_details (str): Additional error details from the API response
2182
+
2183
+ Returns:
2184
+ str: A formatted error message
2185
+ """
2186
+ if status_code == 402:
2187
+ return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
2188
+ elif status_code == 403:
2189
+ message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
2190
+ elif status_code == 408:
2191
+ return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
2192
+ elif status_code == 409:
2193
+ return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
2194
+ elif status_code == 500:
2195
+ return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
2196
+ else:
2197
+ return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
2198
+
2199
+ def deep_research(
2200
+ self,
2201
+ query: str,
2202
+ *,
2203
+ max_depth: Optional[int] = None,
2204
+ time_limit: Optional[int] = None,
2205
+ max_urls: Optional[int] = None,
2206
+ analysis_prompt: Optional[str] = None,
2207
+ system_prompt: Optional[str] = None,
2208
+ __experimental_stream_steps: Optional[bool] = None,
2209
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
2210
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
2211
+ """
2212
+ Initiates a deep research operation on a given query and polls until completion.
2213
+
2214
+ Args:
2215
+ query (str): Research query or topic to investigate
2216
+ max_depth (Optional[int]): Maximum depth of research exploration
2217
+ time_limit (Optional[int]): Time limit in seconds for research
2218
+ max_urls (Optional[int]): Maximum number of URLs to process
2219
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2220
+ system_prompt (Optional[str]): Custom system prompt
2221
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2222
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
2223
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
2224
+
2225
+ Returns:
2226
+ DeepResearchStatusResponse containing:
2227
+ * success (bool): Whether research completed successfully
2228
+ * status (str): Current state (processing/completed/failed)
2229
+ * error (Optional[str]): Error message if failed
2230
+ * id (str): Unique identifier for the research job
2231
+ * data (Any): Research findings and analysis
2232
+ * sources (List[Dict]): List of discovered sources
2233
+ * activities (List[Dict]): Research progress log
2234
+ * summaries (List[str]): Generated research summaries
2235
+
2236
+ Raises:
2237
+ Exception: If research fails
2238
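+
+ Example (illustrative sketch; assumes `app` is a configured FirecrawlApp. The polling
+ helper returns the raw status payload, so dictionary access is used here):
+
+ def log_activity(activity):
+     print(f"[{activity['type']}] {activity['message']}")
+
+ results = app.deep_research(
+     "How are open-source LLMs evaluated?",
+     max_depth=2,
+     time_limit=120,
+     on_activity=log_activity,
+ )
+ if results.get("success"):
+     print(len(results.get("sources", [])), "sources analysed")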
+ """
2239
+ research_params = {}
2240
+ if max_depth is not None:
2241
+ research_params['maxDepth'] = max_depth
2242
+ if time_limit is not None:
2243
+ research_params['timeLimit'] = time_limit
2244
+ if max_urls is not None:
2245
+ research_params['maxUrls'] = max_urls
2246
+ if analysis_prompt is not None:
2247
+ research_params['analysisPrompt'] = analysis_prompt
2248
+ if system_prompt is not None:
2249
+ research_params['systemPrompt'] = system_prompt
2250
+ if __experimental_stream_steps is not None:
2251
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2252
+ research_params = DeepResearchParams(**research_params)
2253
+
2254
+ response = self.async_deep_research(
2255
+ query,
2256
+ max_depth=max_depth,
2257
+ time_limit=time_limit,
2258
+ max_urls=max_urls,
2259
+ analysis_prompt=analysis_prompt,
2260
+ system_prompt=system_prompt
2261
+ )
2262
+ if not response.get('success') or 'id' not in response:
2263
+ return response
2264
+
2265
+ job_id = response['id']
2266
+ last_activity_count = 0
2267
+ last_source_count = 0
2268
+
2269
+ while True:
2270
+ status = self.check_deep_research_status(job_id)
2271
+
2272
+ if on_activity and 'activities' in status:
2273
+ new_activities = status['activities'][last_activity_count:]
2274
+ for activity in new_activities:
2275
+ on_activity(activity)
2276
+ last_activity_count = len(status['activities'])
2277
+
2278
+ if on_source and 'sources' in status:
2279
+ new_sources = status['sources'][last_source_count:]
2280
+ for source in new_sources:
2281
+ on_source(source)
2282
+ last_source_count = len(status['sources'])
2283
+
2284
+ if status['status'] == 'completed':
2285
+ return status
2286
+ elif status['status'] == 'failed':
2287
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
2288
+ elif status['status'] != 'processing':
2289
+ break
2290
+
2291
+ time.sleep(2) # Polling interval
2292
+
2293
+ return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
2294
+
2295
+ def async_deep_research(
2296
+ self,
2297
+ query: str,
2298
+ *,
2299
+ max_depth: Optional[int] = None,
2300
+ time_limit: Optional[int] = None,
2301
+ max_urls: Optional[int] = None,
2302
+ analysis_prompt: Optional[str] = None,
2303
+ system_prompt: Optional[str] = None,
2304
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
2305
+ """
2306
+ Initiates an asynchronous deep research operation.
2307
+
2308
+ Args:
2309
+ query (str): Research query or topic to investigate
2310
+ max_depth (Optional[int]): Maximum depth of research exploration
2311
+ time_limit (Optional[int]): Time limit in seconds for research
2312
+ max_urls (Optional[int]): Maximum number of URLs to process
2313
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2314
+ system_prompt (Optional[str]): Custom system prompt
2315
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2316
+
2317
+ Returns:
2318
+ Dict[str, Any]: A response containing:
2319
+ * success (bool): Whether the research initiation was successful
2320
+ * id (str): The unique identifier for the research job
2321
+ * error (str, optional): Error message if initiation failed
2322
+
2323
+ Raises:
2324
+ Exception: If the research initiation fails.
2325
+ """
2326
+ research_params = {}
2327
+ if max_depth is not None:
2328
+ research_params['maxDepth'] = max_depth
2329
+ if time_limit is not None:
2330
+ research_params['timeLimit'] = time_limit
2331
+ if max_urls is not None:
2332
+ research_params['maxUrls'] = max_urls
2333
+ if analysis_prompt is not None:
2334
+ research_params['analysisPrompt'] = analysis_prompt
2335
+ if system_prompt is not None:
2336
+ research_params['systemPrompt'] = system_prompt
2337
+ if __experimental_stream_steps is not None:
2338
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2339
+ research_params = DeepResearchParams(**research_params)
2340
+
2341
+ headers = self._prepare_headers()
2342
+
2343
+ json_data = {'query': query, **research_params.dict(exclude_none=True)}
2344
+ json_data['origin'] = f"python-sdk@{version}"
2345
+
2346
+ # Handle json options schema if present
2347
+ if 'jsonOptions' in json_data:
2348
+ json_opts = json_data['jsonOptions']
2349
+ if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
2350
+ json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
2351
+
2352
+ try:
2353
+ response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
2354
+ if response.status_code == 200:
2355
+ try:
2356
+ return response.json()
2357
+ except:
2358
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2359
+ else:
2360
+ self._handle_error(response, 'start deep research')
2361
+ except Exception as e:
2362
+ raise ValueError(str(e))
2363
+
2364
+ return {'success': False, 'error': 'Internal server error'}
2365
+
2366
+ def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
2367
+ """
2368
+ Check the status of a deep research operation.
2369
+
2370
+ Args:
2371
+ id (str): The ID of the deep research operation.
2372
+
2373
+ Returns:
2374
+ DeepResearchStatusResponse containing:
2375
+
2376
+ Status:
2377
+ * success - Whether research completed successfully
2378
+ * status - Current state (processing/completed/failed)
2379
+ * error - Error message if failed
2380
+
2381
+ Results:
2382
+ * id - Unique identifier for the research job
2383
+ * data - Research findings and analysis
2384
+ * sources - List of discovered sources
2385
+ * activities - Research progress log
2386
+ * summaries - Generated research summaries
2387
+
2388
+ Raises:
2389
+ Exception: If the status check fails.
2390
+ """
2391
+ headers = self._prepare_headers()
2392
+ try:
2393
+ response = self._get_request(f'{self.api_url}/v1/deep-research/{id}', headers)
2394
+ if response.status_code == 200:
2395
+ try:
2396
+ return response.json()
2397
+ except:
2398
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2399
+ elif response.status_code == 404:
2400
+ raise Exception('Deep research job not found')
2401
+ else:
2402
+ self._handle_error(response, 'check deep research status')
2403
+ except Exception as e:
2404
+ raise ValueError(str(e))
2405
+
2406
+ return {'success': False, 'error': 'Internal server error'}
2407
+
2408
+ def _validate_kwargs(self, kwargs: Dict[str, Any], method_name: str) -> None:
2409
+ """
2410
+ Validate additional keyword arguments before they are passed to the API.
2411
+ This provides early validation before the Pydantic model validation.
2412
+
2413
+ Args:
2414
+ kwargs (Dict[str, Any]): Additional keyword arguments to validate
2415
+ method_name (str): Name of the method these kwargs are for
2416
+
2417
+ Raises:
2418
+ ValueError: If kwargs contain invalid or unsupported parameters
2419
+ """
2420
+ if not kwargs:
2421
+ return
2422
+
2423
+ # Known parameter mappings for each method
2424
+ method_params = {
2425
+ "scrape_url": {"formats", "include_tags", "exclude_tags", "only_main_content", "wait_for",
2426
+ "timeout", "location", "mobile", "skip_tls_verification", "remove_base64_images",
2427
+ "block_ads", "proxy", "extract", "json_options", "actions"},
2428
+ "search": {"limit", "tbs", "filter", "lang", "country", "location", "timeout", "scrape_options"},
2429
+ "crawl_url": {"include_paths", "exclude_paths", "max_depth", "max_discovery_depth", "limit",
2430
+ "allow_backward_links", "allow_external_links", "ignore_sitemap", "scrape_options",
2431
+ "webhook", "deduplicate_similar_urls", "ignore_query_parameters", "regex_on_full_url"},
2432
+ "map_url": {"search", "ignore_sitemap", "include_subdomains", "sitemap_only", "limit", "timeout"},
2433
+ "batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2434
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2435
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2436
+ "actions", "agent", "webhook"},
2437
+ "async_batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2438
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2439
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2440
+ "actions", "agent", "webhook"},
2441
+ "batch_scrape_urls_and_watch": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2442
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2443
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2444
+ "actions", "agent", "webhook"}
2445
+ }
2446
+
2447
+ # Get allowed parameters for this method
2448
+ allowed_params = method_params.get(method_name, set())
2449
+
2450
+ # Check for unknown parameters
2451
+ unknown_params = set(kwargs.keys()) - allowed_params
2452
+ if unknown_params:
2453
+ raise ValueError(f"Unsupported parameter(s) for {method_name}: {', '.join(unknown_params)}. Please refer to the API documentation for the correct parameters.")
2454
+
2455
+ # Additional type validation can be added here if needed
2456
+ # For now, we rely on Pydantic models for detailed type validation
2457
+
2458
+ class CrawlWatcher:
2459
+ """
2460
+ A class to watch and handle crawl job events via WebSocket connection.
2461
+
2462
+ Attributes:
2463
+ id (str): The ID of the crawl job to watch
2464
+ app (FirecrawlApp): The FirecrawlApp instance
2465
+ data (List[Dict[str, Any]]): List of crawled documents/data
2466
+ status (str): Current status of the crawl job
2467
+ ws_url (str): WebSocket URL for the crawl job
2468
+ event_handlers (dict): Dictionary of event type to list of handler functions
2469
+ """
2470
+ def __init__(self, id: str, app: FirecrawlApp):
2471
+ self.id = id
2472
+ self.app = app
2473
+ self.data: List[Dict[str, Any]] = []
2474
+ self.status = "scraping"
2475
+ self.ws_url = f"{app.api_url.replace('http', 'ws')}/v1/crawl/{id}"
2476
+ self.event_handlers = {
2477
+ 'done': [],
2478
+ 'error': [],
2479
+ 'document': []
2480
+ }
2481
+
2482
+ async def connect(self) -> None:
2483
+ """
2484
+ Establishes WebSocket connection and starts listening for messages.
2485
+ """
2486
+ async with websockets.connect(
2487
+ self.ws_url,
2488
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
2489
+ ) as websocket:
2490
+ await self._listen(websocket)
2491
+
2492
+ async def _listen(self, websocket) -> None:
2493
+ """
2494
+ Listens for incoming WebSocket messages and handles them.
2495
+
2496
+ Args:
2497
+ websocket: The WebSocket connection object
2498
+ """
2499
+ async for message in websocket:
2500
+ msg = json.loads(message)
2501
+ await self._handle_message(msg)
2502
+
2503
+ def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
2504
+ """
2505
+ Adds an event handler function for a specific event type.
2506
+
2507
+ Args:
2508
+ event_type (str): Type of event to listen for ('done', 'error', or 'document')
2509
+ handler (Callable): Function to handle the event
2510
+ """
2511
+ if event_type in self.event_handlers:
2512
+ self.event_handlers[event_type].append(handler)
2513
+
2514
+ def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
2515
+ """
2516
+ Dispatches an event to all registered handlers for that event type.
2517
+
2518
+ Args:
2519
+ event_type (str): Type of event to dispatch
2520
+ detail (Dict[str, Any]): Event details/data to pass to handlers
2521
+ """
2522
+ if event_type in self.event_handlers:
2523
+ for handler in self.event_handlers[event_type]:
2524
+ handler(detail)
2525
+
2526
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
2527
+ """
2528
+ Handles incoming WebSocket messages based on their type.
2529
+
2530
+ Args:
2531
+ msg (Dict[str, Any]): The message to handle
2532
+ """
2533
+ if msg['type'] == 'done':
2534
+ self.status = 'completed'
2535
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
2536
+ elif msg['type'] == 'error':
2537
+ self.status = 'failed'
2538
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
2539
+ elif msg['type'] == 'catchup':
2540
+ self.status = msg['data']['status']
2541
+ self.data.extend(msg['data'].get('data', []))
2542
+ for doc in self.data:
2543
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
2544
+ elif msg['type'] == 'document':
2545
+ self.data.append(msg['data'])
2546
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
2547
+
2548
+ class AsyncFirecrawlApp(FirecrawlApp):
2549
+ """
2550
+ Asynchronous version of FirecrawlApp that implements async methods using aiohttp.
2551
+ Provides non-blocking alternatives to all FirecrawlApp operations.
2552
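+
+ Example (illustrative sketch; assumes the FIRECRAWL_API_KEY environment variable is set
+ and that the markdown format is returned as an attribute, as in the synchronous client):
+
+ import asyncio
+
+ async def main():
+     app = AsyncFirecrawlApp()
+     doc = await app.scrape_url("https://example.com", formats=["markdown"])
+     print((doc.markdown or "")[:200])
+
+ asyncio.run(main())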
+ """
2553
+
2554
+ async def _async_request(
2555
+ self,
2556
+ method: str,
2557
+ url: str,
2558
+ headers: Dict[str, str],
2559
+ data: Optional[Dict[str, Any]] = None,
2560
+ retries: int = 3,
2561
+ backoff_factor: float = 0.5) -> Dict[str, Any]:
2562
+ """
2563
+ Generic async request method with exponential backoff retry logic.
2564
+
2565
+ Args:
2566
+ method (str): The HTTP method to use (e.g., "GET" or "POST").
2567
+ url (str): The URL to send the request to.
2568
+ headers (Dict[str, str]): Headers to include in the request.
2569
+ data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
2570
+ retries (int): Maximum number of retry attempts (default: 3).
2571
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2572
+ Delay will be backoff_factor * (2 ** retry_count).
2573
+
2574
+ Returns:
2575
+ Dict[str, Any]: The parsed JSON response from the server.
2576
+
2577
+ Raises:
2578
+ aiohttp.ClientError: If the request fails after all retries.
2579
+ Exception: If max retries are exceeded or other errors occur.
2580
+ """
2581
+ async with aiohttp.ClientSession() as session:
2582
+ for attempt in range(retries):
2583
+ try:
2584
+ async with session.request(
2585
+ method=method, url=url, headers=headers, json=data
2586
+ ) as response:
2587
+ if response.status == 502:
2588
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2589
+ continue
2590
+ if response.status >= 300:
2591
+ await self._handle_error(response, f"make {method} request")
2592
+ return await response.json()
2593
+ except aiohttp.ClientError as e:
2594
+ if attempt == retries - 1:
2595
+ raise e
2596
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2597
+ raise Exception("Max retries exceeded")
2598
+
2599
+ async def _async_post_request(
2600
+ self, url: str, data: Dict[str, Any], headers: Dict[str, str],
2601
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2602
+ """
2603
+ Make an async POST request with exponential backoff retry logic.
2604
+
2605
+ Args:
2606
+ url (str): The URL to send the POST request to.
2607
+ data (Dict[str, Any]): The JSON data to include in the request body.
2608
+ headers (Dict[str, str]): Headers to include in the request.
2609
+ retries (int): Maximum number of retry attempts (default: 3).
2610
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2611
+ Delay will be backoff_factor * (2 ** retry_count).
2612
+
2613
+ Returns:
2614
+ Dict[str, Any]: The parsed JSON response from the server.
2615
+
2616
+ Raises:
2617
+ aiohttp.ClientError: If the request fails after all retries.
2618
+ Exception: If max retries are exceeded or other errors occur.
2619
+ """
2620
+ return await self._async_request("POST", url, headers, data, retries, backoff_factor)
2621
+
2622
+ async def _async_get_request(
2623
+ self, url: str, headers: Dict[str, str],
2624
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2625
+ """
2626
+ Make an async GET request with exponential backoff retry logic.
2627
+
2628
+ Args:
2629
+ url (str): The URL to send the GET request to.
2630
+ headers (Dict[str, str]): Headers to include in the request.
2631
+ retries (int): Maximum number of retry attempts (default: 3).
2632
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2633
+ Delay will be backoff_factor * (2 ** retry_count).
2634
+
2635
+ Returns:
2636
+ Dict[str, Any]: The parsed JSON response from the server.
2637
+
2638
+ Raises:
2639
+ aiohttp.ClientError: If the request fails after all retries.
2640
+ Exception: If max retries are exceeded or other errors occur.
2641
+ """
2642
+ return await self._async_request("GET", url, headers, None, retries, backoff_factor)
2643
+
2644
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
2645
+ """
2646
+ Handle errors from async API responses with detailed error messages.
2647
+
2648
+ Args:
2649
+ response (aiohttp.ClientResponse): The response object from the failed request
2650
+ action (str): Description of the action that was being attempted
2651
+
2652
+ Raises:
2653
+ aiohttp.ClientError: With a detailed error message based on the response status:
2654
+ - 402: Payment Required
2655
+ - 408: Request Timeout
2656
+ - 409: Conflict
2657
+ - 500: Internal Server Error
2658
+ - Other: Unexpected error with status code
2659
+ """
2660
+ try:
2661
+ error_data = await response.json()
2662
+ error_message = error_data.get('error', 'No error message provided.')
2663
+ error_details = error_data.get('details', 'No additional error details provided.')
2664
+ except:
2665
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
2666
+
2667
+ message = await self._get_async_error_message(response.status, action, error_message, error_details)
2668
+
2669
+ raise aiohttp.ClientError(message)
2670
+
2671
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2672
+ """
2673
+ Generate a standardized error message based on HTTP status code for async operations.
2674
+
2675
+ Args:
2676
+ status_code (int): The HTTP status code from the response
2677
+ action (str): Description of the action that was being performed
2678
+ error_message (str): The error message from the API response
2679
+ error_details (str): Additional error details from the API response
2680
+
2681
+ Returns:
2682
+ str: A formatted error message
2683
+ """
2684
+ return self._get_error_message(status_code, action, error_message, error_details)
2685
+
2686
+ async def crawl_url_and_watch(
2687
+ self,
2688
+ url: str,
2689
+ params: Optional[CrawlParams] = None,
2690
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2691
+ """
2692
+ Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket.
2693
+
2694
+ Args:
2695
+ url (str): Target URL to start crawling from
2696
+ params (Optional[CrawlParams]): See CrawlParams model for configuration:
2697
+ URL Discovery:
2698
+ * includePaths - Patterns of URLs to include
2699
+ * excludePaths - Patterns of URLs to exclude
2700
+ * maxDepth - Maximum crawl depth
2701
+ * maxDiscoveryDepth - Maximum depth for finding new URLs
2702
+ * limit - Maximum pages to crawl
2703
+
2704
+ Link Following:
2705
+ * allowBackwardLinks - Follow parent directory links
2706
+ * allowExternalLinks - Follow external domain links
2707
+ * ignoreSitemap - Skip sitemap.xml processing
2708
+
2709
+ Advanced:
2710
+ * scrapeOptions - Page scraping configuration
2711
+ * webhook - Notification webhook settings
2712
+ * deduplicateSimilarURLs - Remove similar URLs
2713
+ * ignoreQueryParameters - Ignore URL parameters
2714
+ * regexOnFullURL - Apply regex to full URLs
2715
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2716
+
2717
+ Returns:
2718
+ AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket
2719
+
2720
+ Raises:
2721
+ Exception: If crawl job fails to start
2722
+ """
2723
+ crawl_response = await self.async_crawl_url(url, params, idempotency_key)
2724
+ if crawl_response.get('success') and 'id' in crawl_response:
2725
+ return AsyncCrawlWatcher(crawl_response['id'], self)
2726
+ else:
2727
+ raise Exception("Crawl job failed to start")
2728
+
2729
+ async def batch_scrape_urls_and_watch(
2730
+ self,
2731
+ urls: List[str],
2732
+ params: Optional[ScrapeParams] = None,
2733
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2734
+ """
2735
+ Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress.
2736
+
2737
+ Args:
2738
+ urls (List[str]): List of URLs to scrape
2739
+ params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
2740
+
2741
+ Content Options:
2742
+ * formats - Content formats to retrieve
2743
+ * includeTags - HTML tags to include
2744
+ * excludeTags - HTML tags to exclude
2745
+ * onlyMainContent - Extract main content only
2746
+
2747
+ Request Options:
2748
+ * headers - Custom HTTP headers
2749
+ * timeout - Request timeout (ms)
2750
+ * mobile - Use mobile user agent
2751
+ * proxy - Proxy type
2752
+
2753
+ Extraction Options:
2754
+ * extract - Content extraction config
2755
+ * jsonOptions - JSON extraction config
2756
+ * actions - Actions to perform
2757
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2758
+
2759
+ Returns:
2760
+ AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket
2761
+
2762
+ Raises:
2763
+ Exception: If batch scrape job fails to start
2764
+ """
2765
+ batch_response = await self.async_batch_scrape_urls(urls, params, idempotency_key)
2766
+ if batch_response.get('success') and 'id' in batch_response:
2767
+ return AsyncCrawlWatcher(batch_response['id'], self)
2768
+ else:
2769
+ raise Exception("Batch scrape job failed to start")
2770
+
2771
+ async def scrape_url(
2772
+ self,
2773
+ url: str,
2774
+ *,
2775
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
2776
+ include_tags: Optional[List[str]] = None,
2777
+ exclude_tags: Optional[List[str]] = None,
2778
+ only_main_content: Optional[bool] = None,
2779
+ wait_for: Optional[int] = None,
2780
+ timeout: Optional[int] = None,
2781
+ location: Optional[LocationConfig] = None,
2782
+ mobile: Optional[bool] = None,
2783
+ skip_tls_verification: Optional[bool] = None,
2784
+ remove_base64_images: Optional[bool] = None,
2785
+ block_ads: Optional[bool] = None,
2786
+ proxy: Optional[Literal["basic", "stealth"]] = None,
2787
+ extract: Optional[JsonConfig] = None,
2788
+ json_options: Optional[JsonConfig] = None,
2789
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2790
+ **kwargs) -> ScrapeResponse[Any]:
2791
+ """
2792
+ Scrape a single URL asynchronously.
2793
+
2794
+ Args:
2795
+ url (str): Target URL to scrape
2796
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
2797
+ include_tags (Optional[List[str]]): HTML tags to include
2798
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
2799
+ only_main_content (Optional[bool]): Extract main content only
2800
+ wait_for (Optional[int]): Time in milliseconds to wait before fetching content
2801
+ timeout (Optional[int]): Request timeout (ms)
2802
+ location (Optional[LocationConfig]): Location configuration
2803
+ mobile (Optional[bool]): Use mobile user agent
2804
+ skip_tls_verification (Optional[bool]): Skip TLS verification
2805
+ remove_base64_images (Optional[bool]): Remove base64 images
2806
+ block_ads (Optional[bool]): Block ads
2807
+ proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
2808
+ extract (Optional[JsonConfig]): Content extraction settings
2809
+ json_options (Optional[JsonConfig]): JSON extraction settings
2810
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
2811
+ **kwargs: Additional parameters to pass to the API
2812
+
2813
+ Returns:
2814
+ ScrapeResponse with:
2815
+ * success - Whether scrape was successful
2816
+ * markdown - Markdown content if requested
2817
+ * html - HTML content if requested
2818
+ * rawHtml - Raw HTML content if requested
2819
+ * links - Extracted links if requested
2820
+ * screenshot - Screenshot if requested
2821
+ * extract - Extracted data if requested
2822
+ * json - JSON data if requested
2823
+ * error - Error message if scrape failed
2824
+
2825
+ Raises:
2826
+ Exception: If scraping fails
2827
+ """
2828
+ # Validate any additional kwargs
2829
+ self._validate_kwargs(kwargs, "scrape_url")
2830
+
2831
+ headers = self._prepare_headers()
2832
+
2833
+ # Build scrape parameters
2834
+ scrape_params = {
2835
+ 'url': url,
2836
+ 'origin': f"python-sdk@{version}"
2837
+ }
2838
+
2839
+ # Add optional parameters if provided and not None
2840
+ if formats:
2841
+ scrape_params['formats'] = formats
2842
+ if include_tags:
2843
+ scrape_params['includeTags'] = include_tags
2844
+ if exclude_tags:
2845
+ scrape_params['excludeTags'] = exclude_tags
2846
+ if only_main_content is not None:
2847
+ scrape_params['onlyMainContent'] = only_main_content
2848
+ if wait_for:
2849
+ scrape_params['waitFor'] = wait_for
2850
+ if timeout:
2851
+ scrape_params['timeout'] = timeout
2852
+ if location:
2853
+ scrape_params['location'] = location.dict(exclude_none=True)
2854
+ if mobile is not None:
2855
+ scrape_params['mobile'] = mobile
2856
+ if skip_tls_verification is not None:
2857
+ scrape_params['skipTlsVerification'] = skip_tls_verification
2858
+ if remove_base64_images is not None:
2859
+ scrape_params['removeBase64Images'] = remove_base64_images
2860
+ if block_ads is not None:
2861
+ scrape_params['blockAds'] = block_ads
2862
+ if proxy:
2863
+ scrape_params['proxy'] = proxy
2864
+ if extract:
2865
+ extract_dict = extract.dict(exclude_none=True)
2866
+ if 'schema' in extract_dict and hasattr(extract.schema, 'schema'):
2867
+ extract_dict['schema'] = extract.schema.schema() # Ensure pydantic model schema is converted
2868
+ scrape_params['extract'] = extract_dict
2869
+ if json_options:
2870
+ json_options_dict = json_options.dict(exclude_none=True)
2871
+ if 'schema' in json_options_dict and hasattr(json_options.schema, 'schema'):
2872
+ json_options_dict['schema'] = json_options.schema.schema() # Ensure pydantic model schema is converted
2873
+ scrape_params['jsonOptions'] = json_options_dict
2874
+ if actions:
2875
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
2876
+
2877
+ # Make async request
2878
+ endpoint = f'/v1/scrape'
2879
+ response = await self._async_post_request(
2880
+ f'{self.api_url}{endpoint}',
2881
+ scrape_params,
2882
+ headers
2883
+ )
2884
+
2885
+ if response.get('success') and 'data' in response:
2886
+ return ScrapeResponse(**response['data'])
2887
+ elif "error" in response:
2888
+ raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
2889
+ else:
2890
+ # Use the response content directly if possible, otherwise a generic message
2891
+ error_content = response.get('error', str(response))
2892
+ raise Exception(f'Failed to scrape URL. Error: {error_content}')
2893
+
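A minimal usage sketch for the scrape_url method above, assuming AsyncFirecrawlApp is importable from the firecrawl package and that its constructor accepts an api_key, as the synchronous client does; the URL and key below are placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    # Request markdown plus the page's outgoing links; timeout is in milliseconds
    doc = await app.scrape_url(
        "https://example.com",
        formats=["markdown", "links"],
        only_main_content=True,
        timeout=30000,
    )
    print(doc.markdown)

asyncio.run(main())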
2894
+ async def batch_scrape_urls(
2895
+ self,
2896
+ urls: List[str],
2897
+ *,
2898
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
2899
+ headers: Optional[Dict[str, str]] = None,
2900
+ include_tags: Optional[List[str]] = None,
2901
+ exclude_tags: Optional[List[str]] = None,
2902
+ only_main_content: Optional[bool] = None,
2903
+ wait_for: Optional[int] = None,
2904
+ timeout: Optional[int] = None,
2905
+ location: Optional[LocationConfig] = None,
2906
+ mobile: Optional[bool] = None,
2907
+ skip_tls_verification: Optional[bool] = None,
2908
+ remove_base64_images: Optional[bool] = None,
2909
+ block_ads: Optional[bool] = None,
2910
+ proxy: Optional[Literal["basic", "stealth"]] = None,
2911
+ extract: Optional[JsonConfig] = None,
2912
+ json_options: Optional[JsonConfig] = None,
2913
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2914
+ agent: Optional[AgentOptions] = None,
2915
+ poll_interval: Optional[int] = 2,
2916
+ idempotency_key: Optional[str] = None,
2917
+ **kwargs
2918
+ ) -> BatchScrapeStatusResponse:
2919
+ """
2920
+ Asynchronously scrape multiple URLs and monitor until completion.
2921
+
2922
+ Args:
2923
+ urls (List[str]): URLs to scrape
2924
+ formats (Optional[List[Literal]]): Content formats to retrieve
2925
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
2926
+ include_tags (Optional[List[str]]): HTML tags to include
2927
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
2928
+ only_main_content (Optional[bool]): Extract main content only
2929
+ wait_for (Optional[int]): Wait time in milliseconds
2930
+ timeout (Optional[int]): Request timeout in milliseconds
2931
+ location (Optional[LocationConfig]): Location configuration
2932
+ mobile (Optional[bool]): Use mobile user agent
2933
+ skip_tls_verification (Optional[bool]): Skip TLS verification
2934
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
2935
+ block_ads (Optional[bool]): Block advertisements
2936
+ proxy (Optional[Literal]): Proxy type to use
2937
+ extract (Optional[JsonConfig]): Content extraction config
2938
+ json_options (Optional[JsonConfig]): JSON extraction config
2939
+ actions (Optional[List[Union]]): Actions to perform
2940
+ agent (Optional[AgentOptions]): Agent configuration
2941
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
2942
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2943
+ **kwargs: Additional parameters to pass to the API
2944
+
2945
+ Returns:
2946
+ BatchScrapeStatusResponse with:
2947
+ * Scraping status and progress
2948
+ * Scraped content for each URL
2949
+ * Success/error information
2950
+
2951
+ Raises:
2952
+ Exception: If batch scrape fails
2953
+ """
2954
+ # Validate any additional kwargs
2955
+ self._validate_kwargs(kwargs, "batch_scrape_urls")
2956
+
2957
+ scrape_params = {}
2958
+
2959
+ # Add individual parameters
2960
+ if formats is not None:
2961
+ scrape_params['formats'] = formats
2962
+ if headers is not None:
2963
+ scrape_params['headers'] = headers
2964
+ if include_tags is not None:
2965
+ scrape_params['includeTags'] = include_tags
2966
+ if exclude_tags is not None:
2967
+ scrape_params['excludeTags'] = exclude_tags
2968
+ if only_main_content is not None:
2969
+ scrape_params['onlyMainContent'] = only_main_content
2970
+ if wait_for is not None:
2971
+ scrape_params['waitFor'] = wait_for
2972
+ if timeout is not None:
2973
+ scrape_params['timeout'] = timeout
2974
+ if location is not None:
2975
+ scrape_params['location'] = location.dict(exclude_none=True)
2976
+ if mobile is not None:
2977
+ scrape_params['mobile'] = mobile
2978
+ if skip_tls_verification is not None:
2979
+ scrape_params['skipTlsVerification'] = skip_tls_verification
2980
+ if remove_base64_images is not None:
2981
+ scrape_params['removeBase64Images'] = remove_base64_images
2982
+ if block_ads is not None:
2983
+ scrape_params['blockAds'] = block_ads
2984
+ if proxy is not None:
2985
+ scrape_params['proxy'] = proxy
2986
+ if extract is not None:
2987
+ if hasattr(extract.schema, 'schema'):
2988
+ extract.schema = extract.schema.schema()
2989
+ scrape_params['extract'] = extract.dict(exclude_none=True)
2990
+ if json_options is not None:
2991
+ if hasattr(json_options.schema, 'schema'):
2992
+ json_options.schema = json_options.schema.schema()
2993
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
2994
+ if actions is not None:
2995
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
2996
+ if agent is not None:
2997
+ scrape_params['agent'] = agent.dict(exclude_none=True)
2998
+
2999
+ # Add any additional kwargs
3000
+ scrape_params.update(kwargs)
3001
+
3002
+ # Create final params object
3003
+ final_params = ScrapeParams(**scrape_params)
3004
+ params_dict = final_params.dict(exclude_none=True)
3005
+ params_dict['urls'] = urls
3006
+ params_dict['origin'] = f"python-sdk@{version}"
3007
+
3008
+ # Make request
3009
+ headers = self._prepare_headers(idempotency_key)
3010
+ response = await self._async_post_request(
3011
+ f'{self.api_url}/v1/batch/scrape',
3012
+ params_dict,
3013
+ headers
3014
+ )
3015
+
3016
+ if response.get('success'):
3017
+ id = response.get('id')
3018
+ if not id:
3019
+ raise Exception('Batch scrape job ID missing from Firecrawl response.')
3021
+ return await self._async_monitor_job_status(id, headers, poll_interval)
3022
+ else:
3023
+ self._handle_error(response, 'start batch scrape job')
3024
+
3025
+
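A sketch of the waiting batch scraper above: batch_scrape_urls starts the job and keeps polling its status until it finishes, so the call only returns once every URL is done. The URLs and key are placeholders; imports assume the same package layout as the previous sketch.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    job = await app.batch_scrape_urls(
        ["https://example.com", "https://example.org"],  # placeholder URLs
        formats=["markdown"],
        poll_interval=2,  # seconds between status checks
    )
    print(job.status, f"{job.completed}/{job.total} pages")
    for doc in job.data or []:
        print(doc)

asyncio.run(main())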
3026
+ async def async_batch_scrape_urls(
3027
+ self,
3028
+ urls: List[str],
3029
+ *,
3030
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3031
+ headers: Optional[Dict[str, str]] = None,
3032
+ include_tags: Optional[List[str]] = None,
3033
+ exclude_tags: Optional[List[str]] = None,
3034
+ only_main_content: Optional[bool] = None,
3035
+ wait_for: Optional[int] = None,
3036
+ timeout: Optional[int] = None,
3037
+ location: Optional[LocationConfig] = None,
3038
+ mobile: Optional[bool] = None,
3039
+ skip_tls_verification: Optional[bool] = None,
3040
+ remove_base64_images: Optional[bool] = None,
3041
+ block_ads: Optional[bool] = None,
3042
+ proxy: Optional[Literal["basic", "stealth"]] = None,
3043
+ extract: Optional[JsonConfig] = None,
3044
+ json_options: Optional[JsonConfig] = None,
3045
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
3046
+ agent: Optional[AgentOptions] = None,
3047
+ idempotency_key: Optional[str] = None,
3048
+ **kwargs
3049
+ ) -> BatchScrapeResponse:
3050
+ """
3051
+ Initiate a batch scrape job asynchronously.
3052
+
3053
+ Args:
3054
+ urls (List[str]): URLs to scrape
3055
+ formats (Optional[List[Literal]]): Content formats to retrieve
3056
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
3057
+ include_tags (Optional[List[str]]): HTML tags to include
3058
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
3059
+ only_main_content (Optional[bool]): Extract main content only
3060
+ wait_for (Optional[int]): Wait time in milliseconds
3061
+ timeout (Optional[int]): Request timeout in milliseconds
3062
+ location (Optional[LocationConfig]): Location configuration
3063
+ mobile (Optional[bool]): Use mobile user agent
3064
+ skip_tls_verification (Optional[bool]): Skip TLS verification
3065
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
3066
+ block_ads (Optional[bool]): Block advertisements
3067
+ proxy (Optional[Literal]): Proxy type to use
3068
+ extract (Optional[JsonConfig]): Content extraction config
3069
+ json_options (Optional[JsonConfig]): JSON extraction config
3070
+ actions (Optional[List[Union]]): Actions to perform
3071
+ agent (Optional[AgentOptions]): Agent configuration
3072
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3073
+ **kwargs: Additional parameters to pass to the API
3074
+
3075
+ Returns:
3076
+ BatchScrapeResponse with:
3077
+ * success - Whether job started successfully
3078
+ * id - Unique identifier for the job
3079
+ * url - Status check URL
3080
+ * error - Error message if start failed
3081
+
3082
+ Raises:
3083
+ Exception: If job initiation fails
3084
+ """
3085
+ # Validate any additional kwargs
3086
+ self._validate_kwargs(kwargs, "async_batch_scrape_urls")
3087
+
3088
+ scrape_params = {}
3089
+
3090
+ # Add individual parameters
3091
+ if formats is not None:
3092
+ scrape_params['formats'] = formats
3093
+ if headers is not None:
3094
+ scrape_params['headers'] = headers
3095
+ if include_tags is not None:
3096
+ scrape_params['includeTags'] = include_tags
3097
+ if exclude_tags is not None:
3098
+ scrape_params['excludeTags'] = exclude_tags
3099
+ if only_main_content is not None:
3100
+ scrape_params['onlyMainContent'] = only_main_content
3101
+ if wait_for is not None:
3102
+ scrape_params['waitFor'] = wait_for
3103
+ if timeout is not None:
3104
+ scrape_params['timeout'] = timeout
3105
+ if location is not None:
3106
+ scrape_params['location'] = location.dict(exclude_none=True)
3107
+ if mobile is not None:
3108
+ scrape_params['mobile'] = mobile
3109
+ if skip_tls_verification is not None:
3110
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3111
+ if remove_base64_images is not None:
3112
+ scrape_params['removeBase64Images'] = remove_base64_images
3113
+ if block_ads is not None:
3114
+ scrape_params['blockAds'] = block_ads
3115
+ if proxy is not None:
3116
+ scrape_params['proxy'] = proxy
3117
+ if extract is not None:
3118
+ if hasattr(extract.schema, 'schema'):
3119
+ extract.schema = extract.schema.schema()
3120
+ scrape_params['extract'] = extract.dict(exclude_none=True)
3121
+ if json_options is not None:
3122
+ if hasattr(json_options.schema, 'schema'):
3123
+ json_options.schema = json_options.schema.schema()
3124
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
3125
+ if actions is not None:
3126
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3127
+ if agent is not None:
3128
+ scrape_params['agent'] = agent.dict(exclude_none=True)
3129
+
3130
+ # Add any additional kwargs
3131
+ scrape_params.update(kwargs)
3132
+
3133
+ # Create final params object
3134
+ final_params = ScrapeParams(**scrape_params)
3135
+ params_dict = final_params.dict(exclude_none=True)
3136
+ params_dict['urls'] = urls
3137
+ params_dict['origin'] = f"python-sdk@{version}"
3138
+
3139
+ # Make request
3140
+ headers = self._prepare_headers(idempotency_key)
3141
+ response = await self._async_post_request(
3142
+ f'{self.api_url}/v1/batch/scrape',
3143
+ params_dict,
3144
+ headers
3145
+ )
3146
+
3147
+ if response.get('success'):
3148
+ try:
3149
+ return BatchScrapeResponse(**response)
3150
+ except:
3151
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3152
+ else:
3153
+ self._handle_error(response, 'start batch scrape job')
3154
+
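Where batch_scrape_urls blocks until completion, async_batch_scrape_urls only starts the job and hands back its id, leaving polling to the caller. A sketch under the same assumptions as the earlier examples; the idempotency key is a placeholder.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    started = await app.async_batch_scrape_urls(
        ["https://example.com/a", "https://example.com/b"],  # placeholder URLs
        formats=["markdown"],
        idempotency_key="batch-0001",  # placeholder, avoids duplicate submissions
    )
    # Poll on our own schedule via check_batch_scrape_status (defined further below)
    while True:
        status = await app.check_batch_scrape_status(started.id)
        if status.status in ("completed", "failed", "cancelled"):
            break
        await asyncio.sleep(5)
    print(status.status)

asyncio.run(main())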
3155
+ async def crawl_url(
3156
+ self,
3157
+ url: str,
3158
+ *,
3159
+ include_paths: Optional[List[str]] = None,
3160
+ exclude_paths: Optional[List[str]] = None,
3161
+ max_depth: Optional[int] = None,
3162
+ max_discovery_depth: Optional[int] = None,
3163
+ limit: Optional[int] = None,
3164
+ allow_backward_links: Optional[bool] = None,
3165
+ allow_external_links: Optional[bool] = None,
3166
+ ignore_sitemap: Optional[bool] = None,
3167
+ scrape_options: Optional[ScrapeOptions] = None,
3168
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3169
+ deduplicate_similar_urls: Optional[bool] = None,
3170
+ ignore_query_parameters: Optional[bool] = None,
3171
+ regex_on_full_url: Optional[bool] = None,
3172
+ poll_interval: Optional[int] = 2,
3173
+ idempotency_key: Optional[str] = None,
3174
+ **kwargs
3175
+ ) -> CrawlStatusResponse:
3176
+ """
3177
+ Crawl a website starting from a URL.
3178
+
3179
+ Args:
3180
+ url (str): Target URL to start crawling from
3181
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3182
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3183
+ max_depth (Optional[int]): Maximum crawl depth
3184
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3185
+ limit (Optional[int]): Maximum pages to crawl
3186
+ allow_backward_links (Optional[bool]): Follow parent directory links
3187
+ allow_external_links (Optional[bool]): Follow external domain links
3188
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3189
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3190
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3191
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3192
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3193
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3194
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
3195
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3196
+ **kwargs: Additional parameters to pass to the API
3197
+
3198
+ Returns:
3199
+ CrawlStatusResponse with:
3200
+ * Crawling status and progress
3201
+ * Crawled page contents
3202
+ * Success/error information
3203
+
3204
+ Raises:
3205
+ Exception: If crawl fails
3206
+ """
3207
+ # Validate any additional kwargs
3208
+ self._validate_kwargs(kwargs, "crawl_url")
3209
+
3210
+ crawl_params = {}
3211
+
3212
+ # Add individual parameters
3213
+ if include_paths is not None:
3214
+ crawl_params['includePaths'] = include_paths
3215
+ if exclude_paths is not None:
3216
+ crawl_params['excludePaths'] = exclude_paths
3217
+ if max_depth is not None:
3218
+ crawl_params['maxDepth'] = max_depth
3219
+ if max_discovery_depth is not None:
3220
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3221
+ if limit is not None:
3222
+ crawl_params['limit'] = limit
3223
+ if allow_backward_links is not None:
3224
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3225
+ if allow_external_links is not None:
3226
+ crawl_params['allowExternalLinks'] = allow_external_links
3227
+ if ignore_sitemap is not None:
3228
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3229
+ if scrape_options is not None:
3230
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3231
+ if webhook is not None:
3232
+ crawl_params['webhook'] = webhook
3233
+ if deduplicate_similar_urls is not None:
3234
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3235
+ if ignore_query_parameters is not None:
3236
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3237
+ if regex_on_full_url is not None:
3238
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3239
+
3240
+ # Add any additional kwargs
3241
+ crawl_params.update(kwargs)
3242
+
3243
+ # Create final params object
3244
+ final_params = CrawlParams(**crawl_params)
3245
+ params_dict = final_params.dict(exclude_none=True)
3246
+ params_dict['url'] = url
3247
+ params_dict['origin'] = f"python-sdk@{version}"
3248
+ # Make request
3249
+ headers = self._prepare_headers(idempotency_key)
3250
+ response = await self._async_post_request(
3251
+ f'{self.api_url}/v1/crawl', params_dict, headers)
3252
+
3253
+ if response.get('success'):
3254
+ id = response.get('id')
3255
+ if not id:
3256
+ raise Exception('Crawl job ID missing from Firecrawl response.')
3258
+ return await self._async_monitor_job_status(id, headers, poll_interval)
3259
+ else:
3260
+ self._handle_error(response, 'start crawl job')
3261
+
3262
+
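A sketch of a bounded crawl with the method above. ScrapeOptions is assumed to be exported alongside the client and to accept a formats field mirroring the scrape parameters; the start URL, path pattern, and key are placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp, ScrapeOptions  # assumed exports

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    crawl = await app.crawl_url(
        "https://example.com",           # placeholder start URL
        include_paths=["/blog/.*"],      # only follow blog pages
        limit=25,                        # stop after 25 pages
        scrape_options=ScrapeOptions(formats=["markdown"]),
        poll_interval=2,
    )
    print(f"crawled {crawl.completed}/{crawl.total} pages")

asyncio.run(main())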
3263
+ async def async_crawl_url(
3264
+ self,
3265
+ url: str,
3266
+ *,
3267
+ include_paths: Optional[List[str]] = None,
3268
+ exclude_paths: Optional[List[str]] = None,
3269
+ max_depth: Optional[int] = None,
3270
+ max_discovery_depth: Optional[int] = None,
3271
+ limit: Optional[int] = None,
3272
+ allow_backward_links: Optional[bool] = None,
3273
+ allow_external_links: Optional[bool] = None,
3274
+ ignore_sitemap: Optional[bool] = None,
3275
+ scrape_options: Optional[ScrapeOptions] = None,
3276
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3277
+ deduplicate_similar_urls: Optional[bool] = None,
3278
+ ignore_query_parameters: Optional[bool] = None,
3279
+ regex_on_full_url: Optional[bool] = None,
3280
+ poll_interval: Optional[int] = 2,
3281
+ idempotency_key: Optional[str] = None,
3282
+ **kwargs
3283
+ ) -> CrawlResponse:
3284
+ """
3285
+ Start an asynchronous crawl job.
3286
+
3287
+ Args:
3288
+ url (str): Target URL to start crawling from
3289
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3290
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3291
+ max_depth (Optional[int]): Maximum crawl depth
3292
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3293
+ limit (Optional[int]): Maximum pages to crawl
3294
+ allow_backward_links (Optional[bool]): Follow parent directory links
3295
+ allow_external_links (Optional[bool]): Follow external domain links
3296
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3297
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3298
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3299
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3300
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3301
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3302
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3303
+ **kwargs: Additional parameters to pass to the API
3304
+
3305
+ Returns:
3306
+ CrawlResponse with:
3307
+ * success - Whether crawl started successfully
3308
+ * id - Unique identifier for the crawl job
3309
+ * url - Status check URL for the crawl
3310
+ * error - Error message if start failed
3311
+
3312
+ Raises:
3313
+ Exception: If crawl initiation fails
3314
+ """
3315
+ crawl_params = {}
3316
+
3317
+ # Add individual parameters
3318
+ if include_paths is not None:
3319
+ crawl_params['includePaths'] = include_paths
3320
+ if exclude_paths is not None:
3321
+ crawl_params['excludePaths'] = exclude_paths
3322
+ if max_depth is not None:
3323
+ crawl_params['maxDepth'] = max_depth
3324
+ if max_discovery_depth is not None:
3325
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3326
+ if limit is not None:
3327
+ crawl_params['limit'] = limit
3328
+ if allow_backward_links is not None:
3329
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3330
+ if allow_external_links is not None:
3331
+ crawl_params['allowExternalLinks'] = allow_external_links
3332
+ if ignore_sitemap is not None:
3333
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3334
+ if scrape_options is not None:
3335
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3336
+ if webhook is not None:
3337
+ crawl_params['webhook'] = webhook
3338
+ if deduplicate_similar_urls is not None:
3339
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3340
+ if ignore_query_parameters is not None:
3341
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3342
+ if regex_on_full_url is not None:
3343
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3344
+
3345
+ # Add any additional kwargs
3346
+ crawl_params.update(kwargs)
3347
+
3348
+ # Create final params object
3349
+ final_params = CrawlParams(**crawl_params)
3350
+ params_dict = final_params.dict(exclude_none=True)
3351
+ params_dict['url'] = url
3352
+ params_dict['origin'] = f"python-sdk@{version}"
3353
+
3354
+ # Make request
3355
+ headers = self._prepare_headers(idempotency_key)
3356
+ response = await self._async_post_request(
3357
+ f'{self.api_url}/v1/crawl',
3358
+ params_dict,
3359
+ headers
3360
+ )
3361
+
3362
+ if response.get('success'):
3363
+ try:
3364
+ return CrawlResponse(**response)
3365
+ except:
3366
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3367
+ else:
3368
+ self._handle_error(response, 'start crawl job')
3369
+
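async_crawl_url is the fire-and-forget counterpart of crawl_url: it returns a CrawlResponse carrying the job id, which can later be fed to check_crawl_status (the next method below). A sketch under the same import assumptions; the URL and key are placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    started = await app.async_crawl_url("https://example.com", limit=100)
    print("crawl id:", started.id)

    # Later (possibly from another coroutine) ask how far the crawl has got
    status = await app.check_crawl_status(started.id)
    print(status.status, f"{status.completed}/{status.total}")

asyncio.run(main())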
3370
+ async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
3371
+ """
3372
+ Check the status and results of an asynchronous crawl job.
3373
+
3374
+ Args:
3375
+ id (str): Unique identifier for the crawl job
3376
+
3377
+ Returns:
3378
+ CrawlStatusResponse containing:
3379
+ Status Information:
3380
+ * status - Current state (scraping/completed/failed/cancelled)
3381
+ * completed - Number of pages crawled
3382
+ * total - Total pages to crawl
3383
+ * creditsUsed - API credits consumed
3384
+ * expiresAt - Data expiration timestamp
3385
+
3386
+ Results:
3387
+ * data - List of crawled documents
3388
+ * next - URL for next page of results (if paginated)
3389
+ * success - Whether status check succeeded
3390
+ * error - Error message if failed
3391
+
3392
+ Raises:
3393
+ Exception: If status check fails
3394
+ """
3395
+ headers = self._prepare_headers()
3396
+ endpoint = f'/v1/crawl/{id}'
3397
+
3398
+ status_data = await self._async_get_request(
3399
+ f'{self.api_url}{endpoint}',
3400
+ headers
3401
+ )
3402
+
3403
+ if status_data.get('status') == 'completed':
3404
+ if 'data' in status_data:
3405
+ data = status_data['data']
3406
+ while 'next' in status_data:
3407
+ if len(status_data['data']) == 0:
3408
+ break
3409
+ next_url = status_data.get('next')
3410
+ if not next_url:
3411
+ logger.warning("Expected 'next' URL is missing.")
3412
+ break
3413
+ next_data = await self._async_get_request(next_url, headers)
3414
+ data.extend(next_data.get('data', []))
3415
+ status_data = next_data
3416
+ status_data['data'] = data
3417
+ # Create CrawlStatusResponse object from status data
3418
+ response = CrawlStatusResponse(
3419
+ status=status_data.get('status'),
3420
+ total=status_data.get('total'),
3421
+ completed=status_data.get('completed'),
3422
+ creditsUsed=status_data.get('creditsUsed'),
3423
+ expiresAt=status_data.get('expiresAt'),
3424
+ data=status_data.get('data'),
3425
+ success=False if 'error' in status_data else True
3426
+ )
3427
+
3428
+ if 'error' in status_data:
3429
+ response.error = status_data.get('error')
3430
+
3431
+ if 'next' in status_data:
3432
+ response.next = status_data.get('next')
3433
+
3434
+ return response
3435
+
3436
+ async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
3437
+ """
3438
+ Monitor the status of an asynchronous job until completion.
3439
+
3440
+ Args:
3441
+ id (str): The ID of the job to monitor
3442
+ headers (Dict[str, str]): Headers to include in status check requests
3443
+ poll_interval (int): Seconds between status checks (default: 2)
3444
+
3445
+ Returns:
3446
+ CrawlStatusResponse: The job results if completed successfully
3447
+
3448
+ Raises:
3449
+ Exception: If the job fails or an error occurs during status checks
3450
+ """
3451
+ while True:
3452
+ status_data = await self._async_get_request(
3453
+ f'{self.api_url}/v1/crawl/{id}',
3454
+ headers
3455
+ )
3456
+
3457
+ if status_data.get('status') == 'completed':
3458
+ if 'data' in status_data:
3459
+ data = status_data['data']
3460
+ while 'next' in status_data:
3461
+ if len(status_data['data']) == 0:
3462
+ break
3463
+ next_url = status_data.get('next')
3464
+ if not next_url:
3465
+ logger.warning("Expected 'next' URL is missing.")
3466
+ break
3467
+ next_data = await self._async_get_request(next_url, headers)
3468
+ data.extend(next_data.get('data', []))
3469
+ status_data = next_data
3470
+ status_data['data'] = data
3471
+ return CrawlStatusResponse(**status_data)
3472
+ else:
3473
+ raise Exception('Job completed but no data was returned')
3474
+ elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
3475
+ await asyncio.sleep(max(poll_interval, 2))
3476
+ else:
3477
+ raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
3478
+
3479
+ async def map_url(
3480
+ self,
3481
+ url: str,
3482
+ *,
3483
+ search: Optional[str] = None,
3484
+ ignore_sitemap: Optional[bool] = None,
3485
+ include_subdomains: Optional[bool] = None,
3486
+ sitemap_only: Optional[bool] = None,
3487
+ limit: Optional[int] = None,
3488
+ timeout: Optional[int] = None,
3489
+ params: Optional[MapParams] = None) -> MapResponse:
3490
+ """
3491
+ Asynchronously map and discover links from a URL.
3492
+
3493
+ Args:
3494
+ url (str): Target URL to map
3495
+ search (Optional[str]): Filter pattern for discovered URLs
3496
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3497
+ include_subdomains (Optional[bool]): Include links on subdomains
3498
+ sitemap_only (Optional[bool]): Only use sitemap.xml for discovery
3499
+ limit (Optional[int]): Maximum URLs to return
3500
+ timeout (Optional[int]): Request timeout in milliseconds
3501
+ params (Optional[MapParams]): Additional MapParams merged with the keyword arguments
3505
+
3506
+ Returns:
3507
+ MapResponse with:
3508
+ * Discovered URLs
3509
+ * Success/error status
3510
+
3511
+ Raises:
3512
+ Exception: If mapping fails
3513
+ """
3514
+ map_params = {}
3515
+ if params:
3516
+ map_params.update(params.dict(exclude_none=True))
3517
+
3518
+ # Add individual parameters
3519
+ if search is not None:
3520
+ map_params['search'] = search
3521
+ if ignore_sitemap is not None:
3522
+ map_params['ignoreSitemap'] = ignore_sitemap
3523
+ if include_subdomains is not None:
3524
+ map_params['includeSubdomains'] = include_subdomains
3525
+ if sitemap_only is not None:
3526
+ map_params['sitemapOnly'] = sitemap_only
3527
+ if limit is not None:
3528
+ map_params['limit'] = limit
3529
+ if timeout is not None:
3530
+ map_params['timeout'] = timeout
3531
+
3532
+ # Create final params object
3533
+ final_params = MapParams(**map_params)
3534
+ params_dict = final_params.dict(exclude_none=True)
3535
+ params_dict['url'] = url
3536
+ params_dict['origin'] = f"python-sdk@{version}"
3537
+
3538
+ # Make request
3539
+ endpoint = f'/v1/map'
3540
+ response = await self._async_post_request(
3541
+ f'{self.api_url}{endpoint}',
3542
+ params_dict,
3543
+ headers={"Authorization": f"Bearer {self.api_key}"}
3544
+ )
3545
+
3546
+ if response.get('success') and 'links' in response:
3547
+ return MapResponse(**response)
3548
+ elif 'error' in response:
3549
+ raise Exception(f'Failed to map URL. Error: {response["error"]}')
3550
+ else:
3551
+ raise Exception(f'Failed to map URL. Error: {response}')
3552
+
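A sketch of link discovery with map_url above; the search filter, limit, and URL are placeholders, and the links attribute corresponds to the 'links' key the method checks for in the response.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    result = await app.map_url(
        "https://example.com",   # placeholder URL
        search="docs",           # keep URLs matching this pattern
        include_subdomains=False,
        limit=500,
    )
    for link in result.links or []:
        print(link)

asyncio.run(main())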
3553
+ async def extract(
3554
+ self,
3555
+ urls: Optional[List[str]] = None,
3556
+ *,
3557
+ prompt: Optional[str] = None,
3558
+ schema: Optional[Any] = None,
3559
+ system_prompt: Optional[str] = None,
3560
+ allow_external_links: Optional[bool] = False,
3561
+ enable_web_search: Optional[bool] = False,
3562
+ show_sources: Optional[bool] = False,
3563
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3564
+
3565
+ """
3566
+ Asynchronously extract structured information from URLs.
3567
+
3568
+ Args:
3569
+ urls (Optional[List[str]]): URLs to extract from
3570
+ prompt (Optional[str]): Custom extraction prompt
3571
+ schema (Optional[Any]): JSON schema/Pydantic model
3572
+ system_prompt (Optional[str]): System context
3573
+ allow_external_links (Optional[bool]): Follow external links
3574
+ enable_web_search (Optional[bool]): Enable web search
3575
+ show_sources (Optional[bool]): Include source URLs
3576
+ agent (Optional[Dict[str, Any]]): Agent configuration
3577
+
3578
+ Returns:
3579
+ ExtractResponse with:
3580
+ * Structured data matching schema
3581
+ * Source information if requested
3582
+ * Success/error status
3583
+
3584
+ Raises:
3585
+ ValueError: If prompt/schema missing or extraction fails
3586
+ """
3587
+ headers = self._prepare_headers()
3588
+
3589
+ if not prompt and not schema:
3590
+ raise ValueError("Either prompt or schema is required")
3591
+
3592
+ if not urls and not prompt:
3593
+ raise ValueError("Either urls or prompt is required")
3594
+
3595
+ if schema:
3596
+ if hasattr(schema, 'model_json_schema'):
3597
+ # Convert Pydantic model to JSON schema
3598
+ schema = schema.model_json_schema()
3599
+ # Otherwise assume it's already a JSON schema dict
3600
+
3601
+ request_data = {
3602
+ 'urls': urls or [],
3603
+ 'allowExternalLinks': allow_external_links,
3604
+ 'enableWebSearch': enable_web_search,
3605
+ 'showSources': show_sources,
3606
+ 'schema': schema,
3607
+ 'origin': f'python-sdk@{get_version()}'
3608
+ }
3609
+
3610
+ # Only add prompt and systemPrompt if they exist
3611
+ if prompt:
3612
+ request_data['prompt'] = prompt
3613
+ if system_prompt:
3614
+ request_data['systemPrompt'] = system_prompt
3615
+
3616
+ if agent:
3617
+ request_data['agent'] = agent
3618
+
3619
+ response = await self._async_post_request(
3620
+ f'{self.api_url}/v1/extract',
3621
+ request_data,
3622
+ headers
3623
+ )
3624
+
3625
+ if response.get('success'):
3626
+ job_id = response.get('id')
3627
+ if not job_id:
3628
+ raise Exception('Job ID not returned from extract request.')
3629
+
3630
+ while True:
3631
+ status_data = await self._async_get_request(
3632
+ f'{self.api_url}/v1/extract/{job_id}',
3633
+ headers
3634
+ )
3635
+
3636
+ if status_data['status'] == 'completed':
3637
+ return ExtractResponse(**status_data)
3638
+ elif status_data['status'] in ['failed', 'cancelled']:
3639
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
3640
+
3641
+ await asyncio.sleep(2)
3642
+ else:
3643
+ raise Exception(f'Failed to extract. Error: {response.get("error")}')
3644
+
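A sketch of schema-driven extraction with the method above. Passing a Pydantic v2 model works because extract() converts it via model_json_schema() before sending the request; the model fields, URL, and key are placeholders.

import asyncio
from pydantic import BaseModel
from firecrawl import AsyncFirecrawlApp  # assumed export

class Article(BaseModel):  # placeholder schema
    title: str
    author: str

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    result = await app.extract(
        urls=["https://example.com/post"],  # placeholder URL
        prompt="Extract the article title and author.",
        schema=Article,
    )
    print(result.data)

asyncio.run(main())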
3645
+ async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
3646
+ """
3647
+ Check the status of an asynchronous batch scrape job.
3648
+
3649
+ Args:
3650
+ id (str): The ID of the batch scrape job
3651
+
3652
+ Returns:
3653
+ BatchScrapeStatusResponse containing:
3654
+ Status Information:
3655
+ * status - Current state (scraping/completed/failed/cancelled)
3656
+ * completed - Number of URLs scraped
3657
+ * total - Total URLs to scrape
3658
+ * creditsUsed - API credits consumed
3659
+ * expiresAt - Data expiration timestamp
3660
+
3661
+ Results:
3662
+ * data - List of scraped documents
3663
+ * next - URL for next page of results (if paginated)
3664
+ * success - Whether status check succeeded
3665
+ * error - Error message if failed
3666
+
3667
+ Raises:
3668
+ Exception: If status check fails
3669
+ """
3670
+ headers = self._prepare_headers()
3671
+ endpoint = f'/v1/batch/scrape/{id}'
3672
+
3673
+ status_data = await self._async_get_request(
3674
+ f'{self.api_url}{endpoint}',
3675
+ headers
3676
+ )
3677
+
3678
+ if status_data['status'] == 'completed':
3679
+ if 'data' in status_data:
3680
+ data = status_data['data']
3681
+ while 'next' in status_data:
3682
+ if len(status_data['data']) == 0:
3683
+ break
3684
+ next_url = status_data.get('next')
3685
+ if not next_url:
3686
+ logger.warning("Expected 'next' URL is missing.")
3687
+ break
3688
+ next_data = await self._async_get_request(next_url, headers)
3689
+ data.extend(next_data.get('data', []))
3690
+ status_data = next_data
3691
+ status_data['data'] = data
3692
+
3693
+ response = BatchScrapeStatusResponse(
3694
+ status=status_data.get('status'),
3695
+ total=status_data.get('total'),
3696
+ completed=status_data.get('completed'),
3697
+ creditsUsed=status_data.get('creditsUsed'),
3698
+ expiresAt=status_data.get('expiresAt'),
3699
+ data=status_data.get('data')
3700
+ )
3701
+
3702
+ if 'error' in status_data:
3703
+ response.error = status_data['error']
3704
+
3705
+ if 'next' in status_data:
3706
+ response.next = status_data['next']
3707
+
3708
+ response.success = False if 'error' in status_data else True
3709
+ return response
3712
+
3713
+ async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
3714
+ """
3715
+ Get information about errors from an asynchronous batch scrape job.
3716
+
3717
+ Args:
3718
+ id (str): The ID of the batch scrape job
3719
+
3720
+ Returns:
3721
+ CrawlErrorsResponse containing:
3722
+ errors (List[Dict[str, str]]): List of errors with fields:
3723
+ * id (str): Error ID
3724
+ * timestamp (str): When the error occurred
3725
+ * url (str): URL that caused the error
3726
+ * error (str): Error message
3727
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3728
+
3729
+ Raises:
3730
+ Exception: If error check fails
3731
+ """
3732
+ headers = self._prepare_headers()
3733
+ return await self._async_get_request(
3734
+ f'{self.api_url}/v1/batch/scrape/{id}/errors',
3735
+ headers
3736
+ )
3737
+
3738
+ async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
3739
+ """
3740
+ Get information about errors from an asynchronous crawl job.
3741
+
3742
+ Args:
3743
+ id (str): The ID of the crawl job
3744
+
3745
+ Returns:
3746
+ CrawlErrorsResponse containing:
3747
+ * errors (List[Dict[str, str]]): List of errors with fields:
3748
+ - id (str): Error ID
3749
+ - timestamp (str): When the error occurred
3750
+ - url (str): URL that caused the error
3751
+ - error (str): Error message
3752
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3753
+
3754
+ Raises:
3755
+ Exception: If error check fails
3756
+ """
3757
+ headers = self._prepare_headers()
3758
+ return await self._async_get_request(
3759
+ f'{self.api_url}/v1/crawl/{id}/errors',
3760
+ headers
3761
+ )
3762
+
3763
+ async def cancel_crawl(self, id: str) -> Dict[str, Any]:
3764
+ """
3765
+ Cancel an asynchronous crawl job.
3766
+
3767
+ Args:
3768
+ id (str): The ID of the crawl job to cancel
3769
+
3770
+ Returns:
3771
+ Dict[str, Any] containing:
3772
+ * success (bool): Whether cancellation was successful
3773
+ * error (str, optional): Error message if cancellation failed
3774
+
3775
+ Raises:
3776
+ Exception: If cancellation fails
3777
+ """
3778
+ headers = self._prepare_headers()
3779
+ async with aiohttp.ClientSession() as session:
3780
+ async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
3781
+ return await response.json()
3782
+
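The two error helpers and cancel_crawl above combine naturally when supervising a long crawl: inspect what has failed so far, then abort the job. A sketch under the same assumptions; the URL and key are placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    started = await app.async_crawl_url("https://example.com", limit=1000)

    errors = await app.check_crawl_errors(started.id)
    print("errors so far:", errors)

    result = await app.cancel_crawl(started.id)
    print("cancelled:", result.get("success"))

asyncio.run(main())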
3783
+ async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
3784
+ """
3785
+ Check the status of an asynchronous extraction job.
3786
+
3787
+ Args:
3788
+ job_id (str): The ID of the extraction job
3789
+
3790
+ Returns:
3791
+ ExtractResponse[Any] with:
3792
+ * success (bool): Whether request succeeded
3793
+ * data (Optional[Any]): Extracted data matching schema
3794
+ * error (Optional[str]): Error message if any
3795
+ * warning (Optional[str]): Warning message if any
3796
+ * sources (Optional[List[str]]): Source URLs if requested
3797
+
3798
+ Raises:
3799
+ ValueError: If status check fails
3800
+ """
3801
+ headers = self._prepare_headers()
3802
+ try:
3803
+ return await self._async_get_request(
3804
+ f'{self.api_url}/v1/extract/{job_id}',
3805
+ headers
3806
+ )
3807
+ except Exception as e:
3808
+ raise ValueError(str(e))
3809
+
3810
+ async def async_extract(
3811
+ self,
3812
+ urls: Optional[List[str]] = None,
3813
+ *,
3814
+ prompt: Optional[str] = None,
3815
+ schema: Optional[Any] = None,
3816
+ system_prompt: Optional[str] = None,
3817
+ allow_external_links: Optional[bool] = False,
3818
+ enable_web_search: Optional[bool] = False,
3819
+ show_sources: Optional[bool] = False,
3820
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3821
+ """
3822
+ Initiate an asynchronous extraction job without waiting for completion.
3823
+
3824
+ Args:
3825
+ urls (Optional[List[str]]): URLs to extract from
3826
+ prompt (Optional[str]): Custom extraction prompt
3827
+ schema (Optional[Any]): JSON schema/Pydantic model
3828
+ system_prompt (Optional[str]): System context
3829
+ allow_external_links (Optional[bool]): Follow external links
3830
+ enable_web_search (Optional[bool]): Enable web search
3831
+ show_sources (Optional[bool]): Include source URLs
3832
+ agent (Optional[Dict[str, Any]]): Agent configuration
3834
+
3835
+ Returns:
3836
+ ExtractResponse[Any] with:
3837
+ * success (bool): Whether request succeeded
3838
+ * data (Optional[Any]): Extracted data matching schema
3839
+ * error (Optional[str]): Error message if any
3840
+
3841
+ Raises:
3842
+ ValueError: If job initiation fails
3843
+ """
3844
+ headers = self._prepare_headers()
3845
+
3846
+ if not prompt and not schema:
3847
+ raise ValueError("Either prompt or schema is required")
3848
+
3849
+ if not urls and not prompt:
3850
+ raise ValueError("Either urls or prompt is required")
3851
+
3852
+ if schema:
3853
+ if hasattr(schema, 'model_json_schema'):
3854
+ schema = schema.model_json_schema()
3855
+
3856
+ request_data = {
3857
+ 'urls': urls or [],
3858
+ 'allowExternalLinks': allow_external_links,
3859
+ 'enableWebSearch': enable_web_search,
3860
+ 'showSources': show_sources,
3861
+ 'schema': schema,
3862
+ 'origin': f'python-sdk@{version}'
3863
+ }
3864
+
3865
+ if prompt:
3866
+ request_data['prompt'] = prompt
3867
+ if system_prompt:
3868
+ request_data['systemPrompt'] = system_prompt
3869
+ if agent:
3870
+ request_data['agent'] = agent
3871
+
3872
+ try:
3873
+ return await self._async_post_request(
3874
+ f'{self.api_url}/v1/extract',
3875
+ request_data,
3876
+ headers
3877
+ )
3878
+ except Exception as e:
3879
+ raise ValueError(str(e))
3880
+
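async_extract submits the extraction job and returns the parsed JSON response, including the job id, without waiting, so progress is checked separately with get_extract_status above. A sketch; the prompt, URL, and key are placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    started = await app.async_extract(
        urls=["https://example.com/pricing"],  # placeholder URL
        prompt="List every plan name and its monthly price.",
    )
    job_id = started.get("id")  # the raw /v1/extract response is returned as-is

    status = await app.get_extract_status(job_id)
    print(status)

asyncio.run(main())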
3881
+ async def generate_llms_text(
3882
+ self,
3883
+ url: str,
3884
+ *,
3885
+ max_urls: Optional[int] = None,
3886
+ show_full_text: Optional[bool] = None,
3887
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
3888
+ """
3889
+ Generate LLMs.txt for a given URL and monitor until completion.
3890
+
3891
+ Args:
3892
+ url (str): Target URL to generate LLMs.txt from
3893
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
3894
+ show_full_text (Optional[bool]): Include full text in output (default: False)
3895
+ experimental_stream (Optional[bool]): Enable experimental streaming
3896
+
3897
+ Returns:
3898
+ GenerateLLMsTextStatusResponse containing:
3899
+ * success (bool): Whether generation completed successfully
3900
+ * status (str): Status of generation (processing/completed/failed)
3901
+ * data (Dict[str, str], optional): Generated text with fields:
3902
+ - llmstxt (str): Generated LLMs.txt content
3903
+ - llmsfulltxt (str, optional): Full version if requested
3904
+ * error (str, optional): Error message if generation failed
3905
+ * expiresAt (str): When the generated data expires
3906
+
3907
+ Raises:
3908
+ Exception: If generation fails
3909
+ """
3917
+
3918
+ response = await self.async_generate_llms_text(
3919
+ url,
3920
+ max_urls=max_urls,
3921
+ show_full_text=show_full_text,
3922
+ experimental_stream=experimental_stream
3923
+ )
3924
+ if not response.get('success') or 'id' not in response:
3925
+ return response
3926
+
3927
+ job_id = response['id']
3928
+ while True:
3929
+ status = await self.check_generate_llms_text_status(job_id)
3930
+
3931
+ if status['status'] == 'completed':
3932
+ return status
3933
+ elif status['status'] == 'failed':
3934
+ raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
3935
+ elif status['status'] != 'processing':
3936
+ break
3937
+
3938
+ await asyncio.sleep(2)
3939
+
3940
+ return GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly')
3941
+
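A sketch of LLMs.txt generation with the waiting helper above; the URL and key are placeholders, and the printed payload is whatever the completed /v1/llmstxt job returns (llmstxt, plus llmsfulltxt when show_full_text is set).

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    result = await app.generate_llms_text(
        "https://example.com",  # placeholder URL
        max_urls=10,
        show_full_text=False,
    )
    print(result)

asyncio.run(main())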
3942
+ async def async_generate_llms_text(
3943
+ self,
3944
+ url: str,
3945
+ *,
3946
+ max_urls: Optional[int] = None,
3947
+ show_full_text: Optional[bool] = None,
3948
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
3949
+ """
3950
+ Initiate an asynchronous LLMs.txt generation job without waiting for completion.
3951
+
3952
+ Args:
3953
+ url (str): Target URL to generate LLMs.txt from
3954
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
3955
+ show_full_text (Optional[bool]): Include full text in output (default: False)
3956
+ experimental_stream (Optional[bool]): Enable experimental streaming
3957
+
3958
+ Returns:
3959
+ GenerateLLMsTextResponse containing:
3960
+ * success (bool): Whether job started successfully
3961
+ * id (str): Unique identifier for the job
3962
+ * error (str, optional): Error message if start failed
3963
+
3964
+ Raises:
3965
+ ValueError: If job initiation fails
3966
+ """
3974
+
3975
+ params = GenerateLLMsTextParams(
3976
+ maxUrls=max_urls,
3977
+ showFullText=show_full_text,
3978
+ __experimental_stream=experimental_stream
3979
+ )
3980
+
3981
+ headers = self._prepare_headers()
3982
+ json_data = {'url': url, **params.dict(exclude_none=True)}
3983
+ json_data['origin'] = f"python-sdk@{version}"
3984
+
3985
+ try:
3986
+ return await self._async_post_request(
3987
+ f'{self.api_url}/v1/llmstxt',
3988
+ json_data,
3989
+ headers
3990
+ )
3991
+ except Exception as e:
3992
+ raise ValueError(str(e))
3993
+
3994
+ async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
3995
+ """
3996
+ Check the status of an asynchronous LLMs.txt generation job.
3997
+
3998
+ Args:
3999
+ id (str): The ID of the generation job
4000
+
4001
+ Returns:
4002
+ GenerateLLMsTextStatusResponse containing:
4003
+ * success (bool): Whether generation completed successfully
4004
+ * status (str): Status of generation (processing/completed/failed)
4005
+ * data (Dict[str, str], optional): Generated text with fields:
4006
+ - llmstxt (str): Generated LLMs.txt content
4007
+ - llmsfulltxt (str, optional): Full version if requested
4008
+ * error (str, optional): Error message if generation failed
4009
+ * expiresAt (str): When the generated data expires
4010
+
4011
+ Raises:
4012
+ ValueError: If status check fails
4013
+ """
4014
+ headers = self._prepare_headers()
4015
+ try:
4016
+ return await self._async_get_request(
4017
+ f'{self.api_url}/v1/llmstxt/{id}',
4018
+ headers
4019
+ )
4020
+ except Exception as e:
4021
+ raise ValueError(str(e))
4022
+
4023
+ async def deep_research(
4024
+ self,
4025
+ query: str,
4026
+ *,
4027
+ max_depth: Optional[int] = None,
4028
+ time_limit: Optional[int] = None,
4029
+ max_urls: Optional[int] = None,
4030
+ analysis_prompt: Optional[str] = None,
4031
+ system_prompt: Optional[str] = None,
4032
+ __experimental_stream_steps: Optional[bool] = None,
4033
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
4034
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
4035
+ """
4036
+ Initiates a deep research operation on a given query and polls until completion.
4037
+
4038
+ Args:
4039
+ query (str): Research query or topic to investigate
4040
+ max_depth (Optional[int]): Maximum depth of research exploration
4041
+ time_limit (Optional[int]): Time limit in seconds for research
4042
+ max_urls (Optional[int]): Maximum number of URLs to process
4043
+ analysis_prompt (Optional[str]): Custom prompt for analysis
4044
+ system_prompt (Optional[str]): Custom system prompt
4045
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4046
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
4047
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
4048
+
4049
+ Returns:
4050
+ DeepResearchStatusResponse containing:
4051
+ * success (bool): Whether research completed successfully
4052
+ * status (str): Current state (processing/completed/failed)
4053
+ * error (Optional[str]): Error message if failed
4054
+ * id (str): Unique identifier for the research job
4055
+ * data (Any): Research findings and analysis
4056
+ * sources (List[Dict]): List of discovered sources
4057
+ * activities (List[Dict]): Research progress log
4058
+ * summaries (List[str]): Generated research summaries
4059
+
4060
+ Raises:
4061
+ Exception: If research fails
4062
+ """
4063
+ research_params = {}
4064
+ if max_depth is not None:
4065
+ research_params['maxDepth'] = max_depth
4066
+ if time_limit is not None:
4067
+ research_params['timeLimit'] = time_limit
4068
+ if max_urls is not None:
4069
+ research_params['maxUrls'] = max_urls
4070
+ if analysis_prompt is not None:
4071
+ research_params['analysisPrompt'] = analysis_prompt
4072
+ if system_prompt is not None:
4073
+ research_params['systemPrompt'] = system_prompt
4074
+ if __experimental_stream_steps is not None:
4075
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
4076
+ research_params = DeepResearchParams(**research_params)
4077
+
4078
+ response = await self.async_deep_research(
4079
+ query,
4080
+ max_depth=max_depth,
4081
+ time_limit=time_limit,
4082
+ max_urls=max_urls,
4083
+ analysis_prompt=analysis_prompt,
4084
+ system_prompt=system_prompt
4085
+ )
4086
+ if not response.get('success') or 'id' not in response:
4087
+ return response
4088
+
4089
+ job_id = response['id']
4090
+ last_activity_count = 0
4091
+ last_source_count = 0
4092
+
4093
+ while True:
4094
+ status = await self.check_deep_research_status(job_id)
4095
+
4096
+ if on_activity and 'activities' in status:
4097
+ new_activities = status['activities'][last_activity_count:]
4098
+ for activity in new_activities:
4099
+ on_activity(activity)
4100
+ last_activity_count = len(status['activities'])
4101
+
4102
+ if on_source and 'sources' in status:
4103
+ new_sources = status['sources'][last_source_count:]
4104
+ for source in new_sources:
4105
+ on_source(source)
4106
+ last_source_count = len(status['sources'])
4107
+
4108
+ if status['status'] == 'completed':
4109
+ return status
4110
+ elif status['status'] == 'failed':
4111
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
4112
+ elif status['status'] != 'processing':
4113
+ break
4114
+
4115
+ await asyncio.sleep(2)
4116
+
4117
+ return DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')
4118
+
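The deep_research helper above reports progress through the two optional callbacks while it polls; the sketch below wires up simple print handlers. The query, limits, and key are placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export

def on_activity(activity):
    # Progress events carry type, status, message, timestamp and depth
    print("[activity]", activity.get("message"))

def on_source(source):
    # Discovered sources carry url, title and description
    print("[source]", source.get("url"))

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    result = await app.deep_research(
        "Trade-offs between Raft and Paxos",  # placeholder query
        max_depth=3,
        time_limit=120,
        max_urls=15,
        on_activity=on_activity,
        on_source=on_source,
    )
    print(result)

asyncio.run(main())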
4119
+ async def async_deep_research(
4120
+ self,
4121
+ query: str,
4122
+ *,
4123
+ max_depth: Optional[int] = None,
4124
+ time_limit: Optional[int] = None,
4125
+ max_urls: Optional[int] = None,
4126
+ analysis_prompt: Optional[str] = None,
4127
+ system_prompt: Optional[str] = None,
4128
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
4129
+ """
4130
+ Initiates an asynchronous deep research operation.
4131
+
4132
+ Args:
4133
+ query (str): Research query or topic to investigate
4134
+ max_depth (Optional[int]): Maximum depth of research exploration
4135
+ time_limit (Optional[int]): Time limit in seconds for research
4136
+ max_urls (Optional[int]): Maximum number of URLs to process
4137
+ analysis_prompt (Optional[str]): Custom prompt for analysis
4138
+ system_prompt (Optional[str]): Custom system prompt
4139
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4140
+
4141
+ Returns:
4142
+ Dict[str, Any]: A response containing:
4143
+ * success (bool): Whether the research initiation was successful
4144
+ * id (str): The unique identifier for the research job
4145
+ * error (str, optional): Error message if initiation failed
4146
+
4147
+ Raises:
4148
+ Exception: If the research initiation fails.
4149
+ """
4150
+ research_params = {}
4151
+ if max_depth is not None:
4152
+ research_params['maxDepth'] = max_depth
4153
+ if time_limit is not None:
4154
+ research_params['timeLimit'] = time_limit
4155
+ if max_urls is not None:
4156
+ research_params['maxUrls'] = max_urls
4157
+ if analysis_prompt is not None:
4158
+ research_params['analysisPrompt'] = analysis_prompt
4159
+ if system_prompt is not None:
4160
+ research_params['systemPrompt'] = system_prompt
4161
+ if __experimental_stream_steps is not None:
4162
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
4163
+ research_params = DeepResearchParams(**research_params)
4164
+
4165
+ headers = self._prepare_headers()
4166
+
4167
+ json_data = {'query': query, **research_params.dict(exclude_none=True)}
4168
+ json_data['origin'] = f"python-sdk@{version}"
4169
+
4170
+ try:
4171
+ return await self._async_post_request(
4172
+ f'{self.api_url}/v1/deep-research',
4173
+ json_data,
4174
+ headers
4175
+ )
4176
+ except Exception as e:
4177
+ raise ValueError(str(e))
4178
+
4179
+ async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
4180
+ """
4181
+ Check the status of a deep research operation.
4182
+
4183
+ Args:
4184
+ id (str): The ID of the deep research operation.
4185
+
4186
+ Returns:
4187
+ DeepResearchResponse containing:
4188
+
4189
+ Status:
4190
+ * success - Whether research completed successfully
4191
+ * status - Current state (processing/completed/failed)
4192
+ * error - Error message if failed
4193
+
4194
+ Results:
4195
+ * id - Unique identifier for the research job
4196
+ * data - Research findings and analysis
4197
+ * sources - List of discovered sources
4198
+ * activities - Research progress log
4199
+ * summaries - Generated research summaries
4200
+
4201
+ Raises:
4202
+ Exception: If the status check fails.
4203
+ """
4204
+ headers = self._prepare_headers()
4205
+ try:
4206
+ return await self._async_get_request(
4207
+ f'{self.api_url}/v1/deep-research/{id}',
4208
+ headers
4209
+ )
4210
+ except Exception as e:
4211
+ raise ValueError(str(e))
4212
+
4213
+ async def search(
4214
+ self,
4215
+ query: str,
4216
+ *,
4217
+ limit: Optional[int] = None,
4218
+ tbs: Optional[str] = None,
4219
+ filter: Optional[str] = None,
4220
+ lang: Optional[str] = None,
4221
+ country: Optional[str] = None,
4222
+ location: Optional[str] = None,
4223
+ timeout: Optional[int] = None,
4224
+ scrape_options: Optional[ScrapeOptions] = None,
4225
+ params: Optional[Union[Dict[str, Any], SearchParams]] = None,
4226
+ **kwargs) -> SearchResponse:
4227
+ """
4228
+ Asynchronously search for content using Firecrawl.
4229
+
4230
+ Args:
4231
+ query (str): Search query string
4232
+ limit (Optional[int]): Max results (default: 5)
4233
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
4234
+ filter (Optional[str]): Custom result filter
4235
+ lang (Optional[str]): Language code (default: "en")
4236
+ country (Optional[str]): Country code (default: "us")
4237
+ location (Optional[str]): Geo-targeting
4238
+ timeout (Optional[int]): Request timeout in milliseconds
4239
+ scrape_options (Optional[ScrapeOptions]): Result scraping configuration
4240
+ params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
4241
+ **kwargs: Additional keyword arguments for future compatibility
4242
+
4243
+ Returns:
4244
+ SearchResponse: Response containing:
4245
+ * success (bool): Whether request succeeded
4246
+ * data (List[FirecrawlDocument]): Search results
4247
+ * warning (Optional[str]): Warning message if any
4248
+ * error (Optional[str]): Error message if any
4249
+
4250
+ Raises:
4251
+ Exception: If search fails or response cannot be parsed
4252
+ """
4253
+ # Build search parameters
4254
+ search_params = {}
4255
+ if params:
4256
+ if isinstance(params, dict):
4257
+ search_params.update(params)
4258
+ else:
4259
+ search_params.update(params.dict(exclude_none=True))
4260
+
4261
+ # Add individual parameters
4262
+ if limit is not None:
4263
+ search_params['limit'] = limit
4264
+ if tbs is not None:
4265
+ search_params['tbs'] = tbs
4266
+ if filter is not None:
4267
+ search_params['filter'] = filter
4268
+ if lang is not None:
4269
+ search_params['lang'] = lang
4270
+ if country is not None:
4271
+ search_params['country'] = country
4272
+ if location is not None:
4273
+ search_params['location'] = location
4274
+ if timeout is not None:
4275
+ search_params['timeout'] = timeout
4276
+ if scrape_options is not None:
4277
+ search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
4278
+
4279
+ # Add any additional kwargs
4280
+ search_params.update(kwargs)
4281
+
4282
+ # Create final params object
4283
+ final_params = SearchParams(query=query, **search_params)
4284
+ params_dict = final_params.dict(exclude_none=True)
4285
+ params_dict['origin'] = f"python-sdk@{version}"
4286
+
4287
+ return await self._async_post_request(
4288
+ f"{self.api_url}/v1/search",
4289
+ params_dict,
4290
+ {"Authorization": f"Bearer {self.api_key}"}
4291
+ )
4292
+
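A sketch of the async search call above; it returns the parsed /v1/search payload, so results are read as plain dictionaries here. The query and key are placeholders, and the tbs value simply restricts results to the past week.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    results = await app.search(
        "firecrawl python sdk",  # placeholder query
        limit=5,
        tbs="qdr:w",
        lang="en",
        country="us",
    )
    for item in results.get("data", []):
        print(item)

asyncio.run(main())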
4293
+ class AsyncCrawlWatcher(CrawlWatcher):
4294
+ """
4295
+ Async version of CrawlWatcher that properly handles async operations.
4296
+ """
4297
+ def __init__(self, id: str, app: AsyncFirecrawlApp):
4298
+ super().__init__(id, app)
4299
+
4300
+ async def connect(self) -> None:
4301
+ """
4302
+ Establishes async WebSocket connection and starts listening for messages.
4303
+ """
4304
+ async with websockets.connect(
4305
+ self.ws_url,
4306
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
4307
+ ) as websocket:
4308
+ await self._listen(websocket)
4309
+
4310
+ async def _listen(self, websocket) -> None:
4311
+ """
4312
+ Listens for incoming WebSocket messages and handles them asynchronously.
4313
+
4314
+ Args:
4315
+ websocket: The WebSocket connection object
4316
+ """
4317
+ async for message in websocket:
4318
+ msg = json.loads(message)
4319
+ await self._handle_message(msg)
4320
+
4321
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
4322
+ """
4323
+ Handles incoming WebSocket messages based on their type asynchronously.
4324
+
4325
+ Args:
4326
+ msg (Dict[str, Any]): The message to handle
4327
+ """
4328
+ if msg['type'] == 'done':
4329
+ self.status = 'completed'
4330
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
4331
+ elif msg['type'] == 'error':
4332
+ self.status = 'failed'
4333
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
4334
+ elif msg['type'] == 'catchup':
4335
+ self.status = msg['data']['status']
4336
+ self.data.extend(msg['data'].get('data', []))
4337
+ for doc in self.data:
4338
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
4339
+ elif msg['type'] == 'document':
4340
+ self.data.append(msg['data'])
4341
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
4342
+
4343
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
4344
+ """
4345
+ Handle errors from async API responses.
4346
+ """
4347
+ try:
4348
+ error_data = await response.json()
4349
+ error_message = error_data.get('error', 'No error message provided.')
4350
+ error_details = error_data.get('details', 'No additional error details provided.')
4351
+ except:
4352
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
4353
+
4354
+ # Use the app's method to get the error message
4355
+ message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
4356
+
4357
+ raise aiohttp.ClientError(message)
4358
+
4359
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
4360
+ """
4361
+ Generate a standardized error message based on HTTP status code for async operations.
4362
+
4363
+ Args:
4364
+ status_code (int): The HTTP status code from the response
4365
+ action (str): Description of the action that was being performed
4366
+ error_message (str): The error message from the API response
4367
+ error_details (str): Additional error details from the API response
4368
+
4369
+ Returns:
4370
+ str: A formatted error message
4371
+ """
4372
+ return self._get_error_message(status_code, action, error_message, error_details)