firecrawl 2.5.2__py3-none-any.whl → 2.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


@@ -0,0 +1,4439 @@
1
+ """
2
+ FirecrawlApp Module
3
+
4
+ This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
5
+ It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
6
+ and check the status of these jobs. The module uses requests for HTTP communication
7
+ and handles retries for certain HTTP status codes.
8
+
9
+ Classes:
10
+ - FirecrawlApp: Main class for interacting with the Firecrawl API.
11
+ """
12
+ import logging
13
+ import os
14
+ import time
15
+ from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
16
+ import json
17
+ from datetime import datetime
18
+ import re
19
+ import warnings
20
+ import requests
21
+ import pydantic
22
+ import websockets
23
+ import aiohttp
24
+ import asyncio
25
+ from pydantic import Field
26
+
27
+ # Suppress Pydantic warnings about attribute shadowing
28
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
29
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
30
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
31
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
32
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingOptions\" shadows an attribute in parent \"BaseModel\"")
33
+
34
+ def get_version():
35
+ try:
36
+ from pathlib import Path
37
+ package_path = os.path.dirname(__file__)
38
+ version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
39
+ version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
40
+ if version_match:
41
+ return version_match.group(1).strip()
42
+ except Exception:
43
+ print("Failed to get version from __init__.py")
44
+ return None
45
+
46
+ version = get_version()
47
+
48
+ logger : logging.Logger = logging.getLogger("firecrawl")
49
+
50
+ T = TypeVar('T')
51
+
52
+ # class FirecrawlDocumentMetadata(pydantic.BaseModel):
53
+ # """Metadata for a Firecrawl document."""
54
+ # title: Optional[str] = None
55
+ # description: Optional[str] = None
56
+ # language: Optional[str] = None
57
+ # keywords: Optional[str] = None
58
+ # robots: Optional[str] = None
59
+ # ogTitle: Optional[str] = None
60
+ # ogDescription: Optional[str] = None
61
+ # ogUrl: Optional[str] = None
62
+ # ogImage: Optional[str] = None
63
+ # ogAudio: Optional[str] = None
64
+ # ogDeterminer: Optional[str] = None
65
+ # ogLocale: Optional[str] = None
66
+ # ogLocaleAlternate: Optional[List[str]] = None
67
+ # ogSiteName: Optional[str] = None
68
+ # ogVideo: Optional[str] = None
69
+ # dctermsCreated: Optional[str] = None
70
+ # dcDateCreated: Optional[str] = None
71
+ # dcDate: Optional[str] = None
72
+ # dctermsType: Optional[str] = None
73
+ # dcType: Optional[str] = None
74
+ # dctermsAudience: Optional[str] = None
75
+ # dctermsSubject: Optional[str] = None
76
+ # dcSubject: Optional[str] = None
77
+ # dcDescription: Optional[str] = None
78
+ # dctermsKeywords: Optional[str] = None
79
+ # modifiedTime: Optional[str] = None
80
+ # publishedTime: Optional[str] = None
81
+ # articleTag: Optional[str] = None
82
+ # articleSection: Optional[str] = None
83
+ # sourceURL: Optional[str] = None
84
+ # statusCode: Optional[int] = None
85
+ # error: Optional[str] = None
86
+
87
+ class AgentOptions(pydantic.BaseModel):
88
+ """Configuration for the agent."""
89
+ model: Literal["FIRE-1"] = "FIRE-1"
90
+ prompt: Optional[str] = None
91
+
92
+ class AgentOptionsExtract(pydantic.BaseModel):
93
+ """Configuration for the agent in extract operations."""
94
+ model: Literal["FIRE-1"] = "FIRE-1"
95
+
96
+ class ActionsResult(pydantic.BaseModel):
97
+ """Result of actions performed during scraping."""
98
+ screenshots: List[str]
99
+
100
+ class ChangeTrackingData(pydantic.BaseModel):
101
+ """
102
+ Data for the change tracking format.
103
+ """
104
+ previousScrapeAt: Optional[str] = None
105
+ changeStatus: str # "new" | "same" | "changed" | "removed"
106
+ visibility: str # "visible" | "hidden"
107
+ diff: Optional[Dict[str, Any]] = None
108
+ json: Optional[Any] = None
109
+
110
+ class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
111
+ """Document retrieved or processed by Firecrawl."""
112
+ url: Optional[str] = None
113
+ markdown: Optional[str] = None
114
+ html: Optional[str] = None
115
+ rawHtml: Optional[str] = None
116
+ links: Optional[List[str]] = None
117
+ extract: Optional[T] = None
118
+ json: Optional[T] = None
119
+ screenshot: Optional[str] = None
120
+ metadata: Optional[Any] = None
121
+ actions: Optional[ActionsResult] = None
122
+ title: Optional[str] = None # v1 search only
123
+ description: Optional[str] = None # v1 search only
124
+ changeTracking: Optional[ChangeTrackingData] = None
125
+
126
+ class LocationConfig(pydantic.BaseModel):
127
+ """Location configuration for scraping."""
128
+ country: Optional[str] = None
129
+ languages: Optional[List[str]] = None
130
+
131
+ class WebhookConfig(pydantic.BaseModel):
132
+ """Configuration for webhooks."""
133
+ url: str
134
+ headers: Optional[Dict[str, str]] = None
135
+ metadata: Optional[Dict[str, str]] = None
136
+ events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
137
+
138
+ class ChangeTrackingOptions(pydantic.BaseModel):
139
+ """Configuration for change tracking."""
140
+ modes: Optional[List[Literal["git-diff", "json"]]] = None
141
+ schema: Optional[Any] = None
142
+ prompt: Optional[str] = None
143
+
144
+ class ScrapeOptions(pydantic.BaseModel):
145
+ """Parameters for scraping operations."""
146
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
147
+ headers: Optional[Dict[str, str]] = None
148
+ includeTags: Optional[List[str]] = None
149
+ excludeTags: Optional[List[str]] = None
150
+ onlyMainContent: Optional[bool] = None
151
+ waitFor: Optional[int] = None
152
+ timeout: Optional[int] = None
153
+ location: Optional[LocationConfig] = None
154
+ mobile: Optional[bool] = None
155
+ skipTlsVerification: Optional[bool] = None
156
+ removeBase64Images: Optional[bool] = None
157
+ blockAds: Optional[bool] = None
158
+ proxy: Optional[Literal["basic", "stealth"]] = None
159
+ changeTrackingOptions: Optional[ChangeTrackingOptions] = None
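# Illustrative sketch (annotation, not part of the packaged source): one way the
# ScrapeOptions model above might be populated; the field values are made-up examples.
example_scrape_options = ScrapeOptions(
    formats=["markdown", "links"],
    onlyMainContent=True,
    waitFor=1000,        # milliseconds
    blockAds=True,
)
# The SDK serializes these models with .dict(exclude_none=True) before sending them.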
160
+
161
+ class WaitAction(pydantic.BaseModel):
162
+ """Wait action to perform during scraping."""
163
+ type: Literal["wait"]
164
+ milliseconds: int
165
+ selector: Optional[str] = None
166
+
167
+ class ScreenshotAction(pydantic.BaseModel):
168
+ """Screenshot action to perform during scraping."""
169
+ type: Literal["screenshot"]
170
+ fullPage: Optional[bool] = None
171
+
172
+ class ClickAction(pydantic.BaseModel):
173
+ """Click action to perform during scraping."""
174
+ type: Literal["click"]
175
+ selector: str
176
+
177
+ class WriteAction(pydantic.BaseModel):
178
+ """Write action to perform during scraping."""
179
+ type: Literal["write"]
180
+ text: str
181
+
182
+ class PressAction(pydantic.BaseModel):
183
+ """Press action to perform during scraping."""
184
+ type: Literal["press"]
185
+ key: str
186
+
187
+ class ScrollAction(pydantic.BaseModel):
188
+ """Scroll action to perform during scraping."""
189
+ type: Literal["scroll"]
190
+ direction: Literal["up", "down"]
191
+ selector: Optional[str] = None
192
+
193
+ class ScrapeAction(pydantic.BaseModel):
194
+ """Scrape action to perform during scraping."""
195
+ type: Literal["scrape"]
196
+
197
+ class ExecuteJavascriptAction(pydantic.BaseModel):
198
+ """Execute javascript action to perform during scraping."""
199
+ type: Literal["executeJavascript"]
200
+ script: str
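# Illustrative sketch (annotation, not part of the packaged source): a browser action
# sequence built from the action models above; selectors and timings are placeholders.
example_actions = [
    WaitAction(type="wait", milliseconds=2000),
    ClickAction(type="click", selector="#load-more"),
    ScrollAction(type="scroll", direction="down"),
    ScreenshotAction(type="screenshot", fullPage=True),
]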
201
+
202
+
203
+ class ExtractAgent(pydantic.BaseModel):
204
+ """Configuration for the agent in extract operations."""
205
+ model: Literal["FIRE-1"] = "FIRE-1"
206
+
207
+ class JsonConfig(pydantic.BaseModel):
208
+ """Configuration for extraction."""
209
+ prompt: Optional[str] = None
210
+ schema: Optional[Any] = None
211
+ systemPrompt: Optional[str] = None
212
+ agent: Optional[ExtractAgent] = None
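# Illustrative sketch (annotation, not part of the packaged source): JsonConfig can
# carry a prompt and/or a JSON schema; the schema below is a made-up example.
example_json_config = JsonConfig(
    prompt="Extract the product name and price from the page.",
    schema={
        "type": "object",
        "properties": {"name": {"type": "string"}, "price": {"type": "number"}},
        "required": ["name", "price"],
    },
)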
213
+
214
+ class ScrapeParams(ScrapeOptions):
215
+ """Parameters for scraping operations."""
216
+ extract: Optional[JsonConfig] = None
217
+ jsonOptions: Optional[JsonConfig] = None
218
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
219
+ agent: Optional[AgentOptions] = None
220
+ webhook: Optional[WebhookConfig] = None
221
+
222
+ class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
223
+ """Response from scraping operations."""
224
+ success: bool = True
225
+ warning: Optional[str] = None
226
+ error: Optional[str] = None
227
+
228
+ class BatchScrapeResponse(pydantic.BaseModel):
229
+ """Response from batch scrape operations."""
230
+ id: Optional[str] = None
231
+ url: Optional[str] = None
232
+ success: bool = True
233
+ error: Optional[str] = None
234
+ invalidURLs: Optional[List[str]] = None
235
+
236
+ class BatchScrapeStatusResponse(pydantic.BaseModel):
237
+ """Response from batch scrape status checks."""
238
+ success: bool = True
239
+ status: Literal["scraping", "completed", "failed", "cancelled"]
240
+ completed: int
241
+ total: int
242
+ creditsUsed: int
243
+ expiresAt: datetime
244
+ next: Optional[str] = None
245
+ data: List[FirecrawlDocument]
246
+
247
+ class CrawlParams(pydantic.BaseModel):
248
+ """Parameters for crawling operations."""
249
+ includePaths: Optional[List[str]] = None
250
+ excludePaths: Optional[List[str]] = None
251
+ maxDepth: Optional[int] = None
252
+ maxDiscoveryDepth: Optional[int] = None
253
+ limit: Optional[int] = None
254
+ allowBackwardLinks: Optional[bool] = None
255
+ allowExternalLinks: Optional[bool] = None
256
+ ignoreSitemap: Optional[bool] = None
257
+ scrapeOptions: Optional[ScrapeOptions] = None
258
+ webhook: Optional[Union[str, WebhookConfig]] = None
259
+ deduplicateSimilarURLs: Optional[bool] = None
260
+ ignoreQueryParameters: Optional[bool] = None
261
+ regexOnFullURL: Optional[bool] = None
262
+
263
+ class CrawlResponse(pydantic.BaseModel):
264
+ """Response from crawling operations."""
265
+ id: Optional[str] = None
266
+ url: Optional[str] = None
267
+ success: bool = True
268
+ error: Optional[str] = None
269
+
270
+ class CrawlStatusResponse(pydantic.BaseModel):
271
+ """Response from crawl status checks."""
272
+ success: bool = True
273
+ status: Literal["scraping", "completed", "failed", "cancelled"]
274
+ completed: int
275
+ total: int
276
+ creditsUsed: int
277
+ expiresAt: datetime
278
+ next: Optional[str] = None
279
+ data: List[FirecrawlDocument]
280
+
281
+ class CrawlErrorsResponse(pydantic.BaseModel):
282
+ """Response from crawl/batch scrape error monitoring."""
283
+ errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str}
284
+ robotsBlocked: List[str]
285
+
286
+ class MapParams(pydantic.BaseModel):
287
+ """Parameters for mapping operations."""
288
+ search: Optional[str] = None
289
+ ignoreSitemap: Optional[bool] = None
290
+ includeSubdomains: Optional[bool] = None
291
+ sitemapOnly: Optional[bool] = None
292
+ limit: Optional[int] = None
293
+ timeout: Optional[int] = None
294
+
295
+ class MapResponse(pydantic.BaseModel):
296
+ """Response from mapping operations."""
297
+ success: bool = True
298
+ links: Optional[List[str]] = None
299
+ error: Optional[str] = None
300
+
301
+ class ExtractParams(pydantic.BaseModel):
302
+ """Parameters for extracting information from URLs."""
303
+ prompt: Optional[str] = None
304
+ schema: Optional[Any] = None
305
+ systemPrompt: Optional[str] = None
306
+ allowExternalLinks: Optional[bool] = None
307
+ enableWebSearch: Optional[bool] = None
308
+ includeSubdomains: Optional[bool] = None
309
+ origin: Optional[str] = None
310
+ showSources: Optional[bool] = None
311
+ scrapeOptions: Optional[ScrapeOptions] = None
312
+
313
+ class ExtractResponse(pydantic.BaseModel, Generic[T]):
314
+ """Response from extract operations."""
315
+ id: Optional[str] = None
316
+ status: Optional[Literal["processing", "completed", "failed"]] = None
317
+ expiresAt: Optional[datetime] = None
318
+ success: bool = True
319
+ data: Optional[T] = None
320
+ error: Optional[str] = None
321
+ warning: Optional[str] = None
322
+ sources: Optional[List[str]] = None
323
+
324
+ class SearchParams(pydantic.BaseModel):
325
+ query: str
326
+ limit: Optional[int] = 5
327
+ tbs: Optional[str] = None
328
+ filter: Optional[str] = None
329
+ lang: Optional[str] = "en"
330
+ country: Optional[str] = "us"
331
+ location: Optional[str] = None
332
+ origin: Optional[str] = "api"
333
+ timeout: Optional[int] = 60000
334
+ scrapeOptions: Optional[ScrapeOptions] = None
335
+
336
+ class SearchResponse(pydantic.BaseModel):
337
+ """Response from search operations."""
338
+ success: bool = True
339
+ data: List[FirecrawlDocument]
340
+ warning: Optional[str] = None
341
+ error: Optional[str] = None
342
+
343
+ class GenerateLLMsTextParams(pydantic.BaseModel):
344
+ """
345
+ Parameters for the LLMs.txt generation operation.
346
+ """
347
+ maxUrls: Optional[int] = 10
348
+ showFullText: Optional[bool] = False
349
+ __experimental_stream: Optional[bool] = None
350
+
351
+ class DeepResearchParams(pydantic.BaseModel):
352
+ """
353
+ Parameters for the deep research operation.
354
+ """
355
+ maxDepth: Optional[int] = 7
356
+ timeLimit: Optional[int] = 270
357
+ maxUrls: Optional[int] = 20
358
+ analysisPrompt: Optional[str] = None
359
+ systemPrompt: Optional[str] = None
360
+ __experimental_streamSteps: Optional[bool] = None
361
+
362
+ class DeepResearchResponse(pydantic.BaseModel):
363
+ """
364
+ Response from the deep research operation.
365
+ """
366
+ success: bool
367
+ id: str
368
+ error: Optional[str] = None
369
+
370
+ class DeepResearchStatusResponse(pydantic.BaseModel):
371
+ """
372
+ Status response from the deep research operation.
373
+ """
374
+ success: bool
375
+ data: Optional[Dict[str, Any]] = None
376
+ status: str
377
+ error: Optional[str] = None
378
+ expiresAt: str
379
+ currentDepth: int
380
+ maxDepth: int
381
+ activities: List[Dict[str, Any]]
382
+ sources: List[Dict[str, Any]]
383
+ summaries: List[str]
384
+
385
+ class GenerateLLMsTextResponse(pydantic.BaseModel):
386
+ """Response from LLMs.txt generation operations."""
387
+ success: bool = True
388
+ id: str
389
+ error: Optional[str] = None
390
+
391
+ class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
392
+ llmstxt: str
393
+ llmsfulltxt: Optional[str] = None
394
+
395
+ class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
396
+ """Status response from LLMs.txt generation operations."""
397
+ success: bool = True
398
+ data: Optional[GenerateLLMsTextStatusResponseData] = None
399
+ status: Literal["processing", "completed", "failed"]
400
+ error: Optional[str] = None
401
+ expiresAt: str
402
+
403
+ class SearchResponse(pydantic.BaseModel):
404
+ """
405
+ Response from the search operation.
406
+ """
407
+ success: bool
408
+ data: List[Dict[str, Any]]
409
+ warning: Optional[str] = None
410
+ error: Optional[str] = None
411
+
412
+ class ExtractParams(pydantic.BaseModel):
413
+ """
414
+ Parameters for the extract operation.
415
+ """
416
+ prompt: Optional[str] = None
417
+ schema: Optional[Any] = pydantic.Field(None, alias='schema')
418
+ system_prompt: Optional[str] = None
419
+ allow_external_links: Optional[bool] = False
420
+ enable_web_search: Optional[bool] = False
421
+ # Just for backwards compatibility
422
+ enableWebSearch: Optional[bool] = False
423
+ show_sources: Optional[bool] = False
424
+ agent: Optional[Dict[str, Any]] = None
425
+
426
+ class FirecrawlApp:
427
+ def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
428
+ """
429
+ Initialize the FirecrawlApp instance with an API key and API URL.
430
+
431
+ Args:
432
+ api_key (Optional[str]): API key for authenticating with the Firecrawl API.
433
+ api_url (Optional[str]): Base URL for the Firecrawl API.
434
+ """
435
+ self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
436
+ self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
437
+
438
+ # Only require API key when using cloud service
439
+ if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
440
+ logger.warning("No API key provided for cloud service")
441
+ raise ValueError('No API key provided')
442
+
443
+ logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
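# Illustrative sketch (annotation, not part of the packaged source): the client can be
# constructed from an explicit key or from the FIRECRAWL_API_KEY environment variable.
# The key and URL below are placeholders.
app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
# For a self-hosted instance no API key is required, per the check above:
# app = FirecrawlApp(api_url="http://localhost:3002")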
444
+
445
+ def scrape_url(
446
+ self,
447
+ url: str,
448
+ *,
449
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
450
+ include_tags: Optional[List[str]] = None,
451
+ exclude_tags: Optional[List[str]] = None,
452
+ only_main_content: Optional[bool] = None,
453
+ wait_for: Optional[int] = None,
454
+ timeout: Optional[int] = None,
455
+ location: Optional[LocationConfig] = None,
456
+ mobile: Optional[bool] = None,
457
+ skip_tls_verification: Optional[bool] = None,
458
+ remove_base64_images: Optional[bool] = None,
459
+ block_ads: Optional[bool] = None,
460
+ proxy: Optional[Literal["basic", "stealth"]] = None,
461
+ extract: Optional[JsonConfig] = None,
462
+ json_options: Optional[JsonConfig] = None,
463
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
464
+ change_tracking_options: Optional[ChangeTrackingOptions] = None,
465
+ **kwargs) -> ScrapeResponse[Any]:
466
+ """
467
+ Scrape and extract content from a URL.
468
+
469
+ Args:
470
+ url (str): Target URL to scrape
471
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]]): Content types to retrieve (markdown/html/etc)
472
+ include_tags (Optional[List[str]]): HTML tags to include
473
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
474
+ only_main_content (Optional[bool]): Extract main content only
475
+ wait_for (Optional[int]): Wait time in milliseconds
476
+ timeout (Optional[int]): Request timeout (ms)
477
+ location (Optional[LocationConfig]): Location configuration
478
+ mobile (Optional[bool]): Use mobile user agent
479
+ skip_tls_verification (Optional[bool]): Skip TLS verification
480
+ remove_base64_images (Optional[bool]): Remove base64 images
481
+ block_ads (Optional[bool]): Block ads
482
+ proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
483
+ extract (Optional[JsonConfig]): Content extraction settings
484
+ json_options (Optional[JsonConfig]): JSON extraction settings
485
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
486
+ change_tracking_options (Optional[ChangeTrackingOptions]): Change tracking settings
487
+
488
+
489
+ Returns:
490
+ ScrapeResponse with:
491
+ * Requested content formats
492
+ * Page metadata
493
+ * Extraction results
494
+ * Success/error status
495
+
496
+ Raises:
497
+ Exception: If scraping fails
498
+ """
499
+ headers = self._prepare_headers()
500
+
501
+ # Build scrape parameters
502
+ scrape_params = {
503
+ 'url': url,
504
+ 'origin': f"python-sdk@{version}"
505
+ }
506
+
507
+ # Add optional parameters if provided
508
+ if formats:
509
+ scrape_params['formats'] = formats
510
+ if include_tags:
511
+ scrape_params['includeTags'] = include_tags
512
+ if exclude_tags:
513
+ scrape_params['excludeTags'] = exclude_tags
514
+ if only_main_content is not None:
515
+ scrape_params['onlyMainContent'] = only_main_content
516
+ if wait_for:
517
+ scrape_params['waitFor'] = wait_for
518
+ if timeout:
519
+ scrape_params['timeout'] = timeout
520
+ if location:
521
+ scrape_params['location'] = location.dict(exclude_none=True)
522
+ if mobile is not None:
523
+ scrape_params['mobile'] = mobile
524
+ if skip_tls_verification is not None:
525
+ scrape_params['skipTlsVerification'] = skip_tls_verification
526
+ if remove_base64_images is not None:
527
+ scrape_params['removeBase64Images'] = remove_base64_images
528
+ if block_ads is not None:
529
+ scrape_params['blockAds'] = block_ads
530
+ if proxy:
531
+ scrape_params['proxy'] = proxy
532
+ if extract is not None:
533
+ extract = self._ensure_schema_dict(extract)
534
+ if isinstance(extract, dict) and "schema" in extract:
535
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
536
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
537
+ if json_options is not None:
538
+ json_options = self._ensure_schema_dict(json_options)
539
+ if isinstance(json_options, dict) and "schema" in json_options:
540
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
541
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
542
+ if actions:
543
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
544
+ if change_tracking_options:
545
+ scrape_params['changeTrackingOptions'] = change_tracking_options if isinstance(change_tracking_options, dict) else change_tracking_options.dict(exclude_none=True)
546
+
547
+ scrape_params.update(kwargs)
548
+
549
+ if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
550
+ scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
551
+ if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
552
+ scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
553
+
554
+ # Make request
555
+ response = requests.post(
556
+ f'{self.api_url}/v1/scrape',
557
+ headers=headers,
558
+ json=scrape_params,
559
+ timeout=(timeout + 5000 if timeout else None)
560
+ )
561
+
562
+ if response.status_code == 200:
563
+ try:
564
+ response_json = response.json()
565
+ if response_json.get('success') and 'data' in response_json:
566
+ return ScrapeResponse(**response_json['data'])
567
+ elif "error" in response_json:
568
+ raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
569
+ else:
570
+ raise Exception(f'Failed to scrape URL. Error: {response_json}')
571
+ except ValueError:
572
+ raise Exception('Failed to parse Firecrawl response as JSON.')
573
+ else:
574
+ self._handle_error(response, 'scrape URL')
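# Illustrative usage sketch (annotation, not part of the packaged source). Assumes a
# FirecrawlApp instance `app` and a valid API key; the URL is a placeholder.
doc = app.scrape_url(
    "https://example.com",
    formats=["markdown", "links"],
    only_main_content=True,
    timeout=30000,  # milliseconds, per the docstring above
)
print(doc.markdown)   # requested formats come back as fields of ScrapeResponse
print(doc.metadata)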
575
+
576
+ def search(
577
+ self,
578
+ query: str,
579
+ *,
580
+ limit: Optional[int] = None,
581
+ tbs: Optional[str] = None,
582
+ filter: Optional[str] = None,
583
+ lang: Optional[str] = None,
584
+ country: Optional[str] = None,
585
+ location: Optional[str] = None,
586
+ timeout: Optional[int] = None,
587
+ scrape_options: Optional[ScrapeOptions] = None,
588
+ **kwargs) -> SearchResponse:
589
+ """
590
+ Search for content using Firecrawl.
591
+
592
+ Args:
593
+ query (str): Search query string
594
+ limit (Optional[int]): Max results (default: 5)
595
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
596
+ filter (Optional[str]): Custom result filter
597
+ lang (Optional[str]): Language code (default: "en")
598
+ country (Optional[str]): Country code (default: "us")
599
+ location (Optional[str]): Geo-targeting
600
+ timeout (Optional[int]): Request timeout in milliseconds
601
+ scrape_options (Optional[ScrapeOptions]): Result scraping configuration
602
+ **kwargs: Additional keyword arguments for future compatibility
603
+
604
+ Returns:
605
+ SearchResponse: Response containing:
606
+ * success (bool): Whether request succeeded
607
+ * data (List[FirecrawlDocument]): Search results
608
+ * warning (Optional[str]): Warning message if any
609
+ * error (Optional[str]): Error message if any
610
+
611
+ Raises:
612
+ Exception: If search fails or response cannot be parsed
613
+ """
614
+ # Validate any additional kwargs
615
+ self._validate_kwargs(kwargs, "search")
616
+
617
+ # Build search parameters
618
+ search_params = {}
619
+
620
+ # Add individual parameters
621
+ if limit is not None:
622
+ search_params['limit'] = limit
623
+ if tbs is not None:
624
+ search_params['tbs'] = tbs
625
+ if filter is not None:
626
+ search_params['filter'] = filter
627
+ if lang is not None:
628
+ search_params['lang'] = lang
629
+ if country is not None:
630
+ search_params['country'] = country
631
+ if location is not None:
632
+ search_params['location'] = location
633
+ if timeout is not None:
634
+ search_params['timeout'] = timeout
635
+ if scrape_options is not None:
636
+ search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
637
+
638
+ # Add any additional kwargs
639
+ search_params.update(kwargs)
640
+
641
+ # Create final params object
642
+ final_params = SearchParams(query=query, **search_params)
643
+ params_dict = final_params.dict(exclude_none=True)
644
+ params_dict['origin'] = f"python-sdk@{version}"
645
+
646
+ # Make request
647
+ response = requests.post(
648
+ f"{self.api_url}/v1/search",
649
+ headers={"Authorization": f"Bearer {self.api_key}"},
650
+ json=params_dict
651
+ )
652
+
653
+ if response.status_code == 200:
654
+ try:
655
+ response_json = response.json()
656
+ if response_json.get('success') and 'data' in response_json:
657
+ return SearchResponse(**response_json)
658
+ elif "error" in response_json:
659
+ raise Exception(f'Search failed. Error: {response_json["error"]}')
660
+ else:
661
+ raise Exception(f'Search failed. Error: {response_json}')
662
+ except ValueError:
663
+ raise Exception('Failed to parse Firecrawl response as JSON.')
664
+ else:
665
+ self._handle_error(response, 'search')
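# Illustrative usage sketch (annotation, not part of the packaged source). Search
# results can optionally be scraped by passing scrape_options; the query is an example.
results = app.search(
    "firecrawl python sdk",
    limit=3,
    scrape_options=ScrapeOptions(formats=["markdown"]),
)
for item in results.data:
    print(item)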
666
+
667
+ def crawl_url(
668
+ self,
669
+ url: str,
670
+ *,
671
+ include_paths: Optional[List[str]] = None,
672
+ exclude_paths: Optional[List[str]] = None,
673
+ max_depth: Optional[int] = None,
674
+ max_discovery_depth: Optional[int] = None,
675
+ limit: Optional[int] = None,
676
+ allow_backward_links: Optional[bool] = None,
677
+ allow_external_links: Optional[bool] = None,
678
+ ignore_sitemap: Optional[bool] = None,
679
+ scrape_options: Optional[ScrapeOptions] = None,
680
+ webhook: Optional[Union[str, WebhookConfig]] = None,
681
+ deduplicate_similar_urls: Optional[bool] = None,
682
+ ignore_query_parameters: Optional[bool] = None,
683
+ regex_on_full_url: Optional[bool] = None,
684
+ poll_interval: Optional[int] = 2,
685
+ idempotency_key: Optional[str] = None,
686
+ **kwargs
687
+ ) -> CrawlStatusResponse:
688
+ """
689
+ Crawl a website starting from a URL.
690
+
691
+ Args:
692
+ url (str): Target URL to start crawling from
693
+ include_paths (Optional[List[str]]): Patterns of URLs to include
694
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
695
+ max_depth (Optional[int]): Maximum crawl depth
696
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
697
+ limit (Optional[int]): Maximum pages to crawl
698
+ allow_backward_links (Optional[bool]): Follow parent directory links
699
+ allow_external_links (Optional[bool]): Follow external domain links
700
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
701
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
702
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
703
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
704
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
705
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
706
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
707
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
708
+ **kwargs: Additional parameters to pass to the API
709
+
710
+ Returns:
711
+ CrawlStatusResponse with:
712
+ * Crawling status and progress
713
+ * Crawled page contents
714
+ * Success/error information
715
+
716
+ Raises:
717
+ Exception: If crawl fails
718
+ """
719
+ # Validate any additional kwargs
720
+ self._validate_kwargs(kwargs, "crawl_url")
721
+
722
+ crawl_params = {}
723
+
724
+ # Add individual parameters
725
+ if include_paths is not None:
726
+ crawl_params['includePaths'] = include_paths
727
+ if exclude_paths is not None:
728
+ crawl_params['excludePaths'] = exclude_paths
729
+ if max_depth is not None:
730
+ crawl_params['maxDepth'] = max_depth
731
+ if max_discovery_depth is not None:
732
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
733
+ if limit is not None:
734
+ crawl_params['limit'] = limit
735
+ if allow_backward_links is not None:
736
+ crawl_params['allowBackwardLinks'] = allow_backward_links
737
+ if allow_external_links is not None:
738
+ crawl_params['allowExternalLinks'] = allow_external_links
739
+ if ignore_sitemap is not None:
740
+ crawl_params['ignoreSitemap'] = ignore_sitemap
741
+ if scrape_options is not None:
742
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
743
+ if webhook is not None:
744
+ crawl_params['webhook'] = webhook
745
+ if deduplicate_similar_urls is not None:
746
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
747
+ if ignore_query_parameters is not None:
748
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
749
+ if regex_on_full_url is not None:
750
+ crawl_params['regexOnFullURL'] = regex_on_full_url
751
+
752
+ # Add any additional kwargs
753
+ crawl_params.update(kwargs)
754
+
755
+ # Create final params object
756
+ final_params = CrawlParams(**crawl_params)
757
+ params_dict = final_params.dict(exclude_none=True)
758
+ params_dict['url'] = url
759
+ params_dict['origin'] = f"python-sdk@{version}"
760
+
761
+ # Make request
762
+ headers = self._prepare_headers(idempotency_key)
763
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
764
+
765
+ if response.status_code == 200:
766
+ try:
767
+ id = response.json().get('id')
768
+ except:
769
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
770
+ return self._monitor_job_status(id, headers, poll_interval)
771
+ else:
772
+ self._handle_error(response, 'start crawl job')
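# Illustrative usage sketch (annotation, not part of the packaged source). crawl_url
# blocks until the job finishes, polling every poll_interval seconds; values are examples.
crawl_result = app.crawl_url(
    "https://example.com",
    limit=10,
    max_depth=2,
    scrape_options=ScrapeOptions(formats=["markdown"]),
    poll_interval=5,
)
print(crawl_result.status, crawl_result.completed, "/", crawl_result.total)
for page in crawl_result.data:
    print(page.metadata)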
773
+
774
+ def async_crawl_url(
775
+ self,
776
+ url: str,
777
+ *,
778
+ include_paths: Optional[List[str]] = None,
779
+ exclude_paths: Optional[List[str]] = None,
780
+ max_depth: Optional[int] = None,
781
+ max_discovery_depth: Optional[int] = None,
782
+ limit: Optional[int] = None,
783
+ allow_backward_links: Optional[bool] = None,
784
+ allow_external_links: Optional[bool] = None,
785
+ ignore_sitemap: Optional[bool] = None,
786
+ scrape_options: Optional[ScrapeOptions] = None,
787
+ webhook: Optional[Union[str, WebhookConfig]] = None,
788
+ deduplicate_similar_urls: Optional[bool] = None,
789
+ ignore_query_parameters: Optional[bool] = None,
790
+ regex_on_full_url: Optional[bool] = None,
791
+ idempotency_key: Optional[str] = None,
792
+ **kwargs
793
+ ) -> CrawlResponse:
794
+ """
795
+ Start an asynchronous crawl job.
796
+
797
+ Args:
798
+ url (str): Target URL to start crawling from
799
+ include_paths (Optional[List[str]]): Patterns of URLs to include
800
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
801
+ max_depth (Optional[int]): Maximum crawl depth
802
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
803
+ limit (Optional[int]): Maximum pages to crawl
804
+ allow_backward_links (Optional[bool]): Follow parent directory links
805
+ allow_external_links (Optional[bool]): Follow external domain links
806
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
807
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
808
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
809
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
810
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
811
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
812
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
813
+ **kwargs: Additional parameters to pass to the API
814
+
815
+ Returns:
816
+ CrawlResponse with:
817
+ * success - Whether crawl started successfully
818
+ * id - Unique identifier for the crawl job
819
+ * url - Status check URL for the crawl
820
+ * error - Error message if start failed
821
+
822
+ Raises:
823
+ Exception: If crawl initiation fails
824
+ """
825
+ # Validate any additional kwargs
826
+ self._validate_kwargs(kwargs, "async_crawl_url")
827
+
828
+ crawl_params = {}
829
+
830
+ # Add individual parameters
831
+ if include_paths is not None:
832
+ crawl_params['includePaths'] = include_paths
833
+ if exclude_paths is not None:
834
+ crawl_params['excludePaths'] = exclude_paths
835
+ if max_depth is not None:
836
+ crawl_params['maxDepth'] = max_depth
837
+ if max_discovery_depth is not None:
838
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
839
+ if limit is not None:
840
+ crawl_params['limit'] = limit
841
+ if allow_backward_links is not None:
842
+ crawl_params['allowBackwardLinks'] = allow_backward_links
843
+ if allow_external_links is not None:
844
+ crawl_params['allowExternalLinks'] = allow_external_links
845
+ if ignore_sitemap is not None:
846
+ crawl_params['ignoreSitemap'] = ignore_sitemap
847
+ if scrape_options is not None:
848
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
849
+ if webhook is not None:
850
+ crawl_params['webhook'] = webhook
851
+ if deduplicate_similar_urls is not None:
852
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
853
+ if ignore_query_parameters is not None:
854
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
855
+ if regex_on_full_url is not None:
856
+ crawl_params['regexOnFullURL'] = regex_on_full_url
857
+
858
+ # Add any additional kwargs
859
+ crawl_params.update(kwargs)
860
+
861
+ # Create final params object
862
+ final_params = CrawlParams(**crawl_params)
863
+ params_dict = final_params.dict(exclude_none=True)
864
+ params_dict['url'] = url
865
+ params_dict['origin'] = f"python-sdk@{version}"
866
+
867
+ # Make request
868
+ headers = self._prepare_headers(idempotency_key)
869
+ response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
870
+
871
+ if response.status_code == 200:
872
+ try:
873
+ return CrawlResponse(**response.json())
874
+ except:
875
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
876
+ else:
877
+ self._handle_error(response, 'start crawl job')
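# Illustrative usage sketch (annotation, not part of the packaged source). The
# asynchronous variant only starts the job; progress is checked separately.
job = app.async_crawl_url("https://example.com", limit=25)
if job.success and job.id:
    status = app.check_crawl_status(job.id)
    print(status.status, status.completed, "/", status.total)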
878
+
879
+ def check_crawl_status(self, id: str) -> CrawlStatusResponse:
880
+ """
881
+ Check the status and results of a crawl job.
882
+
883
+ Args:
884
+ id: Unique identifier for the crawl job
885
+
886
+ Returns:
887
+ CrawlStatusResponse containing:
888
+
889
+ Status Information:
890
+ * status - Current state (scraping/completed/failed/cancelled)
891
+ * completed - Number of pages crawled
892
+ * total - Total pages to crawl
893
+ * creditsUsed - API credits consumed
894
+ * expiresAt - Data expiration timestamp
895
+
896
+ Results:
897
+ * data - List of crawled documents
898
+ * next - URL for next page of results (if paginated)
899
+ * success - Whether status check succeeded
900
+ * error - Error message if failed
901
+
902
+ Raises:
903
+ Exception: If status check fails
904
+ """
905
+ endpoint = f'/v1/crawl/{id}'
906
+
907
+ headers = self._prepare_headers()
908
+ response = self._get_request(f'{self.api_url}{endpoint}', headers)
909
+ if response.status_code == 200:
910
+ try:
911
+ status_data = response.json()
912
+ except:
913
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
914
+ if status_data['status'] == 'completed':
915
+ if 'data' in status_data:
916
+ data = status_data['data']
917
+ while 'next' in status_data:
918
+ if len(status_data['data']) == 0:
919
+ break
920
+ next_url = status_data.get('next')
921
+ if not next_url:
922
+ logger.warning("Expected 'next' URL is missing.")
923
+ break
924
+ try:
925
+ status_response = self._get_request(next_url, headers)
926
+ if status_response.status_code != 200:
927
+ logger.error(f"Failed to fetch next page: {status_response.status_code}")
928
+ break
929
+ try:
930
+ next_data = status_response.json()
931
+ except:
932
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
933
+ data.extend(next_data.get('data', []))
934
+ status_data = next_data
935
+ except Exception as e:
936
+ logger.error(f"Error during pagination request: {e}")
937
+ break
938
+ status_data['data'] = data
939
+
940
+ response = {
941
+ 'status': status_data.get('status'),
942
+ 'total': status_data.get('total'),
943
+ 'completed': status_data.get('completed'),
944
+ 'creditsUsed': status_data.get('creditsUsed'),
945
+ 'expiresAt': status_data.get('expiresAt'),
946
+ 'data': status_data.get('data')
947
+ }
948
+
949
+ if 'error' in status_data:
950
+ response['error'] = status_data['error']
951
+
952
+ if 'next' in status_data:
953
+ response['next'] = status_data['next']
954
+
955
+ return CrawlStatusResponse(
956
+ success=False if 'error' in status_data else True,
957
+ **response
958
+ )
959
+ else:
960
+ self._handle_error(response, 'check crawl status')
961
+
962
+ def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
963
+ """
964
+ Returns information about crawl errors.
965
+
966
+ Args:
967
+ id (str): The ID of the crawl job
968
+
969
+ Returns:
970
+ CrawlErrorsResponse containing:
971
+ * errors (List[Dict[str, str]]): List of errors with fields:
972
+ - id (str): Error ID
973
+ - timestamp (str): When the error occurred
974
+ - url (str): URL that caused the error
975
+ - error (str): Error message
976
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
977
+
978
+ Raises:
979
+ Exception: If error check fails
980
+ """
981
+ headers = self._prepare_headers()
982
+ response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
983
+ if response.status_code == 200:
984
+ try:
985
+ return CrawlErrorsResponse(**response.json())
986
+ except:
987
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
988
+ else:
989
+ self._handle_error(response, "check crawl errors")
990
+
991
+ def cancel_crawl(self, id: str) -> Dict[str, Any]:
992
+ """
993
+ Cancel an asynchronous crawl job.
994
+
995
+ Args:
996
+ id (str): The ID of the crawl job to cancel
997
+
998
+ Returns:
999
+ Dict[str, Any] containing:
1000
+ * success (bool): Whether cancellation was successful
1001
+ * error (str, optional): Error message if cancellation failed
1002
+
1003
+ Raises:
1004
+ Exception: If cancellation fails
1005
+ """
1006
+ headers = self._prepare_headers()
1007
+ response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
1008
+ if response.status_code == 200:
1009
+ try:
1010
+ return response.json()
1011
+ except:
1012
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1013
+ else:
1014
+ self._handle_error(response, "cancel crawl job")
1015
+
1016
+ def crawl_url_and_watch(
1017
+ self,
1018
+ url: str,
1019
+ *,
1020
+ include_paths: Optional[List[str]] = None,
1021
+ exclude_paths: Optional[List[str]] = None,
1022
+ max_depth: Optional[int] = None,
1023
+ max_discovery_depth: Optional[int] = None,
1024
+ limit: Optional[int] = None,
1025
+ allow_backward_links: Optional[bool] = None,
1026
+ allow_external_links: Optional[bool] = None,
1027
+ ignore_sitemap: Optional[bool] = None,
1028
+ scrape_options: Optional[ScrapeOptions] = None,
1029
+ webhook: Optional[Union[str, WebhookConfig]] = None,
1030
+ deduplicate_similar_urls: Optional[bool] = None,
1031
+ ignore_query_parameters: Optional[bool] = None,
1032
+ regex_on_full_url: Optional[bool] = None,
1033
+ idempotency_key: Optional[str] = None,
1034
+ **kwargs
1035
+ ) -> 'CrawlWatcher':
1036
+ """
1037
+ Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
1038
+
1039
+ Args:
1040
+ url (str): Target URL to start crawling from
1041
+ include_paths (Optional[List[str]]): Patterns of URLs to include
1042
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
1043
+ max_depth (Optional[int]): Maximum crawl depth
1044
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
1045
+ limit (Optional[int]): Maximum pages to crawl
1046
+ allow_backward_links (Optional[bool]): Follow parent directory links
1047
+ allow_external_links (Optional[bool]): Follow external domain links
1048
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1049
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
1050
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
1051
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
1052
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
1053
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
1054
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1055
+ **kwargs: Additional parameters to pass to the API
1056
+
1057
+ Returns:
1058
+ CrawlWatcher: An instance to monitor the crawl job via WebSocket
1059
+
1060
+ Raises:
1061
+ Exception: If crawl job fails to start
1062
+ """
1063
+ crawl_response = self.async_crawl_url(
1064
+ url,
1065
+ include_paths=include_paths,
1066
+ exclude_paths=exclude_paths,
1067
+ max_depth=max_depth,
1068
+ max_discovery_depth=max_discovery_depth,
1069
+ limit=limit,
1070
+ allow_backward_links=allow_backward_links,
1071
+ allow_external_links=allow_external_links,
1072
+ ignore_sitemap=ignore_sitemap,
1073
+ scrape_options=scrape_options,
1074
+ webhook=webhook,
1075
+ deduplicate_similar_urls=deduplicate_similar_urls,
1076
+ ignore_query_parameters=ignore_query_parameters,
1077
+ regex_on_full_url=regex_on_full_url,
1078
+ idempotency_key=idempotency_key,
1079
+ **kwargs
1080
+ )
1081
+ if crawl_response.success and crawl_response.id:
1082
+ return CrawlWatcher(crawl_response.id, self)
1083
+ else:
1084
+ raise Exception("Crawl job failed to start")
1085
+
1086
+ def map_url(
1087
+ self,
1088
+ url: str,
1089
+ *,
1090
+ search: Optional[str] = None,
1091
+ ignore_sitemap: Optional[bool] = None,
1092
+ include_subdomains: Optional[bool] = None,
1093
+ sitemap_only: Optional[bool] = None,
1094
+ limit: Optional[int] = None,
1095
+ timeout: Optional[int] = None,
1096
+ **kwargs) -> MapResponse:
1097
+ """
1098
+ Map and discover links from a URL.
1099
+
1100
+ Args:
1101
+ url (str): Target URL to map
1102
+ search (Optional[str]): Filter pattern for URLs
1103
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1104
+ include_subdomains (Optional[bool]): Include subdomain links
1105
+ sitemap_only (Optional[bool]): Only use sitemap.xml
1106
+ limit (Optional[int]): Maximum URLs to return
1107
+ timeout (Optional[int]): Request timeout in milliseconds
1108
+ **kwargs: Additional parameters to pass to the API
1109
+
1110
+ Returns:
1111
+ MapResponse: Response containing:
1112
+ * success (bool): Whether request succeeded
1113
+ * links (List[str]): Discovered URLs
1114
+ * error (Optional[str]): Error message if any
1115
+
1116
+ Raises:
1117
+ Exception: If mapping fails or response cannot be parsed
1118
+ """
1119
+ # Validate any additional kwargs
1120
+ self._validate_kwargs(kwargs, "map_url")
1121
+
1122
+ # Build map parameters
1123
+ map_params = {}
1124
+
1125
+ # Add individual parameters
1126
+ if search is not None:
1127
+ map_params['search'] = search
1128
+ if ignore_sitemap is not None:
1129
+ map_params['ignoreSitemap'] = ignore_sitemap
1130
+ if include_subdomains is not None:
1131
+ map_params['includeSubdomains'] = include_subdomains
1132
+ if sitemap_only is not None:
1133
+ map_params['sitemapOnly'] = sitemap_only
1134
+ if limit is not None:
1135
+ map_params['limit'] = limit
1136
+ if timeout is not None:
1137
+ map_params['timeout'] = timeout
1138
+
1139
+ # Add any additional kwargs
1140
+ map_params.update(kwargs)
1141
+
1142
+ # Create final params object
1143
+ final_params = MapParams(**map_params)
1144
+ params_dict = final_params.dict(exclude_none=True)
1145
+ params_dict['url'] = url
1146
+ params_dict['origin'] = f"python-sdk@{version}"
1147
+
1148
+ # Make request
1149
+ response = requests.post(
1150
+ f"{self.api_url}/v1/map",
1151
+ headers={"Authorization": f"Bearer {self.api_key}"},
1152
+ json=params_dict
1153
+ )
1154
+
1155
+ if response.status_code == 200:
1156
+ try:
1157
+ response_json = response.json()
1158
+ if response_json.get('success') and 'links' in response_json:
1159
+ return MapResponse(**response_json)
1160
+ elif "error" in response_json:
1161
+ raise Exception(f'Map failed. Error: {response_json["error"]}')
1162
+ else:
1163
+ raise Exception(f'Map failed. Error: {response_json}')
1164
+ except ValueError:
1165
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1166
+ else:
1167
+ self._handle_error(response, 'map')
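# Illustrative usage sketch (annotation, not part of the packaged source). map_url
# discovers links without scraping them; the URL and filter are placeholders.
site_map = app.map_url("https://example.com", search="docs", limit=100)
if site_map.success and site_map.links:
    for link in site_map.links:
        print(link)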
1168
+
1169
+ def batch_scrape_urls(
1170
+ self,
1171
+ urls: List[str],
1172
+ *,
1173
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1174
+ headers: Optional[Dict[str, str]] = None,
1175
+ include_tags: Optional[List[str]] = None,
1176
+ exclude_tags: Optional[List[str]] = None,
1177
+ only_main_content: Optional[bool] = None,
1178
+ wait_for: Optional[int] = None,
1179
+ timeout: Optional[int] = None,
1180
+ location: Optional[LocationConfig] = None,
1181
+ mobile: Optional[bool] = None,
1182
+ skip_tls_verification: Optional[bool] = None,
1183
+ remove_base64_images: Optional[bool] = None,
1184
+ block_ads: Optional[bool] = None,
1185
+ proxy: Optional[Literal["basic", "stealth"]] = None,
1186
+ extract: Optional[JsonConfig] = None,
1187
+ json_options: Optional[JsonConfig] = None,
1188
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1189
+ agent: Optional[AgentOptions] = None,
1190
+ poll_interval: Optional[int] = 2,
1191
+ idempotency_key: Optional[str] = None,
1192
+ **kwargs
1193
+ ) -> BatchScrapeStatusResponse:
1194
+ """
1195
+ Batch scrape multiple URLs and monitor until completion.
1196
+
1197
+ Args:
1198
+ urls (List[str]): URLs to scrape
1199
+ formats (Optional[List[Literal]]): Content formats to retrieve
1200
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1201
+ include_tags (Optional[List[str]]): HTML tags to include
1202
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1203
+ only_main_content (Optional[bool]): Extract main content only
1204
+ wait_for (Optional[int]): Wait time in milliseconds
1205
+ timeout (Optional[int]): Request timeout in milliseconds
1206
+ location (Optional[LocationConfig]): Location configuration
1207
+ mobile (Optional[bool]): Use mobile user agent
1208
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1209
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1210
+ block_ads (Optional[bool]): Block advertisements
1211
+ proxy (Optional[Literal]): Proxy type to use
1212
+ extract (Optional[JsonConfig]): Content extraction config
1213
+ json_options (Optional[JsonConfig]): JSON extraction config
1214
+ actions (Optional[List[Union]]): Actions to perform
1215
+ agent (Optional[AgentOptions]): Agent configuration
1216
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
1217
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1218
+ **kwargs: Additional parameters to pass to the API
1219
+
1220
+ Returns:
1221
+ BatchScrapeStatusResponse with:
1222
+ * Scraping status and progress
1223
+ * Scraped content for each URL
1224
+ * Success/error information
1225
+
1226
+ Raises:
1227
+ Exception: If batch scrape fails
1228
+ """
1229
+ # Validate any additional kwargs
1230
+ self._validate_kwargs(kwargs, "batch_scrape_urls")
1231
+
1232
+ scrape_params = {}
1233
+
1234
+ # Add individual parameters
1235
+ if formats is not None:
1236
+ scrape_params['formats'] = formats
1237
+ if headers is not None:
1238
+ scrape_params['headers'] = headers
1239
+ if include_tags is not None:
1240
+ scrape_params['includeTags'] = include_tags
1241
+ if exclude_tags is not None:
1242
+ scrape_params['excludeTags'] = exclude_tags
1243
+ if only_main_content is not None:
1244
+ scrape_params['onlyMainContent'] = only_main_content
1245
+ if wait_for is not None:
1246
+ scrape_params['waitFor'] = wait_for
1247
+ if timeout is not None:
1248
+ scrape_params['timeout'] = timeout
1249
+ if location is not None:
1250
+ scrape_params['location'] = location.dict(exclude_none=True)
1251
+ if mobile is not None:
1252
+ scrape_params['mobile'] = mobile
1253
+ if skip_tls_verification is not None:
1254
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1255
+ if remove_base64_images is not None:
1256
+ scrape_params['removeBase64Images'] = remove_base64_images
1257
+ if block_ads is not None:
1258
+ scrape_params['blockAds'] = block_ads
1259
+ if proxy is not None:
1260
+ scrape_params['proxy'] = proxy
1261
+ if extract is not None:
1262
+ extract = self._ensure_schema_dict(extract)
1263
+ if isinstance(extract, dict) and "schema" in extract:
1264
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1265
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1266
+ if json_options is not None:
1267
+ json_options = self._ensure_schema_dict(json_options)
1268
+ if isinstance(json_options, dict) and "schema" in json_options:
1269
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1270
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1271
+ if actions is not None:
1272
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1273
+ if agent is not None:
1274
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1275
+
1276
+ # Add any additional kwargs
1277
+ scrape_params.update(kwargs)
1278
+
1279
+ # Create final params object
1280
+ final_params = ScrapeParams(**scrape_params)
1281
+ params_dict = final_params.dict(exclude_none=True)
1282
+ params_dict['urls'] = urls
1283
+ params_dict['origin'] = f"python-sdk@{version}"
1284
+
1285
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1286
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1287
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1288
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1289
+
1290
+ # Make request
1291
+ headers = self._prepare_headers(idempotency_key)
1292
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1293
+
1294
+ if response.status_code == 200:
1295
+ try:
1296
+ id = response.json().get('id')
1297
+ except:
1298
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
1299
+ return self._monitor_job_status(id, headers, poll_interval)
1300
+ else:
1301
+ self._handle_error(response, 'start batch scrape job')
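# Illustrative usage sketch (annotation, not part of the packaged source). The
# synchronous batch call polls until every URL has been scraped; URLs are placeholders.
batch = app.batch_scrape_urls(
    ["https://example.com", "https://example.org"],
    formats=["markdown"],
    poll_interval=5,
)
print(batch.status, batch.completed, "/", batch.total)
for doc in batch.data:
    print(doc.metadata)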
1302
+
1303
+ def async_batch_scrape_urls(
1304
+ self,
1305
+ urls: List[str],
1306
+ *,
1307
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1308
+ headers: Optional[Dict[str, str]] = None,
1309
+ include_tags: Optional[List[str]] = None,
1310
+ exclude_tags: Optional[List[str]] = None,
1311
+ only_main_content: Optional[bool] = None,
1312
+ wait_for: Optional[int] = None,
1313
+ timeout: Optional[int] = None,
1314
+ location: Optional[LocationConfig] = None,
1315
+ mobile: Optional[bool] = None,
1316
+ skip_tls_verification: Optional[bool] = None,
1317
+ remove_base64_images: Optional[bool] = None,
1318
+ block_ads: Optional[bool] = None,
1319
+ proxy: Optional[Literal["basic", "stealth"]] = None,
1320
+ extract: Optional[JsonConfig] = None,
1321
+ json_options: Optional[JsonConfig] = None,
1322
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1323
+ agent: Optional[AgentOptions] = None,
1324
+ idempotency_key: Optional[str] = None,
1325
+ **kwargs
1326
+ ) -> BatchScrapeResponse:
1327
+ """
1328
+ Initiate a batch scrape job asynchronously.
1329
+
1330
+ Args:
1331
+ urls (List[str]): URLs to scrape
1332
+ formats (Optional[List[Literal]]): Content formats to retrieve
1333
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1334
+ include_tags (Optional[List[str]]): HTML tags to include
1335
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1336
+ only_main_content (Optional[bool]): Extract main content only
1337
+ wait_for (Optional[int]): Wait time in milliseconds
1338
+ timeout (Optional[int]): Request timeout in milliseconds
1339
+ location (Optional[LocationConfig]): Location configuration
1340
+ mobile (Optional[bool]): Use mobile user agent
1341
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1342
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1343
+ block_ads (Optional[bool]): Block advertisements
1344
+ proxy (Optional[Literal]): Proxy type to use
1345
+ extract (Optional[JsonConfig]): Content extraction config
1346
+ json_options (Optional[JsonConfig]): JSON extraction config
1347
+ actions (Optional[List[Union]]): Actions to perform
1348
+ agent (Optional[AgentOptions]): Agent configuration
1349
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1350
+ **kwargs: Additional parameters to pass to the API
1351
+
1352
+ Returns:
1353
+ BatchScrapeResponse with:
1354
+ * success - Whether job started successfully
1355
+ * id - Unique identifier for the job
1356
+ * url - Status check URL
1357
+ * error - Error message if start failed
1358
+
1359
+ Raises:
1360
+ Exception: If job initiation fails
1361
+ """
1362
+ # Validate any additional kwargs
1363
+ self._validate_kwargs(kwargs, "async_batch_scrape_urls")
1364
+
1365
+ scrape_params = {}
1366
+
1367
+ # Add individual parameters
1368
+ if formats is not None:
1369
+ scrape_params['formats'] = formats
1370
+ if headers is not None:
1371
+ scrape_params['headers'] = headers
1372
+ if include_tags is not None:
1373
+ scrape_params['includeTags'] = include_tags
1374
+ if exclude_tags is not None:
1375
+ scrape_params['excludeTags'] = exclude_tags
1376
+ if only_main_content is not None:
1377
+ scrape_params['onlyMainContent'] = only_main_content
1378
+ if wait_for is not None:
1379
+ scrape_params['waitFor'] = wait_for
1380
+ if timeout is not None:
1381
+ scrape_params['timeout'] = timeout
1382
+ if location is not None:
1383
+ scrape_params['location'] = location.dict(exclude_none=True)
1384
+ if mobile is not None:
1385
+ scrape_params['mobile'] = mobile
1386
+ if skip_tls_verification is not None:
1387
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1388
+ if remove_base64_images is not None:
1389
+ scrape_params['removeBase64Images'] = remove_base64_images
1390
+ if block_ads is not None:
1391
+ scrape_params['blockAds'] = block_ads
1392
+ if proxy is not None:
1393
+ scrape_params['proxy'] = proxy
1394
+ if extract is not None:
1395
+ extract = self._ensure_schema_dict(extract)
1396
+ if isinstance(extract, dict) and "schema" in extract:
1397
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1398
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1399
+ if json_options is not None:
1400
+ json_options = self._ensure_schema_dict(json_options)
1401
+ if isinstance(json_options, dict) and "schema" in json_options:
1402
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1403
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1404
+ if actions is not None:
1405
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1406
+ if agent is not None:
1407
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1408
+
1409
+ # Add any additional kwargs
1410
+ scrape_params.update(kwargs)
1411
+
1412
+ # Create final params object
1413
+ final_params = ScrapeParams(**scrape_params)
1414
+ params_dict = final_params.dict(exclude_none=True)
1415
+ params_dict['urls'] = urls
1416
+ params_dict['origin'] = f"python-sdk@{version}"
1417
+
1418
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1419
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1420
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1421
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1422
+
1423
+ # Make request
1424
+ headers = self._prepare_headers(idempotency_key)
1425
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1426
+
1427
+ if response.status_code == 200:
1428
+ try:
1429
+ return BatchScrapeResponse(**response.json())
1430
+ except Exception:
1431
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1432
+ else:
1433
+ self._handle_error(response, 'start batch scrape job')
1434
+
1435
+ def batch_scrape_urls_and_watch(
1436
+ self,
1437
+ urls: List[str],
1438
+ *,
1439
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1440
+ headers: Optional[Dict[str, str]] = None,
1441
+ include_tags: Optional[List[str]] = None,
1442
+ exclude_tags: Optional[List[str]] = None,
1443
+ only_main_content: Optional[bool] = None,
1444
+ wait_for: Optional[int] = None,
1445
+ timeout: Optional[int] = None,
1446
+ location: Optional[LocationConfig] = None,
1447
+ mobile: Optional[bool] = None,
1448
+ skip_tls_verification: Optional[bool] = None,
1449
+ remove_base64_images: Optional[bool] = None,
1450
+ block_ads: Optional[bool] = None,
1451
+ proxy: Optional[Literal["basic", "stealth"]] = None,
1452
+ extract: Optional[JsonConfig] = None,
1453
+ json_options: Optional[JsonConfig] = None,
1454
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1455
+ agent: Optional[AgentOptions] = None,
1456
+ idempotency_key: Optional[str] = None,
1457
+ **kwargs
1458
+ ) -> 'CrawlWatcher':
1459
+ """
1460
+ Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
1461
+
1462
+ Args:
1463
+ urls (List[str]): URLs to scrape
1464
+ formats (Optional[List[Literal]]): Content formats to retrieve
1465
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
1466
+ include_tags (Optional[List[str]]): HTML tags to include
1467
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
1468
+ only_main_content (Optional[bool]): Extract main content only
1469
+ wait_for (Optional[int]): Wait time in milliseconds
1470
+ timeout (Optional[int]): Request timeout in milliseconds
1471
+ location (Optional[LocationConfig]): Location configuration
1472
+ mobile (Optional[bool]): Use mobile user agent
1473
+ skip_tls_verification (Optional[bool]): Skip TLS verification
1474
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
1475
+ block_ads (Optional[bool]): Block advertisements
1476
+ proxy (Optional[Literal]): Proxy type to use
1477
+ extract (Optional[JsonConfig]): Content extraction config
1478
+ json_options (Optional[JsonConfig]): JSON extraction config
1479
+ actions (Optional[List[Union]]): Actions to perform
1480
+ agent (Optional[AgentOptions]): Agent configuration
1481
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1482
+ **kwargs: Additional parameters to pass to the API
1483
+
1484
+ Returns:
1485
+ CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
1486
+
1487
+ Raises:
1488
+ Exception: If batch scrape job fails to start
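+
+ Example (a minimal sketch; assumes a valid API key and runs the watcher on a fresh event loop):
+ >>> import asyncio
+ >>> from firecrawl import FirecrawlApp
+ >>> app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
+ >>> watcher = app.batch_scrape_urls_and_watch(["https://example.com"], formats=["markdown"])
+ >>> watcher.add_event_listener("document", lambda d: print("document received for job", d["id"]))
+ >>> watcher.add_event_listener("done", lambda d: print("finished with", len(d["data"]), "documents"))
+ >>> asyncio.run(watcher.connect())  # streams events until the job completes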
1489
+ """
1490
+ # Validate any additional kwargs
1491
+ self._validate_kwargs(kwargs, "batch_scrape_urls_and_watch")
1492
+
1493
+ scrape_params = {}
1494
+
1495
+ # Add individual parameters
1496
+ if formats is not None:
1497
+ scrape_params['formats'] = formats
1498
+ if headers is not None:
1499
+ scrape_params['headers'] = headers
1500
+ if include_tags is not None:
1501
+ scrape_params['includeTags'] = include_tags
1502
+ if exclude_tags is not None:
1503
+ scrape_params['excludeTags'] = exclude_tags
1504
+ if only_main_content is not None:
1505
+ scrape_params['onlyMainContent'] = only_main_content
1506
+ if wait_for is not None:
1507
+ scrape_params['waitFor'] = wait_for
1508
+ if timeout is not None:
1509
+ scrape_params['timeout'] = timeout
1510
+ if location is not None:
1511
+ scrape_params['location'] = location.dict(exclude_none=True)
1512
+ if mobile is not None:
1513
+ scrape_params['mobile'] = mobile
1514
+ if skip_tls_verification is not None:
1515
+ scrape_params['skipTlsVerification'] = skip_tls_verification
1516
+ if remove_base64_images is not None:
1517
+ scrape_params['removeBase64Images'] = remove_base64_images
1518
+ if block_ads is not None:
1519
+ scrape_params['blockAds'] = block_ads
1520
+ if proxy is not None:
1521
+ scrape_params['proxy'] = proxy
1522
+ if extract is not None:
1523
+ extract = self._ensure_schema_dict(extract)
1524
+ if isinstance(extract, dict) and "schema" in extract:
1525
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
1526
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1527
+ if json_options is not None:
1528
+ json_options = self._ensure_schema_dict(json_options)
1529
+ if isinstance(json_options, dict) and "schema" in json_options:
1530
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1531
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1532
+ if actions is not None:
1533
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1534
+ if agent is not None:
1535
+ scrape_params['agent'] = agent.dict(exclude_none=True)
1536
+
1537
+ # Add any additional kwargs
1538
+ scrape_params.update(kwargs)
1539
+
1540
+ # Create final params object
1541
+ final_params = ScrapeParams(**scrape_params)
1542
+ params_dict = final_params.dict(exclude_none=True)
1543
+ params_dict['urls'] = urls
1544
+ params_dict['origin'] = f"python-sdk@{version}"
1545
+
1546
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1547
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1548
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1549
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1550
+
1551
+ # Make request
1552
+ headers = self._prepare_headers(idempotency_key)
1553
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1554
+
1555
+ if response.status_code == 200:
1556
+ try:
1557
+ crawl_response = BatchScrapeResponse(**response.json())
1558
+ except Exception:
1559
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1560
+ if crawl_response.success and crawl_response.id:
1561
+ return CrawlWatcher(crawl_response.id, self)
1562
+ else:
1563
+ raise Exception('Batch scrape job failed to start')
1564
+ else:
1565
+ self._handle_error(response, 'start batch scrape job')
1566
+
1567
+ def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
1568
+ """
1569
+ Check the status of a batch scrape job using the Firecrawl API.
1570
+
1571
+ Args:
1572
+ id (str): The ID of the batch scrape job.
1573
+
1574
+ Returns:
1575
+ BatchScrapeStatusResponse: The status of the batch scrape job.
1576
+
1577
+ Raises:
1578
+ Exception: If the status check request fails.
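+
+ Example (illustrative only; app is a configured FirecrawlApp and "batch-job-id" stands in for a real job ID):
+ >>> status = app.check_batch_scrape_status("batch-job-id")
+ >>> print(status.status, f"{status.completed}/{status.total}")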
1579
+ """
1580
+ endpoint = f'/v1/batch/scrape/{id}'
1581
+
1582
+ headers = self._prepare_headers()
1583
+ response = self._get_request(f'{self.api_url}{endpoint}', headers)
1584
+ if response.status_code == 200:
1585
+ try:
1586
+ status_data = response.json()
1587
+ except Exception:
1588
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1589
+ if status_data['status'] == 'completed':
1590
+ if 'data' in status_data:
1591
+ data = status_data['data']
1592
+ while 'next' in status_data:
1593
+ if len(status_data['data']) == 0:
1594
+ break
1595
+ next_url = status_data.get('next')
1596
+ if not next_url:
1597
+ logger.warning("Expected 'next' URL is missing.")
1598
+ break
1599
+ try:
1600
+ status_response = self._get_request(next_url, headers)
1601
+ if status_response.status_code != 200:
1602
+ logger.error(f"Failed to fetch next page: {status_response.status_code}")
1603
+ break
1604
+ try:
1605
+ next_data = status_response.json()
1606
+ except Exception:
1607
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1608
+ data.extend(next_data.get('data', []))
1609
+ status_data = next_data
1610
+ except Exception as e:
1611
+ logger.error(f"Error during pagination request: {e}")
1612
+ break
1613
+ status_data['data'] = data
1614
+
1615
+ return BatchScrapeStatusResponse(**{
1616
+ 'success': False if 'error' in status_data else True,
1617
+ 'status': status_data.get('status'),
1618
+ 'total': status_data.get('total'),
1619
+ 'completed': status_data.get('completed'),
1620
+ 'creditsUsed': status_data.get('creditsUsed'),
1621
+ 'expiresAt': status_data.get('expiresAt'),
1622
+ 'data': status_data.get('data'),
1623
+ 'next': status_data.get('next'),
1624
+ 'error': status_data.get('error')
1625
+ })
1626
+ else:
1627
+ self._handle_error(response, 'check batch scrape status')
1628
+
1629
+ def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
1630
+ """
1631
+ Returns information about batch scrape errors.
1632
+
1633
+ Args:
1634
+ id (str): The ID of the crawl job.
1635
+
1636
+ Returns:
1637
+ CrawlErrorsResponse containing:
1638
+ * errors (List[Dict[str, str]]): List of errors with fields:
1639
+ * id (str): Error ID
1640
+ * timestamp (str): When the error occurred
1641
+ * url (str): URL that caused the error
1642
+ * error (str): Error message
1643
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1644
+
1645
+ Raises:
1646
+ Exception: If the error check request fails
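+
+ Example (illustrative only; app is a configured FirecrawlApp, "batch-job-id" is a placeholder, and the
+ error entries are read using the fields documented above):
+ >>> report = app.check_batch_scrape_errors("batch-job-id")
+ >>> for err in report.errors:
+ ... print(err["url"], err["error"])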
1647
+ """
1648
+ headers = self._prepare_headers()
1649
+ response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
1650
+ if response.status_code == 200:
1651
+ try:
1652
+ return CrawlErrorsResponse(**response.json())
1653
+ except Exception:
1654
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1655
+ else:
1656
+ self._handle_error(response, "check batch scrape errors")
1657
+
1658
+ def extract(
1659
+ self,
1660
+ urls: Optional[List[str]] = None,
1661
+ *,
1662
+ prompt: Optional[str] = None,
1663
+ schema: Optional[Any] = None,
1664
+ system_prompt: Optional[str] = None,
1665
+ allow_external_links: Optional[bool] = False,
1666
+ enable_web_search: Optional[bool] = False,
1667
+ show_sources: Optional[bool] = False,
1668
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1669
+ """
1670
+ Extract structured information from URLs.
1671
+
1672
+ Args:
1673
+ urls (Optional[List[str]]): URLs to extract from
1674
+ prompt (Optional[str]): Custom extraction prompt
1675
+ schema (Optional[Any]): JSON schema/Pydantic model
1676
+ system_prompt (Optional[str]): System context
1677
+ allow_external_links (Optional[bool]): Follow external links
1678
+ enable_web_search (Optional[bool]): Enable web search
1679
+ show_sources (Optional[bool]): Include source URLs
1680
+ agent (Optional[Dict[str, Any]]): Agent configuration
1681
+
1682
+ Returns:
1683
+ ExtractResponse[Any] with:
1684
+ * success (bool): Whether request succeeded
1685
+ * data (Optional[Any]): Extracted data matching schema
1686
+ * error (Optional[str]): Error message if any
1687
+
1688
+ Raises:
1689
+ ValueError: If prompt/schema missing or extraction fails
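+
+ Example (a minimal sketch; app is a configured FirecrawlApp and the schema below is an illustrative
+ JSON schema, though a Pydantic model class is also accepted):
+ >>> schema = {"type": "object", "properties": {"title": {"type": "string"}, "summary": {"type": "string"}}}
+ >>> result = app.extract(["https://example.com"], prompt="Summarize the page", schema=schema)
+ >>> if result.success:
+ ... print(result.data)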
1690
+ """
1691
+ headers = self._prepare_headers()
1692
+
1693
+ if not prompt and not schema:
1694
+ raise ValueError("Either prompt or schema is required")
1695
+
1696
+ if not urls and not prompt:
1697
+ raise ValueError("Either urls or prompt is required")
1698
+
1699
+ if schema:
1700
+ schema = self._ensure_schema_dict(schema)
1701
+
1702
+ request_data = {
1703
+ 'urls': urls or [],
1704
+ 'allowExternalLinks': allow_external_links,
1705
+ 'enableWebSearch': enable_web_search,
1706
+ 'showSources': show_sources,
1707
+ 'schema': schema,
1708
+ 'origin': f'python-sdk@{get_version()}'
1709
+ }
1710
+
1711
+ # Only add prompt and systemPrompt if they exist
1712
+ if prompt:
1713
+ request_data['prompt'] = prompt
1714
+ if system_prompt:
1715
+ request_data['systemPrompt'] = system_prompt
1716
+
1717
+ if agent:
1718
+ request_data['agent'] = agent
1719
+
1720
+ try:
1721
+ # Send the initial extract request
1722
+ response = self._post_request(
1723
+ f'{self.api_url}/v1/extract',
1724
+ request_data,
1725
+ headers
1726
+ )
1727
+ if response.status_code == 200:
1728
+ try:
1729
+ data = response.json()
1730
+ except Exception:
1731
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1732
+ if data['success']:
1733
+ job_id = data.get('id')
1734
+ if not job_id:
1735
+ raise Exception('Job ID not returned from extract request.')
1736
+
1737
+ # Poll for the extract status
1738
+ while True:
1739
+ status_response = self._get_request(
1740
+ f'{self.api_url}/v1/extract/{job_id}',
1741
+ headers
1742
+ )
1743
+ if status_response.status_code == 200:
1744
+ try:
1745
+ status_data = status_response.json()
1746
+ except Exception:
1747
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1748
+ if status_data['status'] == 'completed':
1749
+ return ExtractResponse(**status_data)
1750
+ elif status_data['status'] in ['failed', 'cancelled']:
1751
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
1752
+ else:
1753
+ self._handle_error(status_response, "extract-status")
1754
+
1755
+ time.sleep(2) # Polling interval
1756
+ else:
1757
+ raise Exception(f'Failed to extract. Error: {data["error"]}')
1758
+ else:
1759
+ self._handle_error(response, "extract")
1760
+ except Exception as e:
1761
+ raise ValueError(str(e), 500)
1762
+
1763
+ return ExtractResponse(success=False, error="Internal server error.")
1764
+
1765
+ def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
1766
+ """
1767
+ Retrieve the status of an extract job.
1768
+
1769
+ Args:
1770
+ job_id (str): The ID of the extract job.
1771
+
1772
+ Returns:
1773
+ ExtractResponse[Any]: The status of the extract job.
1774
+
1775
+ Raises:
1776
+ ValueError: If there is an error retrieving the status.
1777
+ """
1778
+ headers = self._prepare_headers()
1779
+ try:
1780
+ response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
1781
+ if response.status_code == 200:
1782
+ try:
1783
+ return ExtractResponse(**response.json())
1784
+ except Exception:
1785
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1786
+ else:
1787
+ self._handle_error(response, "get extract status")
1788
+ except Exception as e:
1789
+ raise ValueError(str(e), 500)
1790
+
1791
+ def async_extract(
1792
+ self,
1793
+ urls: Optional[List[str]] = None,
1794
+ *,
1795
+ prompt: Optional[str] = None,
1796
+ schema: Optional[Any] = None,
1797
+ system_prompt: Optional[str] = None,
1798
+ allow_external_links: Optional[bool] = False,
1799
+ enable_web_search: Optional[bool] = False,
1800
+ show_sources: Optional[bool] = False,
1801
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1802
+ """
1803
+ Initiate an asynchronous extract job.
1804
+
1805
+ Args:
1806
+ urls (List[str]): URLs to extract information from
1807
+ prompt (Optional[str]): Custom extraction prompt
1808
+ schema (Optional[Any]): JSON schema/Pydantic model
1809
+ system_prompt (Optional[str]): System context
1810
+ allow_external_links (Optional[bool]): Follow external links
1811
+ enable_web_search (Optional[bool]): Enable web search
1812
+ show_sources (Optional[bool]): Include source URLs
1813
+ agent (Optional[Dict[str, Any]]): Agent configuration
1815
+
1816
+ Returns:
1817
+ ExtractResponse[Any] with:
1818
+ * success (bool): Whether request succeeded
1819
+ * data (Optional[Any]): Extracted data matching schema
1820
+ * error (Optional[str]): Error message if any
1821
+
1822
+ Raises:
1823
+ ValueError: If job initiation fails
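+
+ Example (a minimal fire-and-poll sketch; app is a configured FirecrawlApp, and it assumes the returned
+ ExtractResponse carries the job id issued by the extract endpoint):
+ >>> job = app.async_extract(["https://example.com"], prompt="List the page's main topics")
+ >>> if job.success:
+ ... print(app.get_extract_status(job.id).status)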
1824
+ """
1825
+ headers = self._prepare_headers()
1826
+
1827
+ if schema:
1828
+ schema = self._ensure_schema_dict(schema)
1830
+
1831
+ request_data = {
1832
+ 'urls': urls,
1833
+ 'allowExternalLinks': allow_external_links,
1834
+ 'enableWebSearch': enable_web_search,
1835
+ 'showSources': show_sources,
1836
+ 'schema': schema,
1837
+ 'origin': f'python-sdk@{version}'
1838
+ }
1839
+
1840
+ if prompt:
1841
+ request_data['prompt'] = prompt
1842
+ if system_prompt:
1843
+ request_data['systemPrompt'] = system_prompt
1844
+ if agent:
1845
+ request_data['agent'] = agent
1846
+
1847
+ try:
1848
+ response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
1849
+ if response.status_code == 200:
1850
+ try:
1851
+ return ExtractResponse(**response.json())
1852
+ except Exception:
1853
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1854
+ else:
1855
+ self._handle_error(response, "async extract")
1856
+ except Exception as e:
1857
+ raise ValueError(str(e), 500)
1858
+
1859
+ def generate_llms_text(
1860
+ self,
1861
+ url: str,
1862
+ *,
1863
+ max_urls: Optional[int] = None,
1864
+ show_full_text: Optional[bool] = None,
1865
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
1866
+ """
1867
+ Generate LLMs.txt for a given URL and poll until completion.
1868
+
1869
+ Args:
1870
+ url (str): Target URL to generate LLMs.txt from
1871
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
1872
+ show_full_text (Optional[bool]): Include full text in output (default: False)
1873
+ experimental_stream (Optional[bool]): Enable experimental streaming
1874
+
1875
+ Returns:
1876
+ GenerateLLMsTextStatusResponse with:
1877
+ * Generated LLMs.txt content
1878
+ * Full version if requested
1879
+ * Generation status
1880
+ * Success/error information
1881
+
1882
+ Raises:
1883
+ Exception: If generation fails
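+
+ Example (a minimal sketch; app is a configured FirecrawlApp and the URL is a placeholder):
+ >>> result = app.generate_llms_text("https://example.com", max_urls=5)
+ >>> if result.success:
+ ... print(result.status)  # 'completed'; the generated text is available on result.data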
1884
+ """
1885
+ params = GenerateLLMsTextParams(
1886
+ maxUrls=max_urls,
1887
+ showFullText=show_full_text,
1888
+ __experimental_stream=experimental_stream
1889
+ )
1890
+
1891
+ response = self.async_generate_llms_text(
1892
+ url,
1893
+ max_urls=max_urls,
1894
+ show_full_text=show_full_text,
1895
+ experimental_stream=experimental_stream
1896
+ )
1897
+
1898
+ if not response.success or not response.id:
1899
+ return GenerateLLMsTextStatusResponse(
1900
+ success=False,
1901
+ error='Failed to start LLMs.txt generation',
1902
+ status='failed',
1903
+ expiresAt=''
1904
+ )
1905
+
1906
+ job_id = response.id
1907
+ while True:
1908
+ status = self.check_generate_llms_text_status(job_id)
1909
+
1910
+ if status.status == 'completed':
1911
+ return status
1912
+ elif status.status == 'failed':
1913
+ return status
1914
+ elif status.status != 'processing':
1915
+ return GenerateLLMsTextStatusResponse(
1916
+ success=False,
1917
+ error='LLMs.txt generation job terminated unexpectedly',
1918
+ status='failed',
1919
+ expiresAt=''
1920
+ )
1921
+
1922
+ time.sleep(2) # Polling interval
1923
+
1924
+ def async_generate_llms_text(
1925
+ self,
1926
+ url: str,
1927
+ *,
1928
+ max_urls: Optional[int] = None,
1929
+ show_full_text: Optional[bool] = None,
1930
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
1931
+ """
1932
+ Initiate an asynchronous LLMs.txt generation operation.
1933
+
1934
+ Args:
1935
+ url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
1936
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
1937
+ show_full_text (Optional[bool]): Include full text in output (default: False)
1938
+ experimental_stream (Optional[bool]): Enable experimental streaming
1939
+
1940
+ Returns:
1941
+ GenerateLLMsTextResponse: A response containing:
1942
+ * success (bool): Whether the generation initiation was successful
1943
+ * id (str): The unique identifier for the generation job
1944
+ * error (str, optional): Error message if initiation failed
1945
+
1946
+ Raises:
1947
+ Exception: If the generation job initiation fails.
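+
+ Example (a minimal non-blocking sketch; app is a configured FirecrawlApp):
+ >>> job = app.async_generate_llms_text("https://example.com", max_urls=5)
+ >>> if job.success:
+ ... print(app.check_generate_llms_text_status(job.id).status)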
1948
+ """
1949
+ params = GenerateLLMsTextParams(
1950
+ maxUrls=max_urls,
1951
+ showFullText=show_full_text,
1952
+ __experimental_stream=experimental_stream
1953
+ )
1954
+
1955
+ headers = self._prepare_headers()
1956
+ json_data = {'url': url, **params.dict(exclude_none=True)}
1957
+ json_data['origin'] = f"python-sdk@{version}"
1958
+
1959
+ try:
1960
+ req = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
1961
+ response = req.json()
1962
+ print("json_data", json_data)
1963
+ print("response", response)
1964
+ if response.get('success'):
1965
+ try:
1966
+ return GenerateLLMsTextResponse(**response)
1967
+ except Exception:
1968
+ raise Exception('Failed to parse Firecrawl response as JSON.')
1969
+ else:
1970
+ self._handle_error(req, 'start LLMs.txt generation')  # pass the raw Response object, not the parsed dict
1971
+ except Exception as e:
1972
+ raise ValueError(str(e))
1973
+
1974
+ return GenerateLLMsTextResponse(
1975
+ success=False,
1976
+ error='Internal server error'
1977
+ )
1978
+
1979
+ def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
1980
+ """
1981
+ Check the status of an LLMs.txt generation operation.
1982
+
1983
+ Args:
1984
+ id (str): The unique identifier of the LLMs.txt generation job to check status for.
1985
+
1986
+ Returns:
1987
+ GenerateLLMsTextStatusResponse: A response containing:
1988
+ * success (bool): Whether the generation was successful
1989
+ * status (str): Status of generation ("processing", "completed", "failed")
1990
+ * data (Dict[str, str], optional): Generated text with fields:
1991
+ * llmstxt (str): Generated LLMs.txt content
1992
+ * llmsfulltxt (str, optional): Full version if requested
1993
+ * error (str, optional): Error message if generation failed
1994
+ * expiresAt (str): When the generated data expires
1995
+
1996
+ Raises:
1997
+ Exception: If the status check fails.
1998
+ """
1999
+ headers = self._prepare_headers()
2000
+ try:
2001
+ response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers)
2002
+ if response.status_code == 200:
2003
+ try:
2004
+ json_data = response.json()
2005
+ return GenerateLLMsTextStatusResponse(**json_data)
2006
+ except Exception as e:
2007
+ raise Exception(f'Failed to parse Firecrawl response as GenerateLLMsTextStatusResponse: {str(e)}')
2008
+ elif response.status_code == 404:
2009
+ raise Exception('LLMs.txt generation job not found')
2010
+ else:
2011
+ self._handle_error(response, 'check LLMs.txt generation status')
2012
+ except Exception as e:
2013
+ raise ValueError(str(e))
2014
+
2015
+ return GenerateLLMsTextStatusResponse(success=False, error='Internal server error', status='failed', expiresAt='')
2016
+
2017
+ def _prepare_headers(
2018
+ self,
2019
+ idempotency_key: Optional[str] = None) -> Dict[str, str]:
2020
+ """
2021
+ Prepare the headers for API requests.
2022
+
2023
+ Args:
2024
+ idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
2025
+
2026
+ Returns:
2027
+ Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
2028
+ """
2029
+ if idempotency_key:
2030
+ return {
2031
+ 'Content-Type': 'application/json',
2032
+ 'Authorization': f'Bearer {self.api_key}',
2033
+ 'x-idempotency-key': idempotency_key
2034
+ }
2035
+
2036
+ return {
2037
+ 'Content-Type': 'application/json',
2038
+ 'Authorization': f'Bearer {self.api_key}',
2039
+ }
2040
+
2041
+ def _post_request(
2042
+ self,
2043
+ url: str,
2044
+ data: Dict[str, Any],
2045
+ headers: Dict[str, str],
2046
+ retries: int = 3,
2047
+ backoff_factor: float = 0.5) -> requests.Response:
2048
+ """
2049
+ Make a POST request with retries.
2050
+
2051
+ Args:
2052
+ url (str): The URL to send the POST request to.
2053
+ data (Dict[str, Any]): The JSON data to include in the POST request.
2054
+ headers (Dict[str, str]): The headers to include in the POST request.
2055
+ retries (int): Number of retries for the request.
2056
+ backoff_factor (float): Backoff factor for retries.
2057
+
2058
+ Returns:
2059
+ requests.Response: The response from the POST request.
2060
+
2061
+ Raises:
2062
+ requests.RequestException: If the request fails after the specified retries.
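+
+ Note: only HTTP 502 responses are retried. With the defaults (retries=3, backoff_factor=0.5) the
+ delays are backoff_factor * (2 ** attempt), i.e. 0.5s, 1.0s and 2.0s, after which the last
+ response is returned as-is.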
2063
+ """
2064
+ for attempt in range(retries):
2065
+ response = requests.post(url, headers=headers, json=data, timeout=(((data["timeout"] + 5000) / 1000) if "timeout" in data else None))  # request timeouts are given in ms; requests expects seconds
2066
+ if response.status_code == 502:
2067
+ time.sleep(backoff_factor * (2 ** attempt))
2068
+ else:
2069
+ return response
2070
+ return response
2071
+
2072
+ def _get_request(
2073
+ self,
2074
+ url: str,
2075
+ headers: Dict[str, str],
2076
+ retries: int = 3,
2077
+ backoff_factor: float = 0.5) -> requests.Response:
2078
+ """
2079
+ Make a GET request with retries.
2080
+
2081
+ Args:
2082
+ url (str): The URL to send the GET request to.
2083
+ headers (Dict[str, str]): The headers to include in the GET request.
2084
+ retries (int): Number of retries for the request.
2085
+ backoff_factor (float): Backoff factor for retries.
2086
+
2087
+ Returns:
2088
+ requests.Response: The response from the GET request.
2089
+
2090
+ Raises:
2091
+ requests.RequestException: If the request fails after the specified retries.
2092
+ """
2093
+ for attempt in range(retries):
2094
+ response = requests.get(url, headers=headers)
2095
+ if response.status_code == 502:
2096
+ time.sleep(backoff_factor * (2 ** attempt))
2097
+ else:
2098
+ return response
2099
+ return response
2100
+
2101
+ def _delete_request(
2102
+ self,
2103
+ url: str,
2104
+ headers: Dict[str, str],
2105
+ retries: int = 3,
2106
+ backoff_factor: float = 0.5) -> requests.Response:
2107
+ """
2108
+ Make a DELETE request with retries.
2109
+
2110
+ Args:
2111
+ url (str): The URL to send the DELETE request to.
2112
+ headers (Dict[str, str]): The headers to include in the DELETE request.
2113
+ retries (int): Number of retries for the request.
2114
+ backoff_factor (float): Backoff factor for retries.
2115
+
2116
+ Returns:
2117
+ requests.Response: The response from the DELETE request.
2118
+
2119
+ Raises:
2120
+ requests.RequestException: If the request fails after the specified retries.
2121
+ """
2122
+ for attempt in range(retries):
2123
+ response = requests.delete(url, headers=headers)
2124
+ if response.status_code == 502:
2125
+ time.sleep(backoff_factor * (2 ** attempt))
2126
+ else:
2127
+ return response
2128
+ return response
2129
+
2130
+ def _monitor_job_status(
2131
+ self,
2132
+ id: str,
2133
+ headers: Dict[str, str],
2134
+ poll_interval: int) -> CrawlStatusResponse:
2135
+ """
2136
+ Monitor the status of a crawl job until completion.
2137
+
2138
+ Args:
2139
+ id (str): The ID of the crawl job.
2140
+ headers (Dict[str, str]): The headers to include in the status check requests.
2141
+ poll_interval (int): Seconds between status checks.
2142
+
2143
+ Returns:
2144
+ CrawlStatusResponse: The crawl results if the job is completed successfully.
2145
+
2146
+ Raises:
2147
+ Exception: If the job fails or an error occurs during status checks.
2148
+ """
2149
+ while True:
2150
+ api_url = f'{self.api_url}/v1/crawl/{id}'
2151
+
2152
+ status_response = self._get_request(api_url, headers)
2153
+ if status_response.status_code == 200:
2154
+ try:
2155
+ status_data = status_response.json()
2156
+ except Exception:
2157
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2158
+ if status_data['status'] == 'completed':
2159
+ if 'data' in status_data:
2160
+ data = status_data['data']
2161
+ while 'next' in status_data:
2162
+ if len(status_data['data']) == 0:
2163
+ break
2164
+ status_response = self._get_request(status_data['next'], headers)
2165
+ try:
2166
+ status_data = status_response.json()
2167
+ except Exception:
2168
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2169
+ data.extend(status_data.get('data', []))
2170
+ status_data['data'] = data
2171
+ return CrawlStatusResponse(**status_data)
2172
+ else:
2173
+ raise Exception('Crawl job completed but no data was returned')
2174
+ elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
2175
+ poll_interval = max(poll_interval, 2)
2176
+ time.sleep(poll_interval) # Wait for the specified interval before checking again
2177
+ else:
2178
+ raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
2179
+ else:
2180
+ self._handle_error(status_response, 'check crawl status')
2181
+
2182
+ def _handle_error(
2183
+ self,
2184
+ response: requests.Response,
2185
+ action: str) -> None:
2186
+ """
2187
+ Handle errors from API responses.
2188
+
2189
+ Args:
2190
+ response (requests.Response): The response object from the API request.
2191
+ action (str): Description of the action that was being performed.
2192
+
2193
+ Raises:
2194
+ Exception: An exception with a message containing the status code and error details from the response.
2195
+ """
2196
+ try:
2197
+ error_message = response.json().get('error', 'No error message provided.')
2198
+ error_details = response.json().get('details', 'No additional error details provided.')
2199
+ except Exception:
2200
+ raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
2201
+
2202
+ message = self._get_error_message(response.status_code, action, error_message, error_details)
2203
+
2204
+ # Raise an HTTPError with the custom message and attach the response
2205
+ raise requests.exceptions.HTTPError(message, response=response)
2206
+
2207
+ def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2208
+ """
2209
+ Generate a standardized error message based on HTTP status code.
2210
+
2211
+ Args:
2212
+ status_code (int): The HTTP status code from the response
2213
+ action (str): Description of the action that was being performed
2214
+ error_message (str): The error message from the API response
2215
+ error_details (str): Additional error details from the API response
2216
+
2217
+ Returns:
2218
+ str: A formatted error message
2219
+ """
2220
+ if status_code == 402:
2221
+ return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
2222
+ elif status_code == 403:
2223
+ message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
2224
+ elif status_code == 408:
2225
+ return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
2226
+ elif status_code == 409:
2227
+ return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
2228
+ elif status_code == 500:
2229
+ return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
2230
+ else:
2231
+ return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
2232
+
2233
+ def deep_research(
2234
+ self,
2235
+ query: str,
2236
+ *,
2237
+ max_depth: Optional[int] = None,
2238
+ time_limit: Optional[int] = None,
2239
+ max_urls: Optional[int] = None,
2240
+ analysis_prompt: Optional[str] = None,
2241
+ system_prompt: Optional[str] = None,
2242
+ __experimental_stream_steps: Optional[bool] = None,
2243
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
2244
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
2245
+ """
2246
+ Initiates a deep research operation on a given query and polls until completion.
2247
+
2248
+ Args:
2249
+ query (str): Research query or topic to investigate
2250
+ max_depth (Optional[int]): Maximum depth of research exploration
2251
+ time_limit (Optional[int]): Time limit in seconds for research
2252
+ max_urls (Optional[int]): Maximum number of URLs to process
2253
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2254
+ system_prompt (Optional[str]): Custom system prompt
2255
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2256
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
2257
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
2258
+
2259
+ Returns:
2260
+ DeepResearchStatusResponse containing:
2261
+ * success (bool): Whether research completed successfully
2262
+ * status (str): Current state (processing/completed/failed)
2263
+ * error (Optional[str]): Error message if failed
2264
+ * id (str): Unique identifier for the research job
2265
+ * data (Any): Research findings and analysis
2266
+ * sources (List[Dict]): List of discovered sources
2267
+ * activities (List[Dict]): Research progress log
2268
+ * summaries (List[str]): Generated research summaries
2269
+
2270
+ Raises:
2271
+ Exception: If research fails
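+
+ Example (a minimal sketch; app is a configured FirecrawlApp, the query is illustrative, and the result
+ is treated as the dict returned by the status endpoint):
+ >>> results = app.deep_research(
+ ... "How do open-source LLM licenses differ?",
+ ... max_depth=2,
+ ... time_limit=120,
+ ... on_activity=lambda a: print(a.get("type"), a.get("message")))
+ >>> if results.get("status") == "completed":
+ ... print(results["data"])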
2272
+ """
2273
+ research_params = {}
2274
+ if max_depth is not None:
2275
+ research_params['maxDepth'] = max_depth
2276
+ if time_limit is not None:
2277
+ research_params['timeLimit'] = time_limit
2278
+ if max_urls is not None:
2279
+ research_params['maxUrls'] = max_urls
2280
+ if analysis_prompt is not None:
2281
+ research_params['analysisPrompt'] = analysis_prompt
2282
+ if system_prompt is not None:
2283
+ research_params['systemPrompt'] = system_prompt
2284
+ if __experimental_stream_steps is not None:
2285
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2286
+ research_params = DeepResearchParams(**research_params)
2287
+
2288
+ response = self.async_deep_research(
2289
+ query,
2290
+ max_depth=max_depth,
2291
+ time_limit=time_limit,
2292
+ max_urls=max_urls,
2293
+ analysis_prompt=analysis_prompt,
2294
+ system_prompt=system_prompt
2295
+ )
2296
+ if not response.get('success') or 'id' not in response:
2297
+ return response
2298
+
2299
+ job_id = response['id']
2300
+ last_activity_count = 0
2301
+ last_source_count = 0
2302
+
2303
+ while True:
2304
+ status = self.check_deep_research_status(job_id)
2305
+
2306
+ if on_activity and 'activities' in status:
2307
+ new_activities = status['activities'][last_activity_count:]
2308
+ for activity in new_activities:
2309
+ on_activity(activity)
2310
+ last_activity_count = len(status['activities'])
2311
+
2312
+ if on_source and 'sources' in status:
2313
+ new_sources = status['sources'][last_source_count:]
2314
+ for source in new_sources:
2315
+ on_source(source)
2316
+ last_source_count = len(status['sources'])
2317
+
2318
+ if status['status'] == 'completed':
2319
+ return status
2320
+ elif status['status'] == 'failed':
2321
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
2322
+ elif status['status'] != 'processing':
2323
+ break
2324
+
2325
+ time.sleep(2) # Polling interval
2326
+
2327
+ return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
2328
+
2329
+ def async_deep_research(
2330
+ self,
2331
+ query: str,
2332
+ *,
2333
+ max_depth: Optional[int] = None,
2334
+ time_limit: Optional[int] = None,
2335
+ max_urls: Optional[int] = None,
2336
+ analysis_prompt: Optional[str] = None,
2337
+ system_prompt: Optional[str] = None,
2338
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
2339
+ """
2340
+ Initiates an asynchronous deep research operation.
2341
+
2342
+ Args:
2343
+ query (str): Research query or topic to investigate
2344
+ max_depth (Optional[int]): Maximum depth of research exploration
2345
+ time_limit (Optional[int]): Time limit in seconds for research
2346
+ max_urls (Optional[int]): Maximum number of URLs to process
2347
+ analysis_prompt (Optional[str]): Custom prompt for analysis
2348
+ system_prompt (Optional[str]): Custom system prompt
2349
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2350
+
2351
+ Returns:
2352
+ Dict[str, Any]: A response containing:
2353
+ * success (bool): Whether the research initiation was successful
2354
+ * id (str): The unique identifier for the research job
2355
+ * error (str, optional): Error message if initiation failed
2356
+
2357
+ Raises:
2358
+ Exception: If the research initiation fails.
2359
+ """
2360
+ research_params = {}
2361
+ if max_depth is not None:
2362
+ research_params['maxDepth'] = max_depth
2363
+ if time_limit is not None:
2364
+ research_params['timeLimit'] = time_limit
2365
+ if max_urls is not None:
2366
+ research_params['maxUrls'] = max_urls
2367
+ if analysis_prompt is not None:
2368
+ research_params['analysisPrompt'] = analysis_prompt
2369
+ if system_prompt is not None:
2370
+ research_params['systemPrompt'] = system_prompt
2371
+ if __experimental_stream_steps is not None:
2372
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
2373
+ research_params = DeepResearchParams(**research_params)
2374
+
2375
+ headers = self._prepare_headers()
2376
+
2377
+ json_data = {'query': query, **research_params.dict(exclude_none=True)}
2378
+ json_data['origin'] = f"python-sdk@{version}"
2379
+
2380
+ # Handle json options schema if present
2381
+ if 'jsonOptions' in json_data:
2382
+ json_opts = json_data['jsonOptions']
2383
+ if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
2384
+ json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
2385
+
2386
+ try:
2387
+ response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
2388
+ if response.status_code == 200:
2389
+ try:
2390
+ return response.json()
2391
+ except Exception:
2392
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2393
+ else:
2394
+ self._handle_error(response, 'start deep research')
2395
+ except Exception as e:
2396
+ raise ValueError(str(e))
2397
+
2398
+ return {'success': False, 'error': 'Internal server error'}
2399
+
2400
+ def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
2401
+ """
2402
+ Check the status of a deep research operation.
2403
+
2404
+ Args:
2405
+ id (str): The ID of the deep research operation.
2406
+
2407
+ Returns:
2408
+ DeepResearchStatusResponse containing:
2409
+
2410
+ Status:
2411
+ * success - Whether research completed successfully
2412
+ * status - Current state (processing/completed/failed)
2413
+ * error - Error message if failed
2414
+
2415
+ Results:
2416
+ * id - Unique identifier for the research job
2417
+ * data - Research findings and analysis
2418
+ * sources - List of discovered sources
2419
+ * activities - Research progress log
2420
+ * summaries - Generated research summaries
2421
+
2422
+ Raises:
2423
+ Exception: If the status check fails.
2424
+ """
2425
+ headers = self._prepare_headers()
2426
+ try:
2427
+ response = self._get_request(f'{self.api_url}/v1/deep-research/{id}', headers)
2428
+ if response.status_code == 200:
2429
+ try:
2430
+ return response.json()
2431
+ except Exception:
2432
+ raise Exception('Failed to parse Firecrawl response as JSON.')
2433
+ elif response.status_code == 404:
2434
+ raise Exception('Deep research job not found')
2435
+ else:
2436
+ self._handle_error(response, 'check deep research status')
2437
+ except Exception as e:
2438
+ raise ValueError(str(e))
2439
+
2440
+ return {'success': False, 'error': 'Internal server error'}
2441
+
2442
+ def _validate_kwargs(self, kwargs: Dict[str, Any], method_name: str) -> None:
2443
+ """
2444
+ Validate additional keyword arguments before they are passed to the API.
2445
+ This provides early validation before the Pydantic model validation.
2446
+
2447
+ Args:
2448
+ kwargs (Dict[str, Any]): Additional keyword arguments to validate
2449
+ method_name (str): Name of the method these kwargs are for
2450
+
2451
+ Raises:
2452
+ ValueError: If kwargs contain invalid or unsupported parameters
2453
+ """
2454
+ if not kwargs:
2455
+ return
2456
+
2457
+ # Known parameter mappings for each method
2458
+ method_params = {
2459
+ "scrape_url": {"formats", "include_tags", "exclude_tags", "only_main_content", "wait_for",
2460
+ "timeout", "location", "mobile", "skip_tls_verification", "remove_base64_images",
2461
+ "block_ads", "proxy", "extract", "json_options", "actions", "change_tracking_options"},
2462
+ "search": {"limit", "tbs", "filter", "lang", "country", "location", "timeout", "scrape_options"},
2463
+ "crawl_url": {"include_paths", "exclude_paths", "max_depth", "max_discovery_depth", "limit",
2464
+ "allow_backward_links", "allow_external_links", "ignore_sitemap", "scrape_options",
2465
+ "webhook", "deduplicate_similar_urls", "ignore_query_parameters", "regex_on_full_url"},
2466
+ "map_url": {"search", "ignore_sitemap", "include_subdomains", "sitemap_only", "limit", "timeout"},
2467
+ "batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2468
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2469
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2470
+ "actions", "agent", "webhook"},
2471
+ "async_batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2472
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2473
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2474
+ "actions", "agent", "webhook"},
2475
+ "batch_scrape_urls_and_watch": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2476
+ "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2477
+ "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2478
+ "actions", "agent", "webhook"}
2479
+ }
2480
+
2481
+ # Get allowed parameters for this method
2482
+ allowed_params = method_params.get(method_name, set())
2483
+
2484
+ # Check for unknown parameters
2485
+ unknown_params = set(kwargs.keys()) - allowed_params
2486
+ if unknown_params:
2487
+ raise ValueError(f"Unsupported parameter(s) for {method_name}: {', '.join(unknown_params)}. Please refer to the API documentation for the correct parameters.")
2488
+
2489
+ # Additional type validation can be added here if needed
2490
+ # For now, we rely on Pydantic models for detailed type validation
2491
+
2492
+ def _ensure_schema_dict(self, schema):
2493
+ """
2494
+ Utility to ensure a schema is a dict, not a Pydantic model class. Recursively checks dicts and lists.
2495
+ """
2496
+ if schema is None:
2497
+ return schema
2498
+ if isinstance(schema, type):
2499
+ # Pydantic v1/v2 model class
2500
+ if hasattr(schema, 'model_json_schema'):
2501
+ return schema.model_json_schema()
2502
+ elif hasattr(schema, 'schema'):
2503
+ return schema.schema()
2504
+ if isinstance(schema, dict):
2505
+ return {k: self._ensure_schema_dict(v) for k, v in schema.items()}
2506
+ if isinstance(schema, (list, tuple)):
2507
+ return [self._ensure_schema_dict(v) for v in schema]
2508
+ return schema
2509
+
2510
+ class CrawlWatcher:
2511
+ """
2512
+ A class to watch and handle crawl job events via WebSocket connection.
2513
+
2514
+ Attributes:
2515
+ id (str): The ID of the crawl job to watch
2516
+ app (FirecrawlApp): The FirecrawlApp instance
2517
+ data (List[Dict[str, Any]]): List of crawled documents/data
2518
+ status (str): Current status of the crawl job
2519
+ ws_url (str): WebSocket URL for the crawl job
2520
+ event_handlers (dict): Dictionary of event type to list of handler functions
2521
+ """
2522
+ def __init__(self, id: str, app: FirecrawlApp):
2523
+ self.id = id
2524
+ self.app = app
2525
+ self.data: List[Dict[str, Any]] = []
2526
+ self.status = "scraping"
2527
+ self.ws_url = f"{app.api_url.replace('http', 'ws')}/v1/crawl/{id}"
2528
+ self.event_handlers = {
2529
+ 'done': [],
2530
+ 'error': [],
2531
+ 'document': []
2532
+ }
2533
+
2534
+ async def connect(self) -> None:
2535
+ """
2536
+ Establishes WebSocket connection and starts listening for messages.
2537
+ """
2538
+ async with websockets.connect(
2539
+ self.ws_url,
2540
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
2541
+ ) as websocket:
2542
+ await self._listen(websocket)
2543
+
2544
+ async def _listen(self, websocket) -> None:
2545
+ """
2546
+ Listens for incoming WebSocket messages and handles them.
2547
+
2548
+ Args:
2549
+ websocket: The WebSocket connection object
2550
+ """
2551
+ async for message in websocket:
2552
+ msg = json.loads(message)
2553
+ await self._handle_message(msg)
2554
+
2555
+ def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
2556
+ """
2557
+ Adds an event handler function for a specific event type.
2558
+
2559
+ Args:
2560
+ event_type (str): Type of event to listen for ('done', 'error', or 'document')
2561
+ handler (Callable): Function to handle the event
2562
+ """
2563
+ if event_type in self.event_handlers:
2564
+ self.event_handlers[event_type].append(handler)
2565
+
2566
+ def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
2567
+ """
2568
+ Dispatches an event to all registered handlers for that event type.
2569
+
2570
+ Args:
2571
+ event_type (str): Type of event to dispatch
2572
+ detail (Dict[str, Any]): Event details/data to pass to handlers
2573
+ """
2574
+ if event_type in self.event_handlers:
2575
+ for handler in self.event_handlers[event_type]:
2576
+ handler(detail)
2577
+
2578
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
2579
+ """
2580
+ Handles incoming WebSocket messages based on their type.
2581
+
2582
+ Args:
2583
+ msg (Dict[str, Any]): The message to handle
2584
+ """
2585
+ if msg['type'] == 'done':
2586
+ self.status = 'completed'
2587
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
2588
+ elif msg['type'] == 'error':
2589
+ self.status = 'failed'
2590
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
2591
+ elif msg['type'] == 'catchup':
2592
+ self.status = msg['data']['status']
2593
+ self.data.extend(msg['data'].get('data', []))
2594
+ for doc in self.data:
2595
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
2596
+ elif msg['type'] == 'document':
2597
+ self.data.append(msg['data'])
2598
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
2599
+
2600
+ class AsyncFirecrawlApp(FirecrawlApp):
2601
+ """
2602
+ Asynchronous version of FirecrawlApp that implements async methods using aiohttp.
2603
+ Provides non-blocking alternatives to all FirecrawlApp operations.
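+
+ Example (a minimal sketch; assumes the class is importable from the package root and the API key is a placeholder):
+ >>> import asyncio
+ >>> from firecrawl import AsyncFirecrawlApp
+ >>> app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
+ >>> doc = asyncio.run(app.scrape_url("https://example.com", formats=["markdown"]))
+ >>> print(doc.markdown)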
2604
+ """
2605
+
2606
+ async def _async_request(
2607
+ self,
2608
+ method: str,
2609
+ url: str,
2610
+ headers: Dict[str, str],
2611
+ data: Optional[Dict[str, Any]] = None,
2612
+ retries: int = 3,
2613
+ backoff_factor: float = 0.5) -> Dict[str, Any]:
2614
+ """
2615
+ Generic async request method with exponential backoff retry logic.
2616
+
2617
+ Args:
2618
+ method (str): The HTTP method to use (e.g., "GET" or "POST").
2619
+ url (str): The URL to send the request to.
2620
+ headers (Dict[str, str]): Headers to include in the request.
2621
+ data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
2622
+ retries (int): Maximum number of retry attempts (default: 3).
2623
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2624
+ Delay will be backoff_factor * (2 ** retry_count).
2625
+
2626
+ Returns:
2627
+ Dict[str, Any]: The parsed JSON response from the server.
2628
+
2629
+ Raises:
2630
+ aiohttp.ClientError: If the request fails after all retries.
2631
+ Exception: If max retries are exceeded or other errors occur.
2632
+ """
2633
+ async with aiohttp.ClientSession() as session:
2634
+ for attempt in range(retries):
2635
+ try:
2636
+ async with session.request(
2637
+ method=method, url=url, headers=headers, json=data
2638
+ ) as response:
2639
+ if response.status == 502:
2640
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2641
+ continue
2642
+ if response.status >= 300:
2643
+ await self._handle_error(response, f"make {method} request")
2644
+ return await response.json()
2645
+ except aiohttp.ClientError as e:
2646
+ if attempt == retries - 1:
2647
+ raise e
2648
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
2649
+ raise Exception("Max retries exceeded")
2650
+
2651
+ async def _async_post_request(
2652
+ self, url: str, data: Dict[str, Any], headers: Dict[str, str],
2653
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2654
+ """
2655
+ Make an async POST request with exponential backoff retry logic.
2656
+
2657
+ Args:
2658
+ url (str): The URL to send the POST request to.
2659
+ data (Dict[str, Any]): The JSON data to include in the request body.
2660
+ headers (Dict[str, str]): Headers to include in the request.
2661
+ retries (int): Maximum number of retry attempts (default: 3).
2662
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2663
+ Delay will be backoff_factor * (2 ** retry_count).
2664
+
2665
+ Returns:
2666
+ Dict[str, Any]: The parsed JSON response from the server.
2667
+
2668
+ Raises:
2669
+ aiohttp.ClientError: If the request fails after all retries.
2670
+ Exception: If max retries are exceeded or other errors occur.
2671
+ """
2672
+ return await self._async_request("POST", url, headers, data, retries, backoff_factor)
2673
+
2674
+ async def _async_get_request(
2675
+ self, url: str, headers: Dict[str, str],
2676
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2677
+ """
2678
+ Make an async GET request with exponential backoff retry logic.
2679
+
2680
+ Args:
2681
+ url (str): The URL to send the GET request to.
2682
+ headers (Dict[str, str]): Headers to include in the request.
2683
+ retries (int): Maximum number of retry attempts (default: 3).
2684
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2685
+ Delay will be backoff_factor * (2 ** retry_count).
2686
+
2687
+ Returns:
2688
+ Dict[str, Any]: The parsed JSON response from the server.
2689
+
2690
+ Raises:
2691
+ aiohttp.ClientError: If the request fails after all retries.
2692
+ Exception: If max retries are exceeded or other errors occur.
2693
+ """
2694
+ return await self._async_request("GET", url, headers, None, retries, backoff_factor)
2695
+
2696
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
2697
+ """
2698
+ Handle errors from async API responses with detailed error messages.
2699
+
2700
+ Args:
2701
+ response (aiohttp.ClientResponse): The response object from the failed request
2702
+ action (str): Description of the action that was being attempted
2703
+
2704
+ Raises:
2705
+ aiohttp.ClientError: With a detailed error message based on the response status:
2706
+ - 402: Payment Required
2707
+ - 408: Request Timeout
2708
+ - 409: Conflict
2709
+ - 500: Internal Server Error
2710
+ - Other: Unexpected error with status code
2711
+ """
2712
+ try:
2713
+ error_data = await response.json()
2714
+ error_message = error_data.get('error', 'No error message provided.')
2715
+ error_details = error_data.get('details', 'No additional error details provided.')
2716
+ except Exception:
2717
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
2718
+
2719
+ message = await self._get_async_error_message(response.status, action, error_message, error_details)
2720
+
2721
+ raise aiohttp.ClientError(message)
2722
+
2723
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2724
+ """
2725
+ Generate a standardized error message based on HTTP status code for async operations.
2726
+
2727
+ Args:
2728
+ status_code (int): The HTTP status code from the response
2729
+ action (str): Description of the action that was being performed
2730
+ error_message (str): The error message from the API response
2731
+ error_details (str): Additional error details from the API response
2732
+
2733
+ Returns:
2734
+ str: A formatted error message
2735
+ """
2736
+ return self._get_error_message(status_code, action, error_message, error_details)
2737
+
2738
+ async def crawl_url_and_watch(
2739
+ self,
2740
+ url: str,
2741
+ params: Optional[CrawlParams] = None,
2742
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2743
+ """
2744
+ Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket.
2745
+
2746
+ Args:
2747
+ url (str): Target URL to start crawling from
2748
+ params (Optional[CrawlParams]): See CrawlParams model for configuration:
2749
+ URL Discovery:
2750
+ * includePaths - Patterns of URLs to include
2751
+ * excludePaths - Patterns of URLs to exclude
2752
+ * maxDepth - Maximum crawl depth
2753
+ * maxDiscoveryDepth - Maximum depth for finding new URLs
2754
+ * limit - Maximum pages to crawl
2755
+
2756
+ Link Following:
2757
+ * allowBackwardLinks - Follow parent directory links
2758
+ * allowExternalLinks - Follow external domain links
2759
+ * ignoreSitemap - Skip sitemap.xml processing
2760
+
2761
+ Advanced:
2762
+ * scrapeOptions - Page scraping configuration
2763
+ * webhook - Notification webhook settings
2764
+ * deduplicateSimilarURLs - Remove similar URLs
2765
+ * ignoreQueryParameters - Ignore URL parameters
2766
+ * regexOnFullURL - Apply regex to full URLs
2767
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2768
+
2769
+ Returns:
2770
+ AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket
2771
+
2772
+ Raises:
2773
+ Exception: If crawl job fails to start
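+
+ Example (a sketch for use inside an async context; app is an AsyncFirecrawlApp, and AsyncCrawlWatcher
+ is assumed to mirror CrawlWatcher's add_event_listener/connect interface):
+ >>> watcher = await app.crawl_url_and_watch("https://example.com")
+ >>> watcher.add_event_listener("done", lambda d: print("crawled", len(d["data"]), "pages"))
+ >>> await watcher.connect()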
2774
+ """
2775
+ crawl_response = await self.async_crawl_url(url, params, idempotency_key)
2776
+ if crawl_response.get('success') and 'id' in crawl_response:
2777
+ return AsyncCrawlWatcher(crawl_response['id'], self)
2778
+ else:
2779
+ raise Exception("Crawl job failed to start")
2780
+
2781
+ async def batch_scrape_urls_and_watch(
2782
+ self,
2783
+ urls: List[str],
2784
+ params: Optional[ScrapeParams] = None,
2785
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2786
+ """
2787
+ Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress.
2788
+
2789
+ Args:
2790
+ urls (List[str]): List of URLs to scrape
2791
+ params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
2792
+
2793
+ Content Options:
2794
+ * formats - Content formats to retrieve
2795
+ * includeTags - HTML tags to include
2796
+ * excludeTags - HTML tags to exclude
2797
+ * onlyMainContent - Extract main content only
2798
+
2799
+ Request Options:
2800
+ * headers - Custom HTTP headers
2801
+ * timeout - Request timeout (ms)
2802
+ * mobile - Use mobile user agent
2803
+ * proxy - Proxy type
2804
+
2805
+ Extraction Options:
2806
+ * extract - Content extraction config
2807
+ * jsonOptions - JSON extraction config
2808
+ * actions - Actions to perform
2809
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2810
+
2811
+ Returns:
2812
+ AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket
2813
+
2814
+ Raises:
2815
+ Exception: If batch scrape job fails to start
2816
+ """
2817
+ batch_response = await self.async_batch_scrape_urls(urls, params, idempotency_key)
2818
+ if batch_response.get('success') and 'id' in batch_response:
2819
+ return AsyncCrawlWatcher(batch_response['id'], self)
2820
+ else:
2821
+ raise Exception("Batch scrape job failed to start")
2822
+
2823
+ async def scrape_url(
2824
+ self,
2825
+ url: str,
2826
+ *,
2827
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
2828
+ include_tags: Optional[List[str]] = None,
2829
+ exclude_tags: Optional[List[str]] = None,
2830
+ only_main_content: Optional[bool] = None,
2831
+ wait_for: Optional[int] = None,
2832
+ timeout: Optional[int] = None,
2833
+ location: Optional[LocationConfig] = None,
2834
+ mobile: Optional[bool] = None,
2835
+ skip_tls_verification: Optional[bool] = None,
2836
+ remove_base64_images: Optional[bool] = None,
2837
+ block_ads: Optional[bool] = None,
2838
+ proxy: Optional[Literal["basic", "stealth"]] = None,
2839
+ extract: Optional[JsonConfig] = None,
2840
+ json_options: Optional[JsonConfig] = None,
2841
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2842
+ **kwargs) -> ScrapeResponse[Any]:
2843
+ """
2844
+ Scrape a single URL asynchronously.
2845
+
2846
+ Args:
2847
+ url (str): Target URL to scrape
2848
+ formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
2849
+ include_tags (Optional[List[str]]): HTML tags to include
2850
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
2851
+ only_main_content (Optional[bool]): Extract main content only
2852
+ wait_for (Optional[int]): Wait time in milliseconds
2853
+ timeout (Optional[int]): Request timeout (ms)
2854
+ location (Optional[LocationConfig]): Location configuration
2855
+ mobile (Optional[bool]): Use mobile user agent
2856
+ skip_tls_verification (Optional[bool]): Skip TLS verification
2857
+ remove_base64_images (Optional[bool]): Remove base64 images
2858
+ block_ads (Optional[bool]): Block ads
2859
+ proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
2860
+ extract (Optional[JsonConfig]): Content extraction settings
2861
+ json_options (Optional[JsonConfig]): JSON extraction settings
2862
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
2863
+ **kwargs: Additional parameters to pass to the API
2864
+
2865
+ Returns:
2866
+ ScrapeResponse with:
2867
+ * success - Whether scrape was successful
2868
+ * markdown - Markdown content if requested
2869
+ * html - HTML content if requested
2870
+ * rawHtml - Raw HTML content if requested
2871
+ * links - Extracted links if requested
2872
+ * screenshot - Screenshot if requested
2873
+ * extract - Extracted data if requested
2874
+ * json - JSON data if requested
2875
+ * error - Error message if scrape failed
2876
+
2877
+ Raises:
2878
+ Exception: If scraping fails
2879
+ """
2880
+ # Validate any additional kwargs
2881
+ self._validate_kwargs(kwargs, "scrape_url")
2882
+
2883
+ headers = self._prepare_headers()
2884
+
2885
+ # Build scrape parameters
2886
+ scrape_params = {
2887
+ 'url': url,
2888
+ 'origin': f"python-sdk@{version}"
2889
+ }
2890
+
2891
+ # Add optional parameters if provided and not None
2892
+ if formats:
2893
+ scrape_params['formats'] = formats
2894
+ if include_tags:
2895
+ scrape_params['includeTags'] = include_tags
2896
+ if exclude_tags:
2897
+ scrape_params['excludeTags'] = exclude_tags
2898
+ if only_main_content is not None:
2899
+ scrape_params['onlyMainContent'] = only_main_content
2900
+ if wait_for:
2901
+ scrape_params['waitFor'] = wait_for
2902
+ if timeout:
2903
+ scrape_params['timeout'] = timeout
2904
+ if location:
2905
+ scrape_params['location'] = location.dict(exclude_none=True)
2906
+ if mobile is not None:
2907
+ scrape_params['mobile'] = mobile
2908
+ if skip_tls_verification is not None:
2909
+ scrape_params['skipTlsVerification'] = skip_tls_verification
2910
+ if remove_base64_images is not None:
2911
+ scrape_params['removeBase64Images'] = remove_base64_images
2912
+ if block_ads is not None:
2913
+ scrape_params['blockAds'] = block_ads
2914
+ if proxy:
2915
+ scrape_params['proxy'] = proxy
2916
+ if extract is not None:
2917
+ extract = self._ensure_schema_dict(extract)
2918
+ if isinstance(extract, dict) and "schema" in extract:
2919
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
2920
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
2921
+ if json_options is not None:
2922
+ json_options = self._ensure_schema_dict(json_options)
2923
+ if isinstance(json_options, dict) and "schema" in json_options:
2924
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
2925
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
2926
+ if actions:
2927
+ scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
2928
+
2929
+ if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
2930
+ scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
2931
+ if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
2932
+ scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
2933
+
2934
+ # Make async request
2935
+ endpoint = f'/v1/scrape'
2936
+ response = await self._async_post_request(
2937
+ f'{self.api_url}{endpoint}',
2938
+ scrape_params,
2939
+ headers
2940
+ )
2941
+
2942
+ if response.get('success') and 'data' in response:
2943
+ return ScrapeResponse(**response['data'])
2944
+ elif "error" in response:
2945
+ raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
2946
+ else:
2947
+ # Use the response content directly if possible, otherwise a generic message
2948
+ error_content = response.get('error', str(response))
2949
+ raise Exception(f'Failed to scrape URL. Error: {error_content}')
2950
+
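+ # Illustrative usage sketch for scrape_url (assumes `app` is an instance of the
+ # async client class defined in this module, e.g. AsyncFirecrawlApp, created with a
+ # valid API key; the URL is a placeholder):
+ #
+ #     result = await app.scrape_url(
+ #         "https://example.com",
+ #         formats=["markdown", "links"],
+ #         only_main_content=True,
+ #     )
+ #     print(result.markdown)
+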
2951
+ async def batch_scrape_urls(
2952
+ self,
2953
+ urls: List[str],
2954
+ *,
2955
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
2956
+ headers: Optional[Dict[str, str]] = None,
2957
+ include_tags: Optional[List[str]] = None,
2958
+ exclude_tags: Optional[List[str]] = None,
2959
+ only_main_content: Optional[bool] = None,
2960
+ wait_for: Optional[int] = None,
2961
+ timeout: Optional[int] = None,
2962
+ location: Optional[LocationConfig] = None,
2963
+ mobile: Optional[bool] = None,
2964
+ skip_tls_verification: Optional[bool] = None,
2965
+ remove_base64_images: Optional[bool] = None,
2966
+ block_ads: Optional[bool] = None,
2967
+ proxy: Optional[Literal["basic", "stealth"]] = None,
2968
+ extract: Optional[JsonConfig] = None,
2969
+ json_options: Optional[JsonConfig] = None,
2970
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2971
+ agent: Optional[AgentOptions] = None,
2972
+ poll_interval: Optional[int] = 2,
2973
+ idempotency_key: Optional[str] = None,
2974
+ **kwargs
2975
+ ) -> BatchScrapeStatusResponse:
2976
+ """
2977
+ Asynchronously scrape multiple URLs and monitor until completion.
2978
+
2979
+ Args:
2980
+ urls (List[str]): URLs to scrape
2981
+ formats (Optional[List[Literal]]): Content formats to retrieve
2982
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
2983
+ include_tags (Optional[List[str]]): HTML tags to include
2984
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
2985
+ only_main_content (Optional[bool]): Extract main content only
2986
+ wait_for (Optional[int]): Wait time in milliseconds
2987
+ timeout (Optional[int]): Request timeout in milliseconds
2988
+ location (Optional[LocationConfig]): Location configuration
2989
+ mobile (Optional[bool]): Use mobile user agent
2990
+ skip_tls_verification (Optional[bool]): Skip TLS verification
2991
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
2992
+ block_ads (Optional[bool]): Block advertisements
2993
+ proxy (Optional[Literal]): Proxy type to use
2994
+ extract (Optional[JsonConfig]): Content extraction config
2995
+ json_options (Optional[JsonConfig]): JSON extraction config
2996
+ actions (Optional[List[Union]]): Actions to perform
2997
+ agent (Optional[AgentOptions]): Agent configuration
2998
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
2999
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3000
+ **kwargs: Additional parameters to pass to the API
3001
+
3002
+ Returns:
3003
+ BatchScrapeStatusResponse with:
3004
+ * Scraping status and progress
3005
+ * Scraped content for each URL
3006
+ * Success/error information
3007
+
3008
+ Raises:
3009
+ Exception: If batch scrape fails
3010
+ """
3011
+ # Validate any additional kwargs
3012
+ self._validate_kwargs(kwargs, "batch_scrape_urls")
3013
+
3014
+ scrape_params = {}
3015
+
3016
+ # Add individual parameters
3017
+ if formats is not None:
3018
+ scrape_params['formats'] = formats
3019
+ if headers is not None:
3020
+ scrape_params['headers'] = headers
3021
+ if include_tags is not None:
3022
+ scrape_params['includeTags'] = include_tags
3023
+ if exclude_tags is not None:
3024
+ scrape_params['excludeTags'] = exclude_tags
3025
+ if only_main_content is not None:
3026
+ scrape_params['onlyMainContent'] = only_main_content
3027
+ if wait_for is not None:
3028
+ scrape_params['waitFor'] = wait_for
3029
+ if timeout is not None:
3030
+ scrape_params['timeout'] = timeout
3031
+ if location is not None:
3032
+ scrape_params['location'] = location.dict(exclude_none=True)
3033
+ if mobile is not None:
3034
+ scrape_params['mobile'] = mobile
3035
+ if skip_tls_verification is not None:
3036
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3037
+ if remove_base64_images is not None:
3038
+ scrape_params['removeBase64Images'] = remove_base64_images
3039
+ if block_ads is not None:
3040
+ scrape_params['blockAds'] = block_ads
3041
+ if proxy is not None:
3042
+ scrape_params['proxy'] = proxy
3043
+ if extract is not None:
3044
+ extract = self._ensure_schema_dict(extract)
3045
+ if isinstance(extract, dict) and "schema" in extract:
3046
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
3047
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3048
+ if json_options is not None:
3049
+ json_options = self._ensure_schema_dict(json_options)
3050
+ if isinstance(json_options, dict) and "schema" in json_options:
3051
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3052
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3053
+ if actions is not None:
3054
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3055
+ if agent is not None:
3056
+ scrape_params['agent'] = agent.dict(exclude_none=True)
3057
+
3058
+ # Add any additional kwargs
3059
+ scrape_params.update(kwargs)
3060
+
3061
+ # Create final params object
3062
+ final_params = ScrapeParams(**scrape_params)
3063
+ params_dict = final_params.dict(exclude_none=True)
3064
+ params_dict['urls'] = urls
3065
+ params_dict['origin'] = f"python-sdk@{version}"
3066
+
3067
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3068
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3069
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3070
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3071
+
3072
+ # Make request
3073
+ headers = self._prepare_headers(idempotency_key)
3074
+ response = await self._async_post_request(
3075
+ f'{self.api_url}/v1/batch/scrape',
3076
+ params_dict,
3077
+ headers
3078
+ )
3079
+
3080
+ if response.get('success'):
3081
+ try:
3082
+ id = response.get('id')
3083
+ except:
3084
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3085
+ return await self._async_monitor_job_status(id, headers, poll_interval)
3086
+ else:
3087
+ self._handle_error(response, 'start batch scrape job')
3088
+
3089
+
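+ # Illustrative usage sketch for batch_scrape_urls, which waits for completion
+ # (hypothetical `app` = async client instance from this module; placeholder URLs):
+ #
+ #     results = await app.batch_scrape_urls(
+ #         ["https://example.com", "https://example.org"],
+ #         formats=["markdown"],
+ #         poll_interval=5,
+ #     )
+ #     for doc in results.data or []:
+ #         print(doc)
+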
3090
+ async def async_batch_scrape_urls(
3091
+ self,
3092
+ urls: List[str],
3093
+ *,
3094
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3095
+ headers: Optional[Dict[str, str]] = None,
3096
+ include_tags: Optional[List[str]] = None,
3097
+ exclude_tags: Optional[List[str]] = None,
3098
+ only_main_content: Optional[bool] = None,
3099
+ wait_for: Optional[int] = None,
3100
+ timeout: Optional[int] = None,
3101
+ location: Optional[LocationConfig] = None,
3102
+ mobile: Optional[bool] = None,
3103
+ skip_tls_verification: Optional[bool] = None,
3104
+ remove_base64_images: Optional[bool] = None,
3105
+ block_ads: Optional[bool] = None,
3106
+ proxy: Optional[Literal["basic", "stealth"]] = None,
3107
+ extract: Optional[JsonConfig] = None,
3108
+ json_options: Optional[JsonConfig] = None,
3109
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
3110
+ agent: Optional[AgentOptions] = None,
3111
+ idempotency_key: Optional[str] = None,
3112
+ **kwargs
3113
+ ) -> BatchScrapeResponse:
3114
+ """
3115
+ Initiate a batch scrape job asynchronously.
3116
+
3117
+ Args:
3118
+ urls (List[str]): URLs to scrape
3119
+ formats (Optional[List[Literal]]): Content formats to retrieve
3120
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
3121
+ include_tags (Optional[List[str]]): HTML tags to include
3122
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
3123
+ only_main_content (Optional[bool]): Extract main content only
3124
+ wait_for (Optional[int]): Wait time in milliseconds
3125
+ timeout (Optional[int]): Request timeout in milliseconds
3126
+ location (Optional[LocationConfig]): Location configuration
3127
+ mobile (Optional[bool]): Use mobile user agent
3128
+ skip_tls_verification (Optional[bool]): Skip TLS verification
3129
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
3130
+ block_ads (Optional[bool]): Block advertisements
3131
+ proxy (Optional[Literal]): Proxy type to use
3132
+ extract (Optional[JsonConfig]): Content extraction config
3133
+ json_options (Optional[JsonConfig]): JSON extraction config
3134
+ actions (Optional[List[Union]]): Actions to perform
3135
+ agent (Optional[AgentOptions]): Agent configuration
3136
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3137
+ **kwargs: Additional parameters to pass to the API
3138
+
3139
+ Returns:
3140
+ BatchScrapeResponse with:
3141
+ * success - Whether job started successfully
3142
+ * id - Unique identifier for the job
3143
+ * url - Status check URL
3144
+ * error - Error message if start failed
3145
+
3146
+ Raises:
3147
+ Exception: If job initiation fails
3148
+ """
3149
+ # Validate any additional kwargs
3150
+ self._validate_kwargs(kwargs, "async_batch_scrape_urls")
3151
+
3152
+ scrape_params = {}
3153
+
3154
+ # Add individual parameters
3155
+ if formats is not None:
3156
+ scrape_params['formats'] = formats
3157
+ if headers is not None:
3158
+ scrape_params['headers'] = headers
3159
+ if include_tags is not None:
3160
+ scrape_params['includeTags'] = include_tags
3161
+ if exclude_tags is not None:
3162
+ scrape_params['excludeTags'] = exclude_tags
3163
+ if only_main_content is not None:
3164
+ scrape_params['onlyMainContent'] = only_main_content
3165
+ if wait_for is not None:
3166
+ scrape_params['waitFor'] = wait_for
3167
+ if timeout is not None:
3168
+ scrape_params['timeout'] = timeout
3169
+ if location is not None:
3170
+ scrape_params['location'] = location.dict(exclude_none=True)
3171
+ if mobile is not None:
3172
+ scrape_params['mobile'] = mobile
3173
+ if skip_tls_verification is not None:
3174
+ scrape_params['skipTlsVerification'] = skip_tls_verification
3175
+ if remove_base64_images is not None:
3176
+ scrape_params['removeBase64Images'] = remove_base64_images
3177
+ if block_ads is not None:
3178
+ scrape_params['blockAds'] = block_ads
3179
+ if proxy is not None:
3180
+ scrape_params['proxy'] = proxy
3181
+ if extract is not None:
3182
+ extract = self._ensure_schema_dict(extract)
3183
+ if isinstance(extract, dict) and "schema" in extract:
3184
+ extract["schema"] = self._ensure_schema_dict(extract["schema"])
3185
+ scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3186
+ if json_options is not None:
3187
+ json_options = self._ensure_schema_dict(json_options)
3188
+ if isinstance(json_options, dict) and "schema" in json_options:
3189
+ json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3190
+ scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3191
+ if actions is not None:
3192
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3193
+ if agent is not None:
3194
+ scrape_params['agent'] = agent.dict(exclude_none=True)
3195
+
3196
+ # Add any additional kwargs
3197
+ scrape_params.update(kwargs)
3198
+
3199
+ # Create final params object
3200
+ final_params = ScrapeParams(**scrape_params)
3201
+ params_dict = final_params.dict(exclude_none=True)
3202
+ params_dict['urls'] = urls
3203
+ params_dict['origin'] = f"python-sdk@{version}"
3204
+
3205
+ if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3206
+ params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3207
+ if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3208
+ params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3209
+
3210
+ # Make request
3211
+ headers = self._prepare_headers(idempotency_key)
3212
+ response = await self._async_post_request(
3213
+ f'{self.api_url}/v1/batch/scrape',
3214
+ params_dict,
3215
+ headers
3216
+ )
3217
+
3218
+ if response.get('success'):
3219
+ try:
3220
+ return BatchScrapeResponse(**response)
3221
+ except:
3222
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3223
+ else:
3224
+ self._handle_error(response, 'start batch scrape job')
3225
+
3226
+ async def crawl_url(
3227
+ self,
3228
+ url: str,
3229
+ *,
3230
+ include_paths: Optional[List[str]] = None,
3231
+ exclude_paths: Optional[List[str]] = None,
3232
+ max_depth: Optional[int] = None,
3233
+ max_discovery_depth: Optional[int] = None,
3234
+ limit: Optional[int] = None,
3235
+ allow_backward_links: Optional[bool] = None,
3236
+ allow_external_links: Optional[bool] = None,
3237
+ ignore_sitemap: Optional[bool] = None,
3238
+ scrape_options: Optional[ScrapeOptions] = None,
3239
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3240
+ deduplicate_similar_urls: Optional[bool] = None,
3241
+ ignore_query_parameters: Optional[bool] = None,
3242
+ regex_on_full_url: Optional[bool] = None,
3243
+ poll_interval: Optional[int] = 2,
3244
+ idempotency_key: Optional[str] = None,
3245
+ **kwargs
3246
+ ) -> CrawlStatusResponse:
3247
+ """
3248
+ Crawl a website starting from a URL and monitor the job until completion.
3249
+
3250
+ Args:
3251
+ url (str): Target URL to start crawling from
3252
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3253
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3254
+ max_depth (Optional[int]): Maximum crawl depth
3255
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3256
+ limit (Optional[int]): Maximum pages to crawl
3257
+ allow_backward_links (Optional[bool]): Follow parent directory links
3258
+ allow_external_links (Optional[bool]): Follow external domain links
3259
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3260
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3261
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3262
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3263
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3264
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3265
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
3266
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3267
+ **kwargs: Additional parameters to pass to the API
3268
+
3269
+ Returns:
3270
+ CrawlStatusResponse with:
3271
+ * Crawling status and progress
3272
+ * Crawled page contents
3273
+ * Success/error information
3274
+
3275
+ Raises:
3276
+ Exception: If crawl fails
3277
+ """
3278
+ # Validate any additional kwargs
3279
+ self._validate_kwargs(kwargs, "crawl_url")
3280
+
3281
+ crawl_params = {}
3282
+
3283
+ # Add individual parameters
3284
+ if include_paths is not None:
3285
+ crawl_params['includePaths'] = include_paths
3286
+ if exclude_paths is not None:
3287
+ crawl_params['excludePaths'] = exclude_paths
3288
+ if max_depth is not None:
3289
+ crawl_params['maxDepth'] = max_depth
3290
+ if max_discovery_depth is not None:
3291
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3292
+ if limit is not None:
3293
+ crawl_params['limit'] = limit
3294
+ if allow_backward_links is not None:
3295
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3296
+ if allow_external_links is not None:
3297
+ crawl_params['allowExternalLinks'] = allow_external_links
3298
+ if ignore_sitemap is not None:
3299
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3300
+ if scrape_options is not None:
3301
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3302
+ if webhook is not None:
3303
+ crawl_params['webhook'] = webhook
3304
+ if deduplicate_similar_urls is not None:
3305
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3306
+ if ignore_query_parameters is not None:
3307
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3308
+ if regex_on_full_url is not None:
3309
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3310
+
3311
+ # Add any additional kwargs
3312
+ crawl_params.update(kwargs)
3313
+
3314
+ # Create final params object
3315
+ final_params = CrawlParams(**crawl_params)
3316
+ params_dict = final_params.dict(exclude_none=True)
3317
+ params_dict['url'] = url
3318
+ params_dict['origin'] = f"python-sdk@{version}"
3319
+ # Make request
3320
+ headers = self._prepare_headers(idempotency_key)
3321
+ response = await self._async_post_request(
3322
+ f'{self.api_url}/v1/crawl', params_dict, headers)
3323
+
3324
+ if response.get('success'):
3325
+ try:
3326
+ id = response.get('id')
3327
+ except:
3328
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3329
+ return await self._async_monitor_job_status(id, headers, poll_interval)
3330
+ else:
3331
+ self._handle_error(response, 'start crawl job')
3332
+
3333
+
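+ # Illustrative usage sketch for crawl_url, which polls until the crawl finishes
+ # (hypothetical `app` = async client instance from this module; placeholder URL):
+ #
+ #     crawl = await app.crawl_url(
+ #         "https://example.com",
+ #         limit=10,
+ #         exclude_paths=["/blog/.*"],
+ #         poll_interval=5,
+ #     )
+ #     print(crawl.completed, "/", crawl.total, "pages crawled")
+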
3334
+ async def async_crawl_url(
3335
+ self,
3336
+ url: str,
3337
+ *,
3338
+ include_paths: Optional[List[str]] = None,
3339
+ exclude_paths: Optional[List[str]] = None,
3340
+ max_depth: Optional[int] = None,
3341
+ max_discovery_depth: Optional[int] = None,
3342
+ limit: Optional[int] = None,
3343
+ allow_backward_links: Optional[bool] = None,
3344
+ allow_external_links: Optional[bool] = None,
3345
+ ignore_sitemap: Optional[bool] = None,
3346
+ scrape_options: Optional[ScrapeOptions] = None,
3347
+ webhook: Optional[Union[str, WebhookConfig]] = None,
3348
+ deduplicate_similar_urls: Optional[bool] = None,
3349
+ ignore_query_parameters: Optional[bool] = None,
3350
+ regex_on_full_url: Optional[bool] = None,
3351
+ poll_interval: Optional[int] = 2,
3352
+ idempotency_key: Optional[str] = None,
3353
+ **kwargs
3354
+ ) -> CrawlResponse:
3355
+ """
3356
+ Start an asynchronous crawl job.
3357
+
3358
+ Args:
3359
+ url (str): Target URL to start crawling from
3360
+ include_paths (Optional[List[str]]): Patterns of URLs to include
3361
+ exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3362
+ max_depth (Optional[int]): Maximum crawl depth
3363
+ max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3364
+ limit (Optional[int]): Maximum pages to crawl
3365
+ allow_backward_links (Optional[bool]): Follow parent directory links
3366
+ allow_external_links (Optional[bool]): Follow external domain links
3367
+ ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3368
+ scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3369
+ webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3370
+ deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3371
+ ignore_query_parameters (Optional[bool]): Ignore URL parameters
3372
+ regex_on_full_url (Optional[bool]): Apply regex to full URLs
3373
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3374
+ **kwargs: Additional parameters to pass to the API
3375
+
3376
+ Returns:
3377
+ CrawlResponse with:
3378
+ * success - Whether crawl started successfully
3379
+ * id - Unique identifier for the crawl job
3380
+ * url - Status check URL for the crawl
3381
+ * error - Error message if start failed
3382
+
3383
+ Raises:
3384
+ Exception: If crawl initiation fails
3385
+ """
3386
+ crawl_params = {}
3387
+
3388
+ # Add individual parameters
3389
+ if include_paths is not None:
3390
+ crawl_params['includePaths'] = include_paths
3391
+ if exclude_paths is not None:
3392
+ crawl_params['excludePaths'] = exclude_paths
3393
+ if max_depth is not None:
3394
+ crawl_params['maxDepth'] = max_depth
3395
+ if max_discovery_depth is not None:
3396
+ crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3397
+ if limit is not None:
3398
+ crawl_params['limit'] = limit
3399
+ if allow_backward_links is not None:
3400
+ crawl_params['allowBackwardLinks'] = allow_backward_links
3401
+ if allow_external_links is not None:
3402
+ crawl_params['allowExternalLinks'] = allow_external_links
3403
+ if ignore_sitemap is not None:
3404
+ crawl_params['ignoreSitemap'] = ignore_sitemap
3405
+ if scrape_options is not None:
3406
+ crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3407
+ if webhook is not None:
3408
+ crawl_params['webhook'] = webhook
3409
+ if deduplicate_similar_urls is not None:
3410
+ crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3411
+ if ignore_query_parameters is not None:
3412
+ crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3413
+ if regex_on_full_url is not None:
3414
+ crawl_params['regexOnFullURL'] = regex_on_full_url
3415
+
3416
+ # Add any additional kwargs
3417
+ crawl_params.update(kwargs)
3418
+
3419
+ # Create final params object
3420
+ final_params = CrawlParams(**crawl_params)
3421
+ params_dict = final_params.dict(exclude_none=True)
3422
+ params_dict['url'] = url
3423
+ params_dict['origin'] = f"python-sdk@{version}"
3424
+
3425
+ # Make request
3426
+ headers = self._prepare_headers(idempotency_key)
3427
+ response = await self._async_post_request(
3428
+ f'{self.api_url}/v1/crawl',
3429
+ params_dict,
3430
+ headers
3431
+ )
3432
+
3433
+ if response.get('success'):
3434
+ try:
3435
+ return CrawlResponse(**response)
3436
+ except:
3437
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
3438
+ else:
3439
+ self._handle_error(response, 'start crawl job')
3440
+
3441
+ async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
3442
+ """
3443
+ Check the status and results of an asynchronous crawl job.
3444
+
3445
+ Args:
3446
+ id (str): Unique identifier for the crawl job
3447
+
3448
+ Returns:
3449
+ CrawlStatusResponse containing:
3450
+ Status Information:
3451
+ * status - Current state (scraping/completed/failed/cancelled)
3452
+ * completed - Number of pages crawled
3453
+ * total - Total pages to crawl
3454
+ * creditsUsed - API credits consumed
3455
+ * expiresAt - Data expiration timestamp
3456
+
3457
+ Results:
3458
+ * data - List of crawled documents
3459
+ * next - URL for next page of results (if paginated)
3460
+ * success - Whether status check succeeded
3461
+ * error - Error message if failed
3462
+
3463
+ Raises:
3464
+ Exception: If status check fails
3465
+ """
3466
+ headers = self._prepare_headers()
3467
+ endpoint = f'/v1/crawl/{id}'
3468
+
3469
+ status_data = await self._async_get_request(
3470
+ f'{self.api_url}{endpoint}',
3471
+ headers
3472
+ )
3473
+
3474
+ if status_data.get('status') == 'completed':
3475
+ if 'data' in status_data:
3476
+ data = status_data['data']
3477
+ while 'next' in status_data:
3478
+ if len(status_data['data']) == 0:
3479
+ break
3480
+ next_url = status_data.get('next')
3481
+ if not next_url:
3482
+ logger.warning("Expected 'next' URL is missing.")
3483
+ break
3484
+ next_data = await self._async_get_request(next_url, headers)
3485
+ data.extend(next_data.get('data', []))
3486
+ status_data = next_data
3487
+ status_data['data'] = data
3488
+ # Create CrawlStatusResponse object from status data
3489
+ response = CrawlStatusResponse(
3490
+ status=status_data.get('status'),
3491
+ total=status_data.get('total'),
3492
+ completed=status_data.get('completed'),
3493
+ creditsUsed=status_data.get('creditsUsed'),
3494
+ expiresAt=status_data.get('expiresAt'),
3495
+ data=status_data.get('data'),
3496
+ success=False if 'error' in status_data else True
3497
+ )
3498
+
3499
+ if 'error' in status_data:
3500
+ response.error = status_data.get('error')
3501
+
3502
+ if 'next' in status_data:
3503
+ response.next = status_data.get('next')
3504
+
3505
+ return response
3506
+
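+ # Illustrative fire-and-poll sketch combining async_crawl_url and check_crawl_status
+ # (hypothetical `app` = async client instance; placeholder URL):
+ #
+ #     job = await app.async_crawl_url("https://example.com", limit=25)
+ #     status = await app.check_crawl_status(job.id)
+ #     print(status.status, status.completed, status.total)
+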
3507
+ async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
3508
+ """
3509
+ Monitor the status of an asynchronous job until completion.
3510
+
3511
+ Args:
3512
+ id (str): The ID of the job to monitor
3513
+ headers (Dict[str, str]): Headers to include in status check requests
3514
+ poll_interval (int): Seconds between status checks (default: 2)
3515
+
3516
+ Returns:
3517
+ CrawlStatusResponse: The job results if completed successfully
3518
+
3519
+ Raises:
3520
+ Exception: If the job fails or an error occurs during status checks
3521
+ """
3522
+ while True:
3523
+ status_data = await self._async_get_request(
3524
+ f'{self.api_url}/v1/crawl/{id}',
3525
+ headers
3526
+ )
3527
+
3528
+ if status_data.get('status') == 'completed':
3529
+ if 'data' in status_data:
3530
+ data = status_data['data']
3531
+ while 'next' in status_data:
3532
+ if len(status_data['data']) == 0:
3533
+ break
3534
+ next_url = status_data.get('next')
3535
+ if not next_url:
3536
+ logger.warning("Expected 'next' URL is missing.")
3537
+ break
3538
+ next_data = await self._async_get_request(next_url, headers)
3539
+ data.extend(next_data.get('data', []))
3540
+ status_data = next_data
3541
+ status_data['data'] = data
3542
+ return CrawlStatusResponse(**status_data)
3543
+ else:
3544
+ raise Exception('Job completed but no data was returned')
3545
+ elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
3546
+ await asyncio.sleep(max(poll_interval, 2))
3547
+ else:
3548
+ raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
3549
+
3550
+ async def map_url(
3551
+ self,
3552
+ url: str,
3553
+ *,
3554
+ search: Optional[str] = None,
3555
+ ignore_sitemap: Optional[bool] = None,
3556
+ include_subdomains: Optional[bool] = None,
3557
+ sitemap_only: Optional[bool] = None,
3558
+ limit: Optional[int] = None,
3559
+ timeout: Optional[int] = None,
3560
+ params: Optional[MapParams] = None) -> MapResponse:
3561
+ """
3562
+ Asynchronously map and discover links from a URL.
3563
+
3564
+ Args:
3565
+ url (str): Target URL to map
3566
+ params (Optional[MapParams]): See MapParams model (these options can also be passed directly as keyword arguments):
3567
+ Discovery Options:
3568
+ * search - Filter pattern for URLs
3569
+ * ignoreSitemap - Skip sitemap.xml
3570
+ * includeSubdomains - Include subdomain links
3571
+ * sitemapOnly - Only use sitemap.xml
3572
+
3573
+ Limits:
3574
+ * limit - Max URLs to return
3575
+ * timeout - Request timeout (ms)
3576
+
3577
+ Returns:
3578
+ MapResponse with:
3579
+ * Discovered URLs
3580
+ * Success/error status
3581
+
3582
+ Raises:
3583
+ Exception: If mapping fails
3584
+ """
3585
+ map_params = {}
3586
+ if params:
3587
+ map_params.update(params.dict(exclude_none=True))
3588
+
3589
+ # Add individual parameters
3590
+ if search is not None:
3591
+ map_params['search'] = search
3592
+ if ignore_sitemap is not None:
3593
+ map_params['ignoreSitemap'] = ignore_sitemap
3594
+ if include_subdomains is not None:
3595
+ map_params['includeSubdomains'] = include_subdomains
3596
+ if sitemap_only is not None:
3597
+ map_params['sitemapOnly'] = sitemap_only
3598
+ if limit is not None:
3599
+ map_params['limit'] = limit
3600
+ if timeout is not None:
3601
+ map_params['timeout'] = timeout
3602
+
3603
+ # Create final params object
3604
+ final_params = MapParams(**map_params)
3605
+ params_dict = final_params.dict(exclude_none=True)
3606
+ params_dict['url'] = url
3607
+ params_dict['origin'] = f"python-sdk@{version}"
3608
+
3609
+ # Make request
3610
+ endpoint = f'/v1/map'
3611
+ response = await self._async_post_request(
3612
+ f'{self.api_url}{endpoint}',
3613
+ params_dict,
3614
+ headers={"Authorization": f"Bearer {self.api_key}"}
3615
+ )
3616
+
3617
+ if response.get('success') and 'links' in response:
3618
+ return MapResponse(**response)
3619
+ elif 'error' in response:
3620
+ raise Exception(f'Failed to map URL. Error: {response["error"]}')
3621
+ else:
3622
+ raise Exception(f'Failed to map URL. Error: {response}')
3623
+
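+ # Illustrative usage sketch for map_url (hypothetical `app` = async client instance;
+ # placeholder URL; the returned MapResponse carries the discovered links):
+ #
+ #     mapping = await app.map_url("https://example.com", search="docs", limit=100)
+ #     print(mapping.links)
+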
3624
+ async def extract(
3625
+ self,
3626
+ urls: Optional[List[str]] = None,
3627
+ *,
3628
+ prompt: Optional[str] = None,
3629
+ schema: Optional[Any] = None,
3630
+ system_prompt: Optional[str] = None,
3631
+ allow_external_links: Optional[bool] = False,
3632
+ enable_web_search: Optional[bool] = False,
3633
+ show_sources: Optional[bool] = False,
3634
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3635
+
3636
+ """
3637
+ Asynchronously extract structured information from URLs.
3638
+
3639
+ Args:
3640
+ urls (Optional[List[str]]): URLs to extract from
3641
+ prompt (Optional[str]): Custom extraction prompt
3642
+ schema (Optional[Any]): JSON schema/Pydantic model
3643
+ system_prompt (Optional[str]): System context
3644
+ allow_external_links (Optional[bool]): Follow external links
3645
+ enable_web_search (Optional[bool]): Enable web search
3646
+ show_sources (Optional[bool]): Include source URLs
3647
+ agent (Optional[Dict[str, Any]]): Agent configuration
3648
+
3649
+ Returns:
3650
+ ExtractResponse with:
3651
+ * Structured data matching schema
3652
+ * Source information if requested
3653
+ * Success/error status
3654
+
3655
+ Raises:
3656
+ ValueError: If neither prompt nor schema is provided; Exception: If extraction fails
3657
+ """
3658
+ headers = self._prepare_headers()
3659
+
3660
+ if not prompt and not schema:
3661
+ raise ValueError("Either prompt or schema is required")
3662
+
3663
+ if not urls and not prompt:
3664
+ raise ValueError("Either urls or prompt is required")
3665
+
3666
+ if schema:
3667
+ schema = self._ensure_schema_dict(schema)
3668
+
3669
+ request_data = {
3670
+ 'urls': urls or [],
3671
+ 'allowExternalLinks': allow_external_links,
3672
+ 'enableWebSearch': enable_web_search,
3673
+ 'showSources': show_sources,
3674
+ 'schema': schema,
3675
+ 'origin': f'python-sdk@{get_version()}'
3676
+ }
3677
+
3678
+ # Only add prompt and systemPrompt if they exist
3679
+ if prompt:
3680
+ request_data['prompt'] = prompt
3681
+ if system_prompt:
3682
+ request_data['systemPrompt'] = system_prompt
3683
+
3684
+ if agent:
3685
+ request_data['agent'] = agent
3686
+
3687
+ response = await self._async_post_request(
3688
+ f'{self.api_url}/v1/extract',
3689
+ request_data,
3690
+ headers
3691
+ )
3692
+
3693
+ if response.get('success'):
3694
+ job_id = response.get('id')
3695
+ if not job_id:
3696
+ raise Exception('Job ID not returned from extract request.')
3697
+
3698
+ while True:
3699
+ status_data = await self._async_get_request(
3700
+ f'{self.api_url}/v1/extract/{job_id}',
3701
+ headers
3702
+ )
3703
+
3704
+ if status_data['status'] == 'completed':
3705
+ return ExtractResponse(**status_data)
3706
+ elif status_data['status'] in ['failed', 'cancelled']:
3707
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
3708
+
3709
+ await asyncio.sleep(2)
3710
+ else:
3711
+ raise Exception(f'Failed to extract. Error: {response.get("error")}')
3712
+
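+ # Illustrative usage sketch for extract with an inline JSON schema (hypothetical
+ # `app` = async client instance; placeholder URL and schema):
+ #
+ #     schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+ #     extracted = await app.extract(
+ #         ["https://example.com"],
+ #         prompt="Extract the page title",
+ #         schema=schema,
+ #     )
+ #     print(extracted.data)
+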
3713
+ async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
3714
+ """
3715
+ Check the status of an asynchronous batch scrape job.
3716
+
3717
+ Args:
3718
+ id (str): The ID of the batch scrape job
3719
+
3720
+ Returns:
3721
+ BatchScrapeStatusResponse containing:
3722
+ Status Information:
3723
+ * status - Current state (scraping/completed/failed/cancelled)
3724
+ * completed - Number of URLs scraped
3725
+ * total - Total URLs to scrape
3726
+ * creditsUsed - API credits consumed
3727
+ * expiresAt - Data expiration timestamp
3728
+
3729
+ Results:
3730
+ * data - List of scraped documents
3731
+ * next - URL for next page of results (if paginated)
3732
+ * success - Whether status check succeeded
3733
+ * error - Error message if failed
3734
+
3735
+ Raises:
3736
+ Exception: If status check fails
3737
+ """
3738
+ headers = self._prepare_headers()
3739
+ endpoint = f'/v1/batch/scrape/{id}'
3740
+
3741
+ status_data = await self._async_get_request(
3742
+ f'{self.api_url}{endpoint}',
3743
+ headers
3744
+ )
3745
+
3746
+ if status_data['status'] == 'completed':
3747
+ if 'data' in status_data:
3748
+ data = status_data['data']
3749
+ while 'next' in status_data:
3750
+ if len(status_data['data']) == 0:
3751
+ break
3752
+ next_url = status_data.get('next')
3753
+ if not next_url:
3754
+ logger.warning("Expected 'next' URL is missing.")
3755
+ break
3756
+ next_data = await self._async_get_request(next_url, headers)
3757
+ data.extend(next_data.get('data', []))
3758
+ status_data = next_data
3759
+ status_data['data'] = data
3760
+
3761
+ response = BatchScrapeStatusResponse(
3762
+ status=status_data.get('status'),
3763
+ total=status_data.get('total'),
3764
+ completed=status_data.get('completed'),
3765
+ creditsUsed=status_data.get('creditsUsed'),
3766
+ expiresAt=status_data.get('expiresAt'),
3767
+ data=status_data.get('data'), success=False if 'error' in status_data else True
3768
+ )
3769
+
3770
+ if 'error' in status_data:
3771
+ response.error = status_data.get('error')
3772
+
3773
+ if 'next' in status_data:
3774
+ response.next = status_data.get('next')
3775
+
3776
+ return response
3780
+
3781
+ async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
3782
+ """
3783
+ Get information about errors from an asynchronous batch scrape job.
3784
+
3785
+ Args:
3786
+ id (str): The ID of the batch scrape job
3787
+
3788
+ Returns:
3789
+ CrawlErrorsResponse containing:
3790
+ errors (List[Dict[str, str]]): List of errors with fields:
3791
+ * id (str): Error ID
3792
+ * timestamp (str): When the error occurred
3793
+ * url (str): URL that caused the error
3794
+ * error (str): Error message
3795
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3796
+
3797
+ Raises:
3798
+ Exception: If error check fails
3799
+ """
3800
+ headers = self._prepare_headers()
3801
+ return await self._async_get_request(
3802
+ f'{self.api_url}/v1/batch/scrape/{id}/errors',
3803
+ headers
3804
+ )
3805
+
3806
+ async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
3807
+ """
3808
+ Get information about errors from an asynchronous crawl job.
3809
+
3810
+ Args:
3811
+ id (str): The ID of the crawl job
3812
+
3813
+ Returns:
3814
+ CrawlErrorsResponse containing:
3815
+ * errors (List[Dict[str, str]]): List of errors with fields:
3816
+ - id (str): Error ID
3817
+ - timestamp (str): When the error occurred
3818
+ - url (str): URL that caused the error
3819
+ - error (str): Error message
3820
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3821
+
3822
+ Raises:
3823
+ Exception: If error check fails
3824
+ """
3825
+ headers = self._prepare_headers()
3826
+ return await self._async_get_request(
3827
+ f'{self.api_url}/v1/crawl/{id}/errors',
3828
+ headers
3829
+ )
3830
+
3831
+ async def cancel_crawl(self, id: str) -> Dict[str, Any]:
3832
+ """
3833
+ Cancel an asynchronous crawl job.
3834
+
3835
+ Args:
3836
+ id (str): The ID of the crawl job to cancel
3837
+
3838
+ Returns:
3839
+ Dict[str, Any] containing:
3840
+ * success (bool): Whether cancellation was successful
3841
+ * error (str, optional): Error message if cancellation failed
3842
+
3843
+ Raises:
3844
+ Exception: If cancellation fails
3845
+ """
3846
+ headers = self._prepare_headers()
3847
+ async with aiohttp.ClientSession() as session:
3848
+ async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
3849
+ return await response.json()
3850
+
3851
+ async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
3852
+ """
3853
+ Check the status of an asynchronous extraction job.
3854
+
3855
+ Args:
3856
+ job_id (str): The ID of the extraction job
3857
+
3858
+ Returns:
3859
+ ExtractResponse[Any] with:
3860
+ * success (bool): Whether request succeeded
3861
+ * data (Optional[Any]): Extracted data matching schema
3862
+ * error (Optional[str]): Error message if any
3863
+ * warning (Optional[str]): Warning message if any
3864
+ * sources (Optional[List[str]]): Source URLs if requested
3865
+
3866
+ Raises:
3867
+ ValueError: If status check fails
3868
+ """
3869
+ headers = self._prepare_headers()
3870
+ try:
3871
+ return await self._async_get_request(
3872
+ f'{self.api_url}/v1/extract/{job_id}',
3873
+ headers
3874
+ )
3875
+ except Exception as e:
3876
+ raise ValueError(str(e))
3877
+
3878
+ async def async_extract(
3879
+ self,
3880
+ urls: Optional[List[str]] = None,
3881
+ *,
3882
+ prompt: Optional[str] = None,
3883
+ schema: Optional[Any] = None,
3884
+ system_prompt: Optional[str] = None,
3885
+ allow_external_links: Optional[bool] = False,
3886
+ enable_web_search: Optional[bool] = False,
3887
+ show_sources: Optional[bool] = False,
3888
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3889
+ """
3890
+ Initiate an asynchronous extraction job without waiting for completion.
3891
+
3892
+ Args:
3893
+ urls (Optional[List[str]]): URLs to extract from
3894
+ prompt (Optional[str]): Custom extraction prompt
3895
+ schema (Optional[Any]): JSON schema/Pydantic model
3896
+ system_prompt (Optional[str]): System context
3897
+ allow_external_links (Optional[bool]): Follow external links
3898
+ enable_web_search (Optional[bool]): Enable web search
3899
+ show_sources (Optional[bool]): Include source URLs
3900
+ agent (Optional[Dict[str, Any]]): Agent configuration
3902
+
3903
+ Returns:
3904
+ ExtractResponse[Any] with:
3905
+ * success (bool): Whether request succeeded
3906
+ * data (Optional[Any]): Extracted data matching schema
3907
+ * error (Optional[str]): Error message if any
3908
+
3909
+ Raises:
3910
+ ValueError: If job initiation fails
3911
+ """
3912
+ headers = self._prepare_headers()
3913
+
3914
+ if not prompt and not schema:
3915
+ raise ValueError("Either prompt or schema is required")
3916
+
3917
+ if not urls and not prompt:
3918
+ raise ValueError("Either urls or prompt is required")
3919
+
3920
+ if schema:
3921
+ schema = self._ensure_schema_dict(schema)
3922
+
3923
+ request_data = {
3924
+ 'urls': urls or [],
3925
+ 'allowExternalLinks': allow_external_links,
3926
+ 'enableWebSearch': enable_web_search,
3927
+ 'showSources': show_sources,
3928
+ 'schema': schema,
3929
+ 'origin': f'python-sdk@{version}'
3930
+ }
3931
+
3932
+ if prompt:
3933
+ request_data['prompt'] = prompt
3934
+ if system_prompt:
3935
+ request_data['systemPrompt'] = system_prompt
3936
+ if agent:
3937
+ request_data['agent'] = agent
3938
+
3939
+ try:
3940
+ return await self._async_post_request(
3941
+ f'{self.api_url}/v1/extract',
3942
+ request_data,
3943
+ headers
3944
+ )
3945
+ except Exception as e:
3946
+ raise ValueError(str(e))
3947
+
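+ # Illustrative sketch for starting an extraction without waiting, then polling with
+ # get_extract_status (hypothetical `app`; the raw responses here are dicts, so fields
+ # are read with .get):
+ #
+ #     job = await app.async_extract(["https://example.com"], prompt="Summarize the page")
+ #     status = await app.get_extract_status(job.get("id"))
+ #     print(status.get("status"))
+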
3948
+ async def generate_llms_text(
3949
+ self,
3950
+ url: str,
3951
+ *,
3952
+ max_urls: Optional[int] = None,
3953
+ show_full_text: Optional[bool] = None,
3954
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
3955
+ """
3956
+ Generate LLMs.txt for a given URL and monitor until completion.
3957
+
3958
+ Args:
3959
+ url (str): Target URL to generate LLMs.txt from
3960
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
3961
+ show_full_text (Optional[bool]): Include full text in output (default: False)
3962
+ experimental_stream (Optional[bool]): Enable experimental streaming
3963
+
3964
+ Returns:
3965
+ GenerateLLMsTextStatusResponse containing:
3966
+ * success (bool): Whether generation completed successfully
3967
+ * status (str): Status of generation (processing/completed/failed)
3968
+ * data (Dict[str, str], optional): Generated text with fields:
3969
+ - llmstxt (str): Generated LLMs.txt content
3970
+ - llmsfulltxt (str, optional): Full version if requested
3971
+ * error (str, optional): Error message if generation failed
3972
+ * expiresAt (str): When the generated data expires
3973
+
3974
+ Raises:
3975
+ Exception: If generation fails
3976
+ """
3977
+ params = {}
3978
+ if max_urls is not None:
3979
+ params['maxUrls'] = max_urls
3980
+ if show_full_text is not None:
3981
+ params['showFullText'] = show_full_text
3982
+ if experimental_stream is not None:
3983
+ params['__experimental_stream'] = experimental_stream
3984
+
3985
+ response = await self.async_generate_llms_text(
3986
+ url,
3987
+ max_urls=max_urls,
3988
+ show_full_text=show_full_text,
3989
+ experimental_stream=experimental_stream
3990
+ )
3991
+ if not response.get('success') or 'id' not in response:
3992
+ return response
3993
+
3994
+ job_id = response['id']
3995
+ while True:
3996
+ status = await self.check_generate_llms_text_status(job_id)
3997
+
3998
+ if status['status'] == 'completed':
3999
+ return status
4000
+ elif status['status'] == 'failed':
4001
+ raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
4002
+ elif status['status'] != 'processing':
4003
+ break
4004
+
4005
+ await asyncio.sleep(2)
4006
+
4007
+ return GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly')
4008
+
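+ # Illustrative usage sketch for generate_llms_text (hypothetical `app`; placeholder
+ # URL; the completed status is a dict whose data holds the generated text):
+ #
+ #     result = await app.generate_llms_text("https://example.com", max_urls=5, show_full_text=True)
+ #     if result.get("success"):
+ #         print(result["data"]["llmstxt"])
+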
4009
+ async def async_generate_llms_text(
4010
+ self,
4011
+ url: str,
4012
+ *,
4013
+ max_urls: Optional[int] = None,
4014
+ show_full_text: Optional[bool] = None,
4015
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
4016
+ """
4017
+ Initiate an asynchronous LLMs.txt generation job without waiting for completion.
4018
+
4019
+ Args:
4020
+ url (str): Target URL to generate LLMs.txt from
4021
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
4022
+ show_full_text (Optional[bool]): Include full text in output (default: False)
4023
+ experimental_stream (Optional[bool]): Enable experimental streaming
4024
+
4025
+ Returns:
4026
+ GenerateLLMsTextResponse containing:
4027
+ * success (bool): Whether job started successfully
4028
+ * id (str): Unique identifier for the job
4029
+ * error (str, optional): Error message if start failed
4030
+
4031
+ Raises:
4032
+ ValueError: If job initiation fails
4033
+ """
4034
+ params = {}
4035
+ if max_urls is not None:
4036
+ params['maxUrls'] = max_urls
4037
+ if show_full_text is not None:
4038
+ params['showFullText'] = show_full_text
4039
+ if experimental_stream is not None:
4040
+ params['__experimental_stream'] = experimental_stream
4041
+
4042
+ params = GenerateLLMsTextParams(
4043
+ maxUrls=max_urls,
4044
+ showFullText=show_full_text,
4045
+ __experimental_stream=experimental_stream
4046
+ )
4047
+
4048
+ headers = self._prepare_headers()
4049
+ json_data = {'url': url, **params.dict(exclude_none=True)}
4050
+ json_data['origin'] = f"python-sdk@{version}"
4051
+
4052
+ try:
4053
+ return await self._async_post_request(
4054
+ f'{self.api_url}/v1/llmstxt',
4055
+ json_data,
4056
+ headers
4057
+ )
4058
+ except Exception as e:
4059
+ raise ValueError(str(e))
4060
+
4061
+ async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
4062
+ """
4063
+ Check the status of an asynchronous LLMs.txt generation job.
4064
+
4065
+ Args:
4066
+ id (str): The ID of the generation job
4067
+
4068
+ Returns:
4069
+ GenerateLLMsTextStatusResponse containing:
4070
+ * success (bool): Whether generation completed successfully
4071
+ * status (str): Status of generation (processing/completed/failed)
4072
+ * data (Dict[str, str], optional): Generated text with fields:
4073
+ - llmstxt (str): Generated LLMs.txt content
4074
+ - llmsfulltxt (str, optional): Full version if requested
4075
+ * error (str, optional): Error message if generation failed
4076
+ * expiresAt (str): When the generated data expires
4077
+
4078
+ Raises:
4079
+ ValueError: If status check fails
4080
+ """
4081
+ headers = self._prepare_headers()
4082
+ try:
4083
+ return await self._async_get_request(
4084
+ f'{self.api_url}/v1/llmstxt/{id}',
4085
+ headers
4086
+ )
4087
+ except Exception as e:
4088
+ raise ValueError(str(e))
4089
+
4090
+ async def deep_research(
4091
+ self,
4092
+ query: str,
4093
+ *,
4094
+ max_depth: Optional[int] = None,
4095
+ time_limit: Optional[int] = None,
4096
+ max_urls: Optional[int] = None,
4097
+ analysis_prompt: Optional[str] = None,
4098
+ system_prompt: Optional[str] = None,
4099
+ __experimental_stream_steps: Optional[bool] = None,
4100
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
4101
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
4102
+ """
4103
+ Initiates a deep research operation on a given query and polls until completion.
4104
+
4105
+ Args:
4106
+ query (str): Research query or topic to investigate
4107
+ max_depth (Optional[int]): Maximum depth of research exploration
4108
+ time_limit (Optional[int]): Time limit in seconds for research
4109
+ max_urls (Optional[int]): Maximum number of URLs to process
4110
+ analysis_prompt (Optional[str]): Custom prompt for analysis
4111
+ system_prompt (Optional[str]): Custom system prompt
4112
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4113
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
4114
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
4115
+
4116
+ Returns:
4117
+ DeepResearchStatusResponse containing:
4118
+ * success (bool): Whether research completed successfully
4119
+ * status (str): Current state (processing/completed/failed)
4120
+ * error (Optional[str]): Error message if failed
4121
+ * id (str): Unique identifier for the research job
4122
+ * data (Any): Research findings and analysis
4123
+ * sources (List[Dict]): List of discovered sources
4124
+ * activities (List[Dict]): Research progress log
4125
+ * summaries (List[str]): Generated research summaries
4126
+
4127
+ Raises:
4128
+ Exception: If research fails
4129
+ """
4130
+ research_params = {}
4131
+ if max_depth is not None:
4132
+ research_params['maxDepth'] = max_depth
4133
+ if time_limit is not None:
4134
+ research_params['timeLimit'] = time_limit
4135
+ if max_urls is not None:
4136
+ research_params['maxUrls'] = max_urls
4137
+ if analysis_prompt is not None:
4138
+ research_params['analysisPrompt'] = analysis_prompt
4139
+ if system_prompt is not None:
4140
+ research_params['systemPrompt'] = system_prompt
4141
+ if __experimental_stream_steps is not None:
4142
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
4143
+ research_params = DeepResearchParams(**research_params)
4144
+
4145
+ response = await self.async_deep_research(
4146
+ query,
4147
+ max_depth=max_depth,
4148
+ time_limit=time_limit,
4149
+ max_urls=max_urls,
4150
+ analysis_prompt=analysis_prompt,
4151
+ system_prompt=system_prompt
4152
+ )
4153
+ if not response.get('success') or 'id' not in response:
4154
+ return response
4155
+
4156
+ job_id = response['id']
4157
+ last_activity_count = 0
4158
+ last_source_count = 0
4159
+
4160
+ while True:
4161
+ status = await self.check_deep_research_status(job_id)
4162
+
4163
+ if on_activity and 'activities' in status:
4164
+ new_activities = status['activities'][last_activity_count:]
4165
+ for activity in new_activities:
4166
+ on_activity(activity)
4167
+ last_activity_count = len(status['activities'])
4168
+
4169
+ if on_source and 'sources' in status:
4170
+ new_sources = status['sources'][last_source_count:]
4171
+ for source in new_sources:
4172
+ on_source(source)
4173
+ last_source_count = len(status['sources'])
4174
+
4175
+ if status['status'] == 'completed':
4176
+ return status
4177
+ elif status['status'] == 'failed':
4178
+ raise Exception(f'Deep research failed. Error: {status.get("error")}')
4179
+ elif status['status'] != 'processing':
4180
+ break
4181
+
4182
+ await asyncio.sleep(2)
4183
+
4184
+ return DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')
4185
+
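+ # Illustrative usage sketch for deep_research with a progress callback (hypothetical
+ # `app`; query and limits are placeholders):
+ #
+ #     def on_activity(activity):
+ #         print(activity.get("message"))
+ #
+ #     research = await app.deep_research(
+ #         "Recent advances in battery chemistry",
+ #         max_depth=3,
+ #         time_limit=120,
+ #         on_activity=on_activity,
+ #     )
+ #     print(research.get("data"))
+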
4186
+ async def async_deep_research(
4187
+ self,
4188
+ query: str,
4189
+ *,
4190
+ max_depth: Optional[int] = None,
4191
+ time_limit: Optional[int] = None,
4192
+ max_urls: Optional[int] = None,
4193
+ analysis_prompt: Optional[str] = None,
4194
+ system_prompt: Optional[str] = None,
4195
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
4196
+ """
4197
+ Initiates an asynchronous deep research operation.
4198
+
4199
+ Args:
4200
+ query (str): Research query or topic to investigate
4201
+ max_depth (Optional[int]): Maximum depth of research exploration
4202
+ time_limit (Optional[int]): Time limit in seconds for research
4203
+ max_urls (Optional[int]): Maximum number of URLs to process
4204
+ analysis_prompt (Optional[str]): Custom prompt for analysis
4205
+ system_prompt (Optional[str]): Custom system prompt
4206
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4207
+
4208
+ Returns:
4209
+ Dict[str, Any]: A response containing:
4210
+ * success (bool): Whether the research initiation was successful
4211
+ * id (str): The unique identifier for the research job
4212
+ * error (str, optional): Error message if initiation failed
4213
+
4214
+ Raises:
4215
+ Exception: If the research initiation fails.
4216
+ """
4217
+ research_params = {}
4218
+ if max_depth is not None:
4219
+ research_params['maxDepth'] = max_depth
4220
+ if time_limit is not None:
4221
+ research_params['timeLimit'] = time_limit
4222
+ if max_urls is not None:
4223
+ research_params['maxUrls'] = max_urls
4224
+ if analysis_prompt is not None:
4225
+ research_params['analysisPrompt'] = analysis_prompt
4226
+ if system_prompt is not None:
4227
+ research_params['systemPrompt'] = system_prompt
4228
+ if __experimental_stream_steps is not None:
4229
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
4230
+ research_params = DeepResearchParams(**research_params)
4231
+
4232
+ headers = self._prepare_headers()
4233
+
4234
+ json_data = {'query': query, **research_params.dict(exclude_none=True)}
4235
+ json_data['origin'] = f"python-sdk@{version}"
4236
+
4237
+ try:
4238
+ return await self._async_post_request(
4239
+ f'{self.api_url}/v1/deep-research',
4240
+ json_data,
4241
+ headers
4242
+ )
4243
+ except Exception as e:
4244
+ raise ValueError(str(e))
4245
+
4246
+ async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
4247
+ """
4248
+ Check the status of a deep research operation.
4249
+
4250
+ Args:
4251
+ id (str): The ID of the deep research operation.
4252
+
4253
+ Returns:
4254
+ DeepResearchResponse containing:
4255
+
4256
+ Status:
4257
+ * success - Whether research completed successfully
4258
+ * status - Current state (processing/completed/failed)
4259
+ * error - Error message if failed
4260
+
4261
+ Results:
4262
+ * id - Unique identifier for the research job
4263
+ * data - Research findings and analysis
4264
+ * sources - List of discovered sources
4265
+ * activities - Research progress log
4266
+ * summaries - Generated research summaries
4267
+
4268
+ Raises:
4269
+ Exception: If the status check fails.
4270
+ """
4271
+ headers = self._prepare_headers()
4272
+ try:
4273
+ return await self._async_get_request(
4274
+ f'{self.api_url}/v1/deep-research/{id}',
4275
+ headers
4276
+ )
4277
+ except Exception as e:
4278
+ raise ValueError(str(e))
4279
+
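If the blocking waiter above is not used, the fire-and-forget pair can be polled by hand. A sketch, assuming dict-style responses as returned by the async request helpers:

import asyncio

async def poll_deep_research(app, query):
    # Start the job, then poll until it leaves the 'processing' state.
    started = await app.async_deep_research(query, max_urls=15)
    job_id = started["id"]
    while True:
        status = await app.check_deep_research_status(job_id)
        if status["status"] in ("completed", "failed"):
            return status
        await asyncio.sleep(2)  # same cadence as the built-in waiter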
4280
+ async def search(
4281
+ self,
4282
+ query: str,
4283
+ *,
4284
+ limit: Optional[int] = None,
4285
+ tbs: Optional[str] = None,
4286
+ filter: Optional[str] = None,
4287
+ lang: Optional[str] = None,
4288
+ country: Optional[str] = None,
4289
+ location: Optional[str] = None,
4290
+ timeout: Optional[int] = None,
4291
+ scrape_options: Optional[ScrapeOptions] = None,
4292
+ params: Optional[Union[Dict[str, Any], SearchParams]] = None,
4293
+ **kwargs) -> SearchResponse:
4294
+ """
4295
+ Asynchronously search for content using Firecrawl.
4296
+
4297
+ Args:
4298
+ query (str): Search query string
4299
+ limit (Optional[int]): Max results (default: 5)
4300
+ tbs (Optional[str]): Time filter (e.g. "qdr:d")
4301
+ filter (Optional[str]): Custom result filter
4302
+ lang (Optional[str]): Language code (default: "en")
4303
+ country (Optional[str]): Country code (default: "us")
4304
+ location (Optional[str]): Geo-targeting
4305
+ timeout (Optional[int]): Request timeout in milliseconds
4306
+ scrape_options (Optional[ScrapeOptions]): Result scraping configuration
4307
+ params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
4308
+ **kwargs: Additional keyword arguments for future compatibility
4309
+
4310
+ Returns:
4311
+ SearchResponse: Response containing:
4312
+ * success (bool): Whether request succeeded
4313
+ * data (List[FirecrawlDocument]): Search results
4314
+ * warning (Optional[str]): Warning message if any
4315
+ * error (Optional[str]): Error message if any
4316
+
4317
+ Raises:
4318
+ Exception: If search fails or response cannot be parsed
4319
+ """
4320
+ # Build search parameters
4321
+ search_params = {}
4322
+ if params:
4323
+ if isinstance(params, dict):
4324
+ search_params.update(params)
4325
+ else:
4326
+ search_params.update(params.dict(exclude_none=True))
4327
+
4328
+ # Add individual parameters
4329
+ if limit is not None:
4330
+ search_params['limit'] = limit
4331
+ if tbs is not None:
4332
+ search_params['tbs'] = tbs
4333
+ if filter is not None:
4334
+ search_params['filter'] = filter
4335
+ if lang is not None:
4336
+ search_params['lang'] = lang
4337
+ if country is not None:
4338
+ search_params['country'] = country
4339
+ if location is not None:
4340
+ search_params['location'] = location
4341
+ if timeout is not None:
4342
+ search_params['timeout'] = timeout
4343
+ if scrape_options is not None:
4344
+ search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
4345
+
4346
+ # Add any additional kwargs
4347
+ search_params.update(kwargs)
4348
+
4349
+ # Create final params object
4350
+ final_params = SearchParams(query=query, **search_params)
4351
+ params_dict = final_params.dict(exclude_none=True)
4352
+ params_dict['origin'] = f"python-sdk@{version}"
4353
+
4354
+ return await self._async_post_request(
4355
+ f"{self.api_url}/v1/search",
4356
+ params_dict,
4357
+ {"Authorization": f"Bearer {self.api_key}"}
4358
+ )
4359
+
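A usage sketch for the async search method. It assumes the AsyncFirecrawlApp constructor takes api_key as in the synchronous client, that ScrapeOptions accepts a formats field as elsewhere in this SDK, and that responses support dict-style access; the result field names are illustrative.

import asyncio

async def run_search():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # constructor shape assumed
    results = await app.search(
        "firecrawl python sdk",
        limit=3,
        tbs="qdr:w",  # restrict results to the past week
        scrape_options=ScrapeOptions(formats=["markdown"]),  # 'formats' assumed
    )
    for doc in results.get("data", []):
        print(doc.get("url"), "-", doc.get("title"))  # field names illustrative

asyncio.run(run_search())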
4360
+ class AsyncCrawlWatcher(CrawlWatcher):
4361
+ """
4362
+ Async variant of CrawlWatcher that connects and listens over a WebSocket using asyncio.
4363
+ """
4364
+ def __init__(self, id: str, app: AsyncFirecrawlApp):
4365
+ super().__init__(id, app)
4366
+
4367
+ async def connect(self) -> None:
4368
+ """
4369
+ Establishes async WebSocket connection and starts listening for messages.
4370
+ """
4371
+ async with websockets.connect(
4372
+ self.ws_url,
4373
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
4374
+ ) as websocket:
4375
+ await self._listen(websocket)
4376
+
4377
+ async def _listen(self, websocket) -> None:
4378
+ """
4379
+ Listens for incoming WebSocket messages and handles them asynchronously.
4380
+
4381
+ Args:
4382
+ websocket: The WebSocket connection object
4383
+ """
4384
+ async for message in websocket:
4385
+ msg = json.loads(message)
4386
+ await self._handle_message(msg)
4387
+
4388
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
4389
+ """
4390
+ Handles incoming WebSocket messages based on their type asynchronously.
4391
+
4392
+ Args:
4393
+ msg (Dict[str, Any]): The message to handle
4394
+ """
4395
+ if msg['type'] == 'done':
4396
+ self.status = 'completed'
4397
+ self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
4398
+ elif msg['type'] == 'error':
4399
+ self.status = 'failed'
4400
+ self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
4401
+ elif msg['type'] == 'catchup':
4402
+ self.status = msg['data']['status']
4403
+ self.data.extend(msg['data'].get('data', []))
4404
+ for doc in self.data:
4405
+ self.dispatch_event('document', {'data': doc, 'id': self.id})
4406
+ elif msg['type'] == 'document':
4407
+ self.data.append(msg['data'])
4408
+ self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
4409
+
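A sketch of driving the watcher above. It assumes the parent CrawlWatcher's __init__ sets up self.ws_url and self.data, and that dispatch_event(event_type, detail) can be overridden to observe events; both are assumptions about code outside this hunk. The detail keys ('status', 'data', 'id', 'error') mirror the dicts built in _handle_message.

import asyncio

class PrintingWatcher(AsyncCrawlWatcher):
    # Override the dispatch hook (signature taken from the calls above) to log events
    # instead of fanning them out to registered listeners.
    def dispatch_event(self, event_type, detail):
        if event_type == "document":
            print("received document for crawl", detail["id"])
        elif event_type == "done":
            print("crawl completed with", len(self.data), "documents")
        elif event_type == "error":
            print("crawl failed:", detail.get("error"))

async def watch_crawl(app, crawl_id):
    watcher = PrintingWatcher(crawl_id, app)
    # Runs until the server closes the WebSocket (after a 'done' or 'error' message).
    await watcher.connect()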
4410
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
4411
+ """
4412
+ Handle errors from async API responses.
4413
+ """
4414
+ try:
4415
+ error_data = await response.json()
4416
+ error_message = error_data.get('error', 'No error message provided.')
4417
+ error_details = error_data.get('details', 'No additional error details provided.')
4418
+ except Exception:  # response body was not valid JSON; surface a parse failure below
4419
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
4420
+
4421
+ # Use the app's method to get the error message
4422
+ message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
4423
+
4424
+ raise aiohttp.ClientError(message)
4425
+
4426
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
4427
+ """
4428
+ Generate a standardized error message based on HTTP status code for async operations.
4429
+
4430
+ Args:
4431
+ status_code (int): The HTTP status code from the response
4432
+ action (str): Description of the action that was being performed
4433
+ error_message (str): The error message from the API response
4434
+ error_details (str): Additional error details from the API response
4435
+
4436
+ Returns:
4437
+ str: A formatted error message
4438
+ """
4439
+ return self._get_error_message(status_code, action, error_message, error_details)
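For context, a request helper such as _async_post_request (referenced earlier but not shown in this hunk) would typically be the caller of _handle_error. A minimal sketch of that shape; the flow is assumed rather than taken from this diff, so the function is given a distinct name:

async def _async_post_request_sketch(self, url, data, headers):
    # Hypothetical shape of the async request helper that feeds _handle_error.
    async with aiohttp.ClientSession() as session:
        async with session.post(url, json=data, headers=headers) as response:
            if response.status >= 400:
                # _handle_error maps the status code to a message via
                # _get_async_error_message and raises aiohttp.ClientError.
                await self._handle_error(response, 'make POST request')
            return await response.json()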