firecrawl-py 2.9.0__py3-none-any.whl → 2.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of firecrawl-py might be problematic. Click here for more details.

@@ -1,4526 +0,0 @@
1
- """
2
- FirecrawlApp Module
3
-
4
- This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
5
- It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
6
- and check the status of these jobs. The module uses requests for HTTP communication
7
- and handles retries for certain HTTP status codes.
8
-
9
- Classes:
10
- - FirecrawlApp: Main class for interacting with the Firecrawl API.
11
- """
12
- import logging
13
- import os
14
- import time
15
- from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
16
- import json
17
- from datetime import datetime
18
- import re
19
- import warnings
20
- import requests
21
- import pydantic
22
- import websockets
23
- import aiohttp
24
- import asyncio
25
- from pydantic import Field
26
-
27
- # Suppress Pydantic warnings about attribute shadowing
28
- warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
29
- warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
30
- warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
31
- warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
32
- warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingOptions\" shadows an attribute in parent \"BaseModel\"")
33
-
34
- def get_version():
35
- try:
36
- from pathlib import Path
37
- package_path = os.path.dirname(__file__)
38
- version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
39
- version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
40
- if version_match:
41
- return version_match.group(1).strip()
42
- except Exception:
43
- print("Failed to get version from __init__.py")
44
- return None
45
-
46
- version = get_version()
47
-
48
- logger : logging.Logger = logging.getLogger("firecrawl")
49
-
50
- T = TypeVar('T')
51
-
52
- # class FirecrawlDocumentMetadata(pydantic.BaseModel):
53
- # """Metadata for a Firecrawl document."""
54
- # title: Optional[str] = None
55
- # description: Optional[str] = None
56
- # language: Optional[str] = None
57
- # keywords: Optional[str] = None
58
- # robots: Optional[str] = None
59
- # ogTitle: Optional[str] = None
60
- # ogDescription: Optional[str] = None
61
- # ogUrl: Optional[str] = None
62
- # ogImage: Optional[str] = None
63
- # ogAudio: Optional[str] = None
64
- # ogDeterminer: Optional[str] = None
65
- # ogLocale: Optional[str] = None
66
- # ogLocaleAlternate: Optional[List[str]] = None
67
- # ogSiteName: Optional[str] = None
68
- # ogVideo: Optional[str] = None
69
- # dctermsCreated: Optional[str] = None
70
- # dcDateCreated: Optional[str] = None
71
- # dcDate: Optional[str] = None
72
- # dctermsType: Optional[str] = None
73
- # dcType: Optional[str] = None
74
- # dctermsAudience: Optional[str] = None
75
- # dctermsSubject: Optional[str] = None
76
- # dcSubject: Optional[str] = None
77
- # dcDescription: Optional[str] = None
78
- # dctermsKeywords: Optional[str] = None
79
- # modifiedTime: Optional[str] = None
80
- # publishedTime: Optional[str] = None
81
- # articleTag: Optional[str] = None
82
- # articleSection: Optional[str] = None
83
- # sourceURL: Optional[str] = None
84
- # statusCode: Optional[int] = None
85
- # error: Optional[str] = None
86
-
87
- class AgentOptions(pydantic.BaseModel):
88
- """Configuration for the agent."""
89
- model: Literal["FIRE-1"] = "FIRE-1"
90
- prompt: Optional[str] = None
91
-
92
- class AgentOptionsExtract(pydantic.BaseModel):
93
- """Configuration for the agent in extract operations."""
94
- model: Literal["FIRE-1"] = "FIRE-1"
95
-
96
- class ActionsResult(pydantic.BaseModel):
97
- """Result of actions performed during scraping."""
98
- screenshots: List[str]
99
-
100
- class ChangeTrackingData(pydantic.BaseModel):
101
- """
102
- Data for the change tracking format.
103
- """
104
- previousScrapeAt: Optional[str] = None
105
- changeStatus: str # "new" | "same" | "changed" | "removed"
106
- visibility: str # "visible" | "hidden"
107
- diff: Optional[Dict[str, Any]] = None
108
- json: Optional[Any] = None
109
-
110
- class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
111
- """Document retrieved or processed by Firecrawl."""
112
- url: Optional[str] = None
113
- markdown: Optional[str] = None
114
- html: Optional[str] = None
115
- rawHtml: Optional[str] = None
116
- links: Optional[List[str]] = None
117
- extract: Optional[T] = None
118
- json: Optional[T] = None
119
- screenshot: Optional[str] = None
120
- metadata: Optional[Any] = None
121
- actions: Optional[ActionsResult] = None
122
- title: Optional[str] = None # v1 search only
123
- description: Optional[str] = None # v1 search only
124
- changeTracking: Optional[ChangeTrackingData] = None
125
-
126
- class LocationConfig(pydantic.BaseModel):
127
- """Location configuration for scraping."""
128
- country: Optional[str] = None
129
- languages: Optional[List[str]] = None
130
-
131
- class WebhookConfig(pydantic.BaseModel):
132
- """Configuration for webhooks."""
133
- url: str
134
- headers: Optional[Dict[str, str]] = None
135
- metadata: Optional[Dict[str, str]] = None
136
- events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
137
-
138
- class ChangeTrackingOptions(pydantic.BaseModel):
139
- """Configuration for change tracking."""
140
- modes: Optional[List[Literal["git-diff", "json"]]] = None
141
- schema: Optional[Any] = None
142
- prompt: Optional[str] = None
143
- tag: Optional[str] = None
144
-
145
- class ScrapeOptions(pydantic.BaseModel):
146
- """Parameters for scraping operations."""
147
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
148
- headers: Optional[Dict[str, str]] = None
149
- includeTags: Optional[List[str]] = None
150
- excludeTags: Optional[List[str]] = None
151
- onlyMainContent: Optional[bool] = None
152
- waitFor: Optional[int] = None
153
- timeout: Optional[int] = None
154
- location: Optional[LocationConfig] = None
155
- mobile: Optional[bool] = None
156
- skipTlsVerification: Optional[bool] = None
157
- removeBase64Images: Optional[bool] = None
158
- blockAds: Optional[bool] = None
159
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None
160
- changeTrackingOptions: Optional[ChangeTrackingOptions] = None
161
- maxAge: Optional[int] = None
162
- storeInCache: Optional[bool] = None
163
-
164
- class WaitAction(pydantic.BaseModel):
165
- """Wait action to perform during scraping."""
166
- type: Literal["wait"]
167
- milliseconds: Optional[int] = None
168
- selector: Optional[str] = None
169
-
170
- class ScreenshotAction(pydantic.BaseModel):
171
- """Screenshot action to perform during scraping."""
172
- type: Literal["screenshot"]
173
- fullPage: Optional[bool] = None
174
-
175
- class ClickAction(pydantic.BaseModel):
176
- """Click action to perform during scraping."""
177
- type: Literal["click"]
178
- selector: str
179
-
180
- class WriteAction(pydantic.BaseModel):
181
- """Write action to perform during scraping."""
182
- type: Literal["write"]
183
- text: str
184
-
185
- class PressAction(pydantic.BaseModel):
186
- """Press action to perform during scraping."""
187
- type: Literal["press"]
188
- key: str
189
-
190
- class ScrollAction(pydantic.BaseModel):
191
- """Scroll action to perform during scraping."""
192
- type: Literal["scroll"]
193
- direction: Literal["up", "down"]
194
- selector: Optional[str] = None
195
-
196
- class ScrapeAction(pydantic.BaseModel):
197
- """Scrape action to perform during scraping."""
198
- type: Literal["scrape"]
199
-
200
- class ExecuteJavascriptAction(pydantic.BaseModel):
201
- """Execute javascript action to perform during scraping."""
202
- type: Literal["executeJavascript"]
203
- script: str
204
-
205
-
206
- class ExtractAgent(pydantic.BaseModel):
207
- """Configuration for the agent in extract operations."""
208
- model: Literal["FIRE-1"] = "FIRE-1"
209
-
210
- class JsonConfig(pydantic.BaseModel):
211
- """Configuration for extraction."""
212
- prompt: Optional[str] = None
213
- schema: Optional[Any] = None
214
- systemPrompt: Optional[str] = None
215
- agent: Optional[ExtractAgent] = None
216
-
217
- class ScrapeParams(ScrapeOptions):
218
- """Parameters for scraping operations."""
219
- extract: Optional[JsonConfig] = None
220
- jsonOptions: Optional[JsonConfig] = None
221
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
222
- agent: Optional[AgentOptions] = None
223
- webhook: Optional[WebhookConfig] = None
224
-
225
- class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
226
- """Response from scraping operations."""
227
- success: bool = True
228
- warning: Optional[str] = None
229
- error: Optional[str] = None
230
-
231
- class BatchScrapeResponse(pydantic.BaseModel):
232
- """Response from batch scrape operations."""
233
- id: Optional[str] = None
234
- url: Optional[str] = None
235
- success: bool = True
236
- error: Optional[str] = None
237
- invalidURLs: Optional[List[str]] = None
238
-
239
- class BatchScrapeStatusResponse(pydantic.BaseModel):
240
- """Response from batch scrape status checks."""
241
- success: bool = True
242
- status: Literal["scraping", "completed", "failed", "cancelled"]
243
- completed: int
244
- total: int
245
- creditsUsed: int
246
- expiresAt: datetime
247
- next: Optional[str] = None
248
- data: List[FirecrawlDocument]
249
-
250
- class CrawlParams(pydantic.BaseModel):
251
- """Parameters for crawling operations."""
252
- includePaths: Optional[List[str]] = None
253
- excludePaths: Optional[List[str]] = None
254
- maxDepth: Optional[int] = None
255
- maxDiscoveryDepth: Optional[int] = None
256
- limit: Optional[int] = None
257
- allowBackwardLinks: Optional[bool] = None
258
- allowExternalLinks: Optional[bool] = None
259
- ignoreSitemap: Optional[bool] = None
260
- scrapeOptions: Optional[ScrapeOptions] = None
261
- webhook: Optional[Union[str, WebhookConfig]] = None
262
- deduplicateSimilarURLs: Optional[bool] = None
263
- ignoreQueryParameters: Optional[bool] = None
264
- regexOnFullURL: Optional[bool] = None
265
- delay: Optional[int] = None # Delay in seconds between scrapes
266
- maxConcurrency: Optional[int] = None
267
-
268
- class CrawlResponse(pydantic.BaseModel):
269
- """Response from crawling operations."""
270
- id: Optional[str] = None
271
- url: Optional[str] = None
272
- success: bool = True
273
- error: Optional[str] = None
274
-
275
- class CrawlStatusResponse(pydantic.BaseModel):
276
- """Response from crawl status checks."""
277
- success: bool = True
278
- status: Literal["scraping", "completed", "failed", "cancelled"]
279
- completed: int
280
- total: int
281
- creditsUsed: int
282
- expiresAt: datetime
283
- next: Optional[str] = None
284
- data: List[FirecrawlDocument]
285
-
286
- class CrawlErrorsResponse(pydantic.BaseModel):
287
- """Response from crawl/batch scrape error monitoring."""
288
- errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str}
289
- robotsBlocked: List[str]
290
-
291
- class MapParams(pydantic.BaseModel):
292
- """Parameters for mapping operations."""
293
- search: Optional[str] = None
294
- ignoreSitemap: Optional[bool] = None
295
- includeSubdomains: Optional[bool] = None
296
- sitemapOnly: Optional[bool] = None
297
- limit: Optional[int] = None
298
- timeout: Optional[int] = None
299
- useIndex: Optional[bool] = None
300
-
301
- class MapResponse(pydantic.BaseModel):
302
- """Response from mapping operations."""
303
- success: bool = True
304
- links: Optional[List[str]] = None
305
- error: Optional[str] = None
306
-
307
- class ExtractParams(pydantic.BaseModel):
308
- """Parameters for extracting information from URLs."""
309
- prompt: Optional[str] = None
310
- schema: Optional[Any] = None
311
- systemPrompt: Optional[str] = None
312
- allowExternalLinks: Optional[bool] = None
313
- enableWebSearch: Optional[bool] = None
314
- includeSubdomains: Optional[bool] = None
315
- origin: Optional[str] = None
316
- showSources: Optional[bool] = None
317
- scrapeOptions: Optional[ScrapeOptions] = None
318
-
319
- class ExtractResponse(pydantic.BaseModel, Generic[T]):
320
- """Response from extract operations."""
321
- id: Optional[str] = None
322
- status: Optional[Literal["processing", "completed", "failed"]] = None
323
- expiresAt: Optional[datetime] = None
324
- success: bool = True
325
- data: Optional[T] = None
326
- error: Optional[str] = None
327
- warning: Optional[str] = None
328
- sources: Optional[List[str]] = None
329
-
330
- class SearchParams(pydantic.BaseModel):
331
- query: str
332
- limit: Optional[int] = 5
333
- tbs: Optional[str] = None
334
- filter: Optional[str] = None
335
- lang: Optional[str] = "en"
336
- country: Optional[str] = "us"
337
- location: Optional[str] = None
338
- origin: Optional[str] = "api"
339
- timeout: Optional[int] = 60000
340
- scrapeOptions: Optional[ScrapeOptions] = None
341
-
342
- class SearchResponse(pydantic.BaseModel):
343
- """Response from search operations."""
344
- success: bool = True
345
- data: List[FirecrawlDocument]
346
- warning: Optional[str] = None
347
- error: Optional[str] = None
348
-
349
- class GenerateLLMsTextParams(pydantic.BaseModel):
350
- """
351
- Parameters for the LLMs.txt generation operation.
352
- """
353
- maxUrls: Optional[int] = 10
354
- showFullText: Optional[bool] = False
355
- cache: Optional[bool] = True
356
- __experimental_stream: Optional[bool] = None
357
-
358
- class DeepResearchParams(pydantic.BaseModel):
359
- """
360
- Parameters for the deep research operation.
361
- """
362
- maxDepth: Optional[int] = 7
363
- timeLimit: Optional[int] = 270
364
- maxUrls: Optional[int] = 20
365
- analysisPrompt: Optional[str] = None
366
- systemPrompt: Optional[str] = None
367
- __experimental_streamSteps: Optional[bool] = None
368
-
369
- class DeepResearchResponse(pydantic.BaseModel):
370
- """
371
- Response from the deep research operation.
372
- """
373
- success: bool
374
- id: str
375
- error: Optional[str] = None
376
-
377
- class DeepResearchStatusResponse(pydantic.BaseModel):
378
- """
379
- Status response from the deep research operation.
380
- """
381
- success: bool
382
- data: Optional[Dict[str, Any]] = None
383
- status: str
384
- error: Optional[str] = None
385
- expiresAt: str
386
- currentDepth: int
387
- maxDepth: int
388
- activities: List[Dict[str, Any]]
389
- sources: List[Dict[str, Any]]
390
- summaries: List[str]
391
-
392
- class GenerateLLMsTextResponse(pydantic.BaseModel):
393
- """Response from LLMs.txt generation operations."""
394
- success: bool = True
395
- id: str
396
- error: Optional[str] = None
397
-
398
- class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
399
- llmstxt: str
400
- llmsfulltxt: Optional[str] = None
401
-
402
- class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
403
- """Status response from LLMs.txt generation operations."""
404
- success: bool = True
405
- data: Optional[GenerateLLMsTextStatusResponseData] = None
406
- status: Literal["processing", "completed", "failed"]
407
- error: Optional[str] = None
408
- expiresAt: str
409
-
410
- class SearchResponse(pydantic.BaseModel):
411
- """
412
- Response from the search operation.
413
- """
414
- success: bool
415
- data: List[Dict[str, Any]]
416
- warning: Optional[str] = None
417
- error: Optional[str] = None
418
-
419
- class ExtractParams(pydantic.BaseModel):
420
- """
421
- Parameters for the extract operation.
422
- """
423
- prompt: Optional[str] = None
424
- schema: Optional[Any] = pydantic.Field(None, alias='schema')
425
- system_prompt: Optional[str] = None
426
- allow_external_links: Optional[bool] = False
427
- enable_web_search: Optional[bool] = False
428
- # Just for backwards compatibility
429
- enableWebSearch: Optional[bool] = False
430
- show_sources: Optional[bool] = False
431
- agent: Optional[Dict[str, Any]] = None
432
-
433
- class FirecrawlApp:
434
- def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
435
- """
436
- Initialize the FirecrawlApp instance with API key, API URL.
437
-
438
- Args:
439
- api_key (Optional[str]): API key for authenticating with the Firecrawl API.
440
- api_url (Optional[str]): Base URL for the Firecrawl API.
441
- """
442
- self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
443
- self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
444
-
445
- # Only require API key when using cloud service
446
- if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
447
- logger.warning("No API key provided for cloud service")
448
- raise ValueError('No API key provided')
449
-
450
- logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
451
-
452
- def scrape_url(
453
- self,
454
- url: str,
455
- *,
456
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
457
- include_tags: Optional[List[str]] = None,
458
- exclude_tags: Optional[List[str]] = None,
459
- only_main_content: Optional[bool] = None,
460
- wait_for: Optional[int] = None,
461
- timeout: Optional[int] = None,
462
- location: Optional[LocationConfig] = None,
463
- mobile: Optional[bool] = None,
464
- skip_tls_verification: Optional[bool] = None,
465
- remove_base64_images: Optional[bool] = None,
466
- block_ads: Optional[bool] = None,
467
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
468
- extract: Optional[JsonConfig] = None,
469
- json_options: Optional[JsonConfig] = None,
470
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
471
- change_tracking_options: Optional[ChangeTrackingOptions] = None,
472
- max_age: Optional[int] = None,
473
- store_in_cache: Optional[bool] = None,
474
- **kwargs) -> ScrapeResponse[Any]:
475
- """
476
- Scrape and extract content from a URL.
477
-
478
- Args:
479
- url (str): Target URL to scrape
480
- formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
481
- include_tags (Optional[List[str]]): HTML tags to include
482
- exclude_tags (Optional[List[str]]): HTML tags to exclude
483
- only_main_content (Optional[bool]): Extract main content only
484
- wait_for (Optional[int]): Wait for a specific element to appear
485
- timeout (Optional[int]): Request timeout (ms)
486
- location (Optional[LocationConfig]): Location configuration
487
- mobile (Optional[bool]): Use mobile user agent
488
- skip_tls_verification (Optional[bool]): Skip TLS verification
489
- remove_base64_images (Optional[bool]): Remove base64 images
490
- block_ads (Optional[bool]): Block ads
491
- proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
492
- extract (Optional[JsonConfig]): Content extraction settings
493
- json_options (Optional[JsonConfig]): JSON extraction settings
494
- actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
495
- change_tracking_options (Optional[ChangeTrackingOptions]): Change tracking settings
496
-
497
-
498
- Returns:
499
- ScrapeResponse with:
500
- * Requested content formats
501
- * Page metadata
502
- * Extraction results
503
- * Success/error status
504
-
505
- Raises:
506
- Exception: If scraping fails
507
- """
508
- headers = self._prepare_headers()
509
-
510
- # Build scrape parameters
511
- scrape_params = {
512
- 'url': url,
513
- 'origin': f"python-sdk@{version}"
514
- }
515
-
516
- # Add optional parameters if provided
517
- if formats:
518
- scrape_params['formats'] = formats
519
- if include_tags:
520
- scrape_params['includeTags'] = include_tags
521
- if exclude_tags:
522
- scrape_params['excludeTags'] = exclude_tags
523
- if only_main_content is not None:
524
- scrape_params['onlyMainContent'] = only_main_content
525
- if wait_for:
526
- scrape_params['waitFor'] = wait_for
527
- if timeout:
528
- scrape_params['timeout'] = timeout
529
- if location:
530
- scrape_params['location'] = location.dict(exclude_none=True)
531
- if mobile is not None:
532
- scrape_params['mobile'] = mobile
533
- if skip_tls_verification is not None:
534
- scrape_params['skipTlsVerification'] = skip_tls_verification
535
- if remove_base64_images is not None:
536
- scrape_params['removeBase64Images'] = remove_base64_images
537
- if block_ads is not None:
538
- scrape_params['blockAds'] = block_ads
539
- if proxy:
540
- scrape_params['proxy'] = proxy
541
- if extract is not None:
542
- extract = self._ensure_schema_dict(extract)
543
- if isinstance(extract, dict) and "schema" in extract:
544
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
545
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
546
- if json_options is not None:
547
- json_options = self._ensure_schema_dict(json_options)
548
- if isinstance(json_options, dict) and "schema" in json_options:
549
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
550
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
551
- if actions:
552
- scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
553
- if change_tracking_options:
554
- scrape_params['changeTrackingOptions'] = change_tracking_options if isinstance(change_tracking_options, dict) else change_tracking_options.dict(exclude_none=True)
555
- if max_age is not None:
556
- scrape_params['maxAge'] = max_age
557
- if store_in_cache is not None:
558
- scrape_params['storeInCache'] = store_in_cache
559
-
560
- scrape_params.update(kwargs)
561
-
562
- if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
563
- scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
564
- if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
565
- scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
566
-
567
- # Make request
568
- response = requests.post(
569
- f'{self.api_url}/v1/scrape',
570
- headers=headers,
571
- json=scrape_params,
572
- timeout=(timeout + 5000 if timeout else None)
573
- )
574
-
575
- if response.status_code == 200:
576
- try:
577
- response_json = response.json()
578
- if response_json.get('success') and 'data' in response_json:
579
- return ScrapeResponse(**response_json['data'])
580
- elif "error" in response_json:
581
- raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
582
- else:
583
- raise Exception(f'Failed to scrape URL. Error: {response_json}')
584
- except ValueError:
585
- raise Exception('Failed to parse Firecrawl response as JSON.')
586
- else:
587
- self._handle_error(response, 'scrape URL')
588
-
589
- def search(
590
- self,
591
- query: str,
592
- *,
593
- limit: Optional[int] = None,
594
- tbs: Optional[str] = None,
595
- filter: Optional[str] = None,
596
- lang: Optional[str] = None,
597
- country: Optional[str] = None,
598
- location: Optional[str] = None,
599
- timeout: Optional[int] = None,
600
- scrape_options: Optional[ScrapeOptions] = None,
601
- **kwargs) -> SearchResponse:
602
- """
603
- Search for content using Firecrawl.
604
-
605
- Args:
606
- query (str): Search query string
607
- limit (Optional[int]): Max results (default: 5)
608
- tbs (Optional[str]): Time filter (e.g. "qdr:d")
609
- filter (Optional[str]): Custom result filter
610
- lang (Optional[str]): Language code (default: "en")
611
- country (Optional[str]): Country code (default: "us")
612
- location (Optional[str]): Geo-targeting
613
- timeout (Optional[int]): Request timeout in milliseconds
614
- scrape_options (Optional[ScrapeOptions]): Result scraping configuration
615
- **kwargs: Additional keyword arguments for future compatibility
616
-
617
- Returns:
618
- SearchResponse: Response containing:
619
- * success (bool): Whether request succeeded
620
- * data (List[FirecrawlDocument]): Search results
621
- * warning (Optional[str]): Warning message if any
622
- * error (Optional[str]): Error message if any
623
-
624
- Raises:
625
- Exception: If search fails or response cannot be parsed
626
- """
627
- # Validate any additional kwargs
628
- self._validate_kwargs(kwargs, "search")
629
-
630
- # Build search parameters
631
- search_params = {}
632
-
633
- # Add individual parameters
634
- if limit is not None:
635
- search_params['limit'] = limit
636
- if tbs is not None:
637
- search_params['tbs'] = tbs
638
- if filter is not None:
639
- search_params['filter'] = filter
640
- if lang is not None:
641
- search_params['lang'] = lang
642
- if country is not None:
643
- search_params['country'] = country
644
- if location is not None:
645
- search_params['location'] = location
646
- if timeout is not None:
647
- search_params['timeout'] = timeout
648
- if scrape_options is not None:
649
- search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
650
-
651
- # Add any additional kwargs
652
- search_params.update(kwargs)
653
-
654
- # Create final params object
655
- final_params = SearchParams(query=query, **search_params)
656
- params_dict = final_params.dict(exclude_none=True)
657
- params_dict['origin'] = f"python-sdk@{version}"
658
-
659
- # Make request
660
- response = requests.post(
661
- f"{self.api_url}/v1/search",
662
- headers={"Authorization": f"Bearer {self.api_key}"},
663
- json=params_dict
664
- )
665
-
666
- if response.status_code == 200:
667
- try:
668
- response_json = response.json()
669
- if response_json.get('success') and 'data' in response_json:
670
- return SearchResponse(**response_json)
671
- elif "error" in response_json:
672
- raise Exception(f'Search failed. Error: {response_json["error"]}')
673
- else:
674
- raise Exception(f'Search failed. Error: {response_json}')
675
- except ValueError:
676
- raise Exception('Failed to parse Firecrawl response as JSON.')
677
- else:
678
- self._handle_error(response, 'search')
679
-
680
- def crawl_url(
681
- self,
682
- url: str,
683
- *,
684
- include_paths: Optional[List[str]] = None,
685
- exclude_paths: Optional[List[str]] = None,
686
- max_depth: Optional[int] = None,
687
- max_discovery_depth: Optional[int] = None,
688
- limit: Optional[int] = None,
689
- allow_backward_links: Optional[bool] = None,
690
- crawl_entire_domain: Optional[bool] = None,
691
- allow_external_links: Optional[bool] = None,
692
- ignore_sitemap: Optional[bool] = None,
693
- scrape_options: Optional[ScrapeOptions] = None,
694
- webhook: Optional[Union[str, WebhookConfig]] = None,
695
- deduplicate_similar_urls: Optional[bool] = None,
696
- ignore_query_parameters: Optional[bool] = None,
697
- regex_on_full_url: Optional[bool] = None,
698
- delay: Optional[int] = None,
699
- max_concurrency: Optional[int] = None,
700
- poll_interval: Optional[int] = 2,
701
- idempotency_key: Optional[str] = None,
702
- **kwargs
703
- ) -> CrawlStatusResponse:
704
- """
705
- Crawl a website starting from a URL.
706
-
707
- Args:
708
- url (str): Target URL to start crawling from
709
- include_paths (Optional[List[str]]): Patterns of URLs to include
710
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
711
- max_depth (Optional[int]): Maximum crawl depth
712
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
713
- limit (Optional[int]): Maximum pages to crawl
714
- allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
715
- crawl_entire_domain (Optional[bool]): Follow parent directory links
716
- allow_external_links (Optional[bool]): Follow external domain links
717
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
718
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
719
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
720
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
721
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
722
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
723
- delay (Optional[int]): Delay in seconds between scrapes
724
- max_concurrency (Optional[int]): Maximum number of concurrent scrapes
725
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
726
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
727
- **kwargs: Additional parameters to pass to the API
728
-
729
- Returns:
730
- CrawlStatusResponse with:
731
- * Crawling status and progress
732
- * Crawled page contents
733
- * Success/error information
734
-
735
- Raises:
736
- Exception: If crawl fails
737
- """
738
- # Validate any additional kwargs
739
- self._validate_kwargs(kwargs, "crawl_url")
740
-
741
- crawl_params = {}
742
-
743
- # Add individual parameters
744
- if include_paths is not None:
745
- crawl_params['includePaths'] = include_paths
746
- if exclude_paths is not None:
747
- crawl_params['excludePaths'] = exclude_paths
748
- if max_depth is not None:
749
- crawl_params['maxDepth'] = max_depth
750
- if max_discovery_depth is not None:
751
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
752
- if limit is not None:
753
- crawl_params['limit'] = limit
754
- if crawl_entire_domain is not None:
755
- crawl_params['crawlEntireDomain'] = crawl_entire_domain
756
- elif allow_backward_links is not None:
757
- crawl_params['allowBackwardLinks'] = allow_backward_links
758
- if allow_external_links is not None:
759
- crawl_params['allowExternalLinks'] = allow_external_links
760
- if ignore_sitemap is not None:
761
- crawl_params['ignoreSitemap'] = ignore_sitemap
762
- if scrape_options is not None:
763
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
764
- if webhook is not None:
765
- crawl_params['webhook'] = webhook
766
- if deduplicate_similar_urls is not None:
767
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
768
- if ignore_query_parameters is not None:
769
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
770
- if regex_on_full_url is not None:
771
- crawl_params['regexOnFullURL'] = regex_on_full_url
772
- if delay is not None:
773
- crawl_params['delay'] = delay
774
- if max_concurrency is not None:
775
- crawl_params['maxConcurrency'] = max_concurrency
776
-
777
- # Add any additional kwargs
778
- crawl_params.update(kwargs)
779
-
780
- # Create final params object
781
- final_params = CrawlParams(**crawl_params)
782
- params_dict = final_params.dict(exclude_none=True)
783
- params_dict['url'] = url
784
- params_dict['origin'] = f"python-sdk@{version}"
785
-
786
- # Make request
787
- headers = self._prepare_headers(idempotency_key)
788
- response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
789
-
790
- if response.status_code == 200:
791
- try:
792
- id = response.json().get('id')
793
- except:
794
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
795
- return self._monitor_job_status(id, headers, poll_interval)
796
- else:
797
- self._handle_error(response, 'start crawl job')
798
-
799
- def async_crawl_url(
800
- self,
801
- url: str,
802
- *,
803
- include_paths: Optional[List[str]] = None,
804
- exclude_paths: Optional[List[str]] = None,
805
- max_depth: Optional[int] = None,
806
- max_discovery_depth: Optional[int] = None,
807
- limit: Optional[int] = None,
808
- allow_backward_links: Optional[bool] = None,
809
- crawl_entire_domain: Optional[bool] = None,
810
- allow_external_links: Optional[bool] = None,
811
- ignore_sitemap: Optional[bool] = None,
812
- scrape_options: Optional[ScrapeOptions] = None,
813
- webhook: Optional[Union[str, WebhookConfig]] = None,
814
- deduplicate_similar_urls: Optional[bool] = None,
815
- ignore_query_parameters: Optional[bool] = None,
816
- regex_on_full_url: Optional[bool] = None,
817
- delay: Optional[int] = None,
818
- idempotency_key: Optional[str] = None,
819
- **kwargs
820
- ) -> CrawlResponse:
821
- """
822
- Start an asynchronous crawl job.
823
-
824
- Args:
825
- url (str): Target URL to start crawling from
826
- include_paths (Optional[List[str]]): Patterns of URLs to include
827
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
828
- max_depth (Optional[int]): Maximum crawl depth
829
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
830
- limit (Optional[int]): Maximum pages to crawl
831
- allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
832
- crawl_entire_domain (Optional[bool]): Follow parent directory links
833
- allow_external_links (Optional[bool]): Follow external domain links
834
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
835
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
836
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
837
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
838
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
839
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
840
- delay (Optional[int]): Delay in seconds between scrapes
841
- max_concurrency (Optional[int]): Maximum number of concurrent scrapes
842
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
843
- **kwargs: Additional parameters to pass to the API
844
-
845
- Returns:
846
- CrawlResponse with:
847
- * success - Whether crawl started successfully
848
- * id - Unique identifier for the crawl job
849
- * url - Status check URL for the crawl
850
- * error - Error message if start failed
851
-
852
- Raises:
853
- Exception: If crawl initiation fails
854
- """
855
- # Validate any additional kwargs
856
- self._validate_kwargs(kwargs, "async_crawl_url")
857
-
858
- crawl_params = {}
859
-
860
- # Add individual parameters
861
- if include_paths is not None:
862
- crawl_params['includePaths'] = include_paths
863
- if exclude_paths is not None:
864
- crawl_params['excludePaths'] = exclude_paths
865
- if max_depth is not None:
866
- crawl_params['maxDepth'] = max_depth
867
- if max_discovery_depth is not None:
868
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
869
- if limit is not None:
870
- crawl_params['limit'] = limit
871
- if crawl_entire_domain is not None:
872
- crawl_params['crawlEntireDomain'] = crawl_entire_domain
873
- elif allow_backward_links is not None:
874
- crawl_params['allowBackwardLinks'] = allow_backward_links
875
- if allow_external_links is not None:
876
- crawl_params['allowExternalLinks'] = allow_external_links
877
- if ignore_sitemap is not None:
878
- crawl_params['ignoreSitemap'] = ignore_sitemap
879
- if scrape_options is not None:
880
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
881
- if webhook is not None:
882
- crawl_params['webhook'] = webhook
883
- if deduplicate_similar_urls is not None:
884
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
885
- if ignore_query_parameters is not None:
886
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
887
- if regex_on_full_url is not None:
888
- crawl_params['regexOnFullURL'] = regex_on_full_url
889
- if delay is not None:
890
- crawl_params['delay'] = delay
891
- if max_concurrency is not None:
892
- crawl_params['maxConcurrency'] = max_concurrency
893
-
894
- # Add any additional kwargs
895
- crawl_params.update(kwargs)
896
-
897
- # Create final params object
898
- final_params = CrawlParams(**crawl_params)
899
- params_dict = final_params.dict(exclude_none=True)
900
- params_dict['url'] = url
901
- params_dict['origin'] = f"python-sdk@{version}"
902
-
903
- # Make request
904
- headers = self._prepare_headers(idempotency_key)
905
- response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
906
-
907
- if response.status_code == 200:
908
- try:
909
- return CrawlResponse(**response.json())
910
- except:
911
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
912
- else:
913
- self._handle_error(response, 'start crawl job')
914
-
915
- def check_crawl_status(self, id: str) -> CrawlStatusResponse:
916
- """
917
- Check the status and results of a crawl job.
918
-
919
- Args:
920
- id: Unique identifier for the crawl job
921
-
922
- Returns:
923
- CrawlStatusResponse containing:
924
-
925
- Status Information:
926
- * status - Current state (scraping/completed/failed/cancelled)
927
- * completed - Number of pages crawled
928
- * total - Total pages to crawl
929
- * creditsUsed - API credits consumed
930
- * expiresAt - Data expiration timestamp
931
-
932
- Results:
933
- * data - List of crawled documents
934
- * next - URL for next page of results (if paginated)
935
- * success - Whether status check succeeded
936
- * error - Error message if failed
937
-
938
- Raises:
939
- Exception: If status check fails
940
- """
941
- endpoint = f'/v1/crawl/{id}'
942
-
943
- headers = self._prepare_headers()
944
- response = self._get_request(f'{self.api_url}{endpoint}', headers)
945
- if response.status_code == 200:
946
- try:
947
- status_data = response.json()
948
- except:
949
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
950
- if status_data['status'] == 'completed':
951
- if 'data' in status_data:
952
- data = status_data['data']
953
- while 'next' in status_data:
954
- if len(status_data['data']) == 0:
955
- break
956
- next_url = status_data.get('next')
957
- if not next_url:
958
- logger.warning("Expected 'next' URL is missing.")
959
- break
960
- try:
961
- status_response = self._get_request(next_url, headers)
962
- if status_response.status_code != 200:
963
- logger.error(f"Failed to fetch next page: {status_response.status_code}")
964
- break
965
- try:
966
- next_data = status_response.json()
967
- except:
968
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
969
- data.extend(next_data.get('data', []))
970
- status_data = next_data
971
- except Exception as e:
972
- logger.error(f"Error during pagination request: {e}")
973
- break
974
- status_data['data'] = data
975
-
976
- response = {
977
- 'status': status_data.get('status'),
978
- 'total': status_data.get('total'),
979
- 'completed': status_data.get('completed'),
980
- 'creditsUsed': status_data.get('creditsUsed'),
981
- 'expiresAt': status_data.get('expiresAt'),
982
- 'data': status_data.get('data')
983
- }
984
-
985
- if 'error' in status_data:
986
- response['error'] = status_data['error']
987
-
988
- if 'next' in status_data:
989
- response['next'] = status_data['next']
990
-
991
- return CrawlStatusResponse(
992
- success=False if 'error' in status_data else True,
993
- **response
994
- )
995
- else:
996
- self._handle_error(response, 'check crawl status')
997
-
998
- def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
999
- """
1000
- Returns information about crawl errors.
1001
-
1002
- Args:
1003
- id (str): The ID of the crawl job
1004
-
1005
- Returns:
1006
- CrawlErrorsResponse containing:
1007
- * errors (List[Dict[str, str]]): List of errors with fields:
1008
- - id (str): Error ID
1009
- - timestamp (str): When the error occurred
1010
- - url (str): URL that caused the error
1011
- - error (str): Error message
1012
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1013
-
1014
- Raises:
1015
- Exception: If error check fails
1016
- """
1017
- headers = self._prepare_headers()
1018
- response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
1019
- if response.status_code == 200:
1020
- try:
1021
- return CrawlErrorsResponse(**response.json())
1022
- except:
1023
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1024
- else:
1025
- self._handle_error(response, "check crawl errors")
1026
-
1027
- def cancel_crawl(self, id: str) -> Dict[str, Any]:
1028
- """
1029
- Cancel an asynchronous crawl job.
1030
-
1031
- Args:
1032
- id (str): The ID of the crawl job to cancel
1033
-
1034
- Returns:
1035
- Dict[str, Any] containing:
1036
- * success (bool): Whether cancellation was successful
1037
- * error (str, optional): Error message if cancellation failed
1038
-
1039
- Raises:
1040
- Exception: If cancellation fails
1041
- """
1042
- headers = self._prepare_headers()
1043
- response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
1044
- if response.status_code == 200:
1045
- try:
1046
- return response.json()
1047
- except:
1048
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1049
- else:
1050
- self._handle_error(response, "cancel crawl job")
1051
-
1052
- def crawl_url_and_watch(
1053
- self,
1054
- url: str,
1055
- *,
1056
- include_paths: Optional[List[str]] = None,
1057
- exclude_paths: Optional[List[str]] = None,
1058
- max_depth: Optional[int] = None,
1059
- max_discovery_depth: Optional[int] = None,
1060
- limit: Optional[int] = None,
1061
- allow_backward_links: Optional[bool] = None,
1062
- crawl_entire_domain: Optional[bool] = None,
1063
- allow_external_links: Optional[bool] = None,
1064
- ignore_sitemap: Optional[bool] = None,
1065
- scrape_options: Optional[ScrapeOptions] = None,
1066
- webhook: Optional[Union[str, WebhookConfig]] = None,
1067
- deduplicate_similar_urls: Optional[bool] = None,
1068
- ignore_query_parameters: Optional[bool] = None,
1069
- regex_on_full_url: Optional[bool] = None,
1070
- delay: Optional[int] = None,
1071
- max_concurrency: Optional[int] = None,
1072
- idempotency_key: Optional[str] = None,
1073
- **kwargs
1074
- ) -> 'CrawlWatcher':
1075
- """
1076
- Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
1077
-
1078
- Args:
1079
- url (str): Target URL to start crawling from
1080
- include_paths (Optional[List[str]]): Patterns of URLs to include
1081
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
1082
- max_depth (Optional[int]): Maximum crawl depth
1083
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
1084
- limit (Optional[int]): Maximum pages to crawl
1085
- allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
1086
- crawl_entire_domain (Optional[bool]): Follow parent directory links
1087
- allow_external_links (Optional[bool]): Follow external domain links
1088
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1089
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
1090
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
1091
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
1092
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
1093
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
1094
- delay (Optional[int]): Delay in seconds between scrapes
1095
- max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1096
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1097
- **kwargs: Additional parameters to pass to the API
1098
-
1099
- Returns:
1100
- CrawlWatcher: An instance to monitor the crawl job via WebSocket
1101
-
1102
- Raises:
1103
- Exception: If crawl job fails to start
1104
- """
1105
- crawl_response = self.async_crawl_url(
1106
- url,
1107
- include_paths=include_paths,
1108
- exclude_paths=exclude_paths,
1109
- max_depth=max_depth,
1110
- max_discovery_depth=max_discovery_depth,
1111
- limit=limit,
1112
- allow_backward_links=allow_backward_links,
1113
- allow_external_links=allow_external_links,
1114
- ignore_sitemap=ignore_sitemap,
1115
- scrape_options=scrape_options,
1116
- webhook=webhook,
1117
- deduplicate_similar_urls=deduplicate_similar_urls,
1118
- ignore_query_parameters=ignore_query_parameters,
1119
- regex_on_full_url=regex_on_full_url,
1120
- delay=delay,
1121
- max_concurrency=max_concurrency,
1122
- idempotency_key=idempotency_key,
1123
- **kwargs
1124
- )
1125
- if crawl_response.success and crawl_response.id:
1126
- return CrawlWatcher(crawl_response.id, self)
1127
- else:
1128
- raise Exception("Crawl job failed to start")
1129
-
1130
- def map_url(
1131
- self,
1132
- url: str,
1133
- *,
1134
- search: Optional[str] = None,
1135
- ignore_sitemap: Optional[bool] = None,
1136
- include_subdomains: Optional[bool] = None,
1137
- sitemap_only: Optional[bool] = None,
1138
- limit: Optional[int] = None,
1139
- timeout: Optional[int] = None,
1140
- use_index: Optional[bool] = None,
1141
- **kwargs) -> MapResponse:
1142
- """
1143
- Map and discover links from a URL.
1144
-
1145
- Args:
1146
- url (str): Target URL to map
1147
- search (Optional[str]): Filter pattern for URLs
1148
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1149
- include_subdomains (Optional[bool]): Include subdomain links
1150
- sitemap_only (Optional[bool]): Only use sitemap.xml
1151
- limit (Optional[int]): Maximum URLs to return
1152
- timeout (Optional[int]): Request timeout in milliseconds
1153
- **kwargs: Additional parameters to pass to the API
1154
-
1155
- Returns:
1156
- MapResponse: Response containing:
1157
- * success (bool): Whether request succeeded
1158
- * links (List[str]): Discovered URLs
1159
- * error (Optional[str]): Error message if any
1160
-
1161
- Raises:
1162
- Exception: If mapping fails or response cannot be parsed
1163
- """
1164
- # Validate any additional kwargs
1165
- self._validate_kwargs(kwargs, "map_url")
1166
-
1167
- # Build map parameters
1168
- map_params = {}
1169
-
1170
- # Add individual parameters
1171
- if search is not None:
1172
- map_params['search'] = search
1173
- if ignore_sitemap is not None:
1174
- map_params['ignoreSitemap'] = ignore_sitemap
1175
- if include_subdomains is not None:
1176
- map_params['includeSubdomains'] = include_subdomains
1177
- if sitemap_only is not None:
1178
- map_params['sitemapOnly'] = sitemap_only
1179
- if limit is not None:
1180
- map_params['limit'] = limit
1181
- if timeout is not None:
1182
- map_params['timeout'] = timeout
1183
- if use_index is not None:
1184
- map_params['useIndex'] = use_index
1185
-
1186
- # Add any additional kwargs
1187
- map_params.update(kwargs)
1188
-
1189
- # Create final params object
1190
- final_params = MapParams(**map_params)
1191
- params_dict = final_params.dict(exclude_none=True)
1192
- params_dict['url'] = url
1193
- params_dict['origin'] = f"python-sdk@{version}"
1194
-
1195
- # Make request
1196
- response = requests.post(
1197
- f"{self.api_url}/v1/map",
1198
- headers={"Authorization": f"Bearer {self.api_key}"},
1199
- json=params_dict
1200
- )
1201
-
1202
- if response.status_code == 200:
1203
- try:
1204
- response_json = response.json()
1205
- if response_json.get('success') and 'links' in response_json:
1206
- return MapResponse(**response_json)
1207
- elif "error" in response_json:
1208
- raise Exception(f'Map failed. Error: {response_json["error"]}')
1209
- else:
1210
- raise Exception(f'Map failed. Error: {response_json}')
1211
- except ValueError:
1212
- raise Exception('Failed to parse Firecrawl response as JSON.')
1213
- else:
1214
- self._handle_error(response, 'map')
1215
-
1216
- def batch_scrape_urls(
1217
- self,
1218
- urls: List[str],
1219
- *,
1220
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1221
- headers: Optional[Dict[str, str]] = None,
1222
- include_tags: Optional[List[str]] = None,
1223
- exclude_tags: Optional[List[str]] = None,
1224
- only_main_content: Optional[bool] = None,
1225
- wait_for: Optional[int] = None,
1226
- timeout: Optional[int] = None,
1227
- location: Optional[LocationConfig] = None,
1228
- mobile: Optional[bool] = None,
1229
- skip_tls_verification: Optional[bool] = None,
1230
- remove_base64_images: Optional[bool] = None,
1231
- block_ads: Optional[bool] = None,
1232
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1233
- extract: Optional[JsonConfig] = None,
1234
- json_options: Optional[JsonConfig] = None,
1235
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1236
- agent: Optional[AgentOptions] = None,
1237
- poll_interval: Optional[int] = 2,
1238
- max_concurrency: Optional[int] = None,
1239
- idempotency_key: Optional[str] = None,
1240
- **kwargs
1241
- ) -> BatchScrapeStatusResponse:
1242
- """
1243
- Batch scrape multiple URLs and monitor until completion.
1244
-
1245
- Args:
1246
- urls (List[str]): URLs to scrape
1247
- formats (Optional[List[Literal]]): Content formats to retrieve
1248
- headers (Optional[Dict[str, str]]): Custom HTTP headers
1249
- include_tags (Optional[List[str]]): HTML tags to include
1250
- exclude_tags (Optional[List[str]]): HTML tags to exclude
1251
- only_main_content (Optional[bool]): Extract main content only
1252
- wait_for (Optional[int]): Wait time in milliseconds
1253
- timeout (Optional[int]): Request timeout in milliseconds
1254
- location (Optional[LocationConfig]): Location configuration
1255
- mobile (Optional[bool]): Use mobile user agent
1256
- skip_tls_verification (Optional[bool]): Skip TLS verification
1257
- remove_base64_images (Optional[bool]): Remove base64 encoded images
1258
- block_ads (Optional[bool]): Block advertisements
1259
- proxy (Optional[Literal]): Proxy type to use
1260
- extract (Optional[JsonConfig]): Content extraction config
1261
- json_options (Optional[JsonConfig]): JSON extraction config
1262
- actions (Optional[List[Union]]): Actions to perform
1263
- agent (Optional[AgentOptions]): Agent configuration
1264
- max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1265
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
1266
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1267
- **kwargs: Additional parameters to pass to the API
1268
-
1269
- Returns:
1270
- BatchScrapeStatusResponse with:
1271
- * Scraping status and progress
1272
- * Scraped content for each URL
1273
- * Success/error information
1274
-
1275
- Raises:
1276
- Exception: If batch scrape fails
1277
- """
1278
- # Validate any additional kwargs
1279
- self._validate_kwargs(kwargs, "batch_scrape_urls")
1280
-
1281
- scrape_params = {}
1282
-
1283
- # Add individual parameters
1284
- if formats is not None:
1285
- scrape_params['formats'] = formats
1286
- if headers is not None:
1287
- scrape_params['headers'] = headers
1288
- if include_tags is not None:
1289
- scrape_params['includeTags'] = include_tags
1290
- if exclude_tags is not None:
1291
- scrape_params['excludeTags'] = exclude_tags
1292
- if only_main_content is not None:
1293
- scrape_params['onlyMainContent'] = only_main_content
1294
- if wait_for is not None:
1295
- scrape_params['waitFor'] = wait_for
1296
- if timeout is not None:
1297
- scrape_params['timeout'] = timeout
1298
- if location is not None:
1299
- scrape_params['location'] = location.dict(exclude_none=True)
1300
- if mobile is not None:
1301
- scrape_params['mobile'] = mobile
1302
- if skip_tls_verification is not None:
1303
- scrape_params['skipTlsVerification'] = skip_tls_verification
1304
- if remove_base64_images is not None:
1305
- scrape_params['removeBase64Images'] = remove_base64_images
1306
- if block_ads is not None:
1307
- scrape_params['blockAds'] = block_ads
1308
- if proxy is not None:
1309
- scrape_params['proxy'] = proxy
1310
- if extract is not None:
1311
- extract = self._ensure_schema_dict(extract)
1312
- if isinstance(extract, dict) and "schema" in extract:
1313
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
1314
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1315
- if json_options is not None:
1316
- json_options = self._ensure_schema_dict(json_options)
1317
- if isinstance(json_options, dict) and "schema" in json_options:
1318
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1319
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1320
- if actions is not None:
1321
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1322
- if agent is not None:
1323
- scrape_params['agent'] = agent.dict(exclude_none=True)
1324
- if max_concurrency is not None:
1325
- scrape_params['maxConcurrency'] = max_concurrency
1326
-
1327
- # Add any additional kwargs
1328
- scrape_params.update(kwargs)
1329
-
1330
- # Create final params object
1331
- final_params = ScrapeParams(**scrape_params)
1332
- params_dict = final_params.dict(exclude_none=True)
1333
- params_dict['urls'] = urls
1334
- params_dict['origin'] = f"python-sdk@{version}"
1335
-
1336
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1337
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1338
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1339
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1340
-
1341
- # Make request
1342
- headers = self._prepare_headers(idempotency_key)
1343
- response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1344
-
1345
- if response.status_code == 200:
1346
- try:
1347
- id = response.json().get('id')
1348
- except:
1349
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1350
- return self._monitor_job_status(id, headers, poll_interval)
1351
- else:
1352
- self._handle_error(response, 'start batch scrape job')
1353
-
1354
- def async_batch_scrape_urls(
1355
- self,
1356
- urls: List[str],
1357
- *,
1358
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1359
- headers: Optional[Dict[str, str]] = None,
1360
- include_tags: Optional[List[str]] = None,
1361
- exclude_tags: Optional[List[str]] = None,
1362
- only_main_content: Optional[bool] = None,
1363
- wait_for: Optional[int] = None,
1364
- timeout: Optional[int] = None,
1365
- location: Optional[LocationConfig] = None,
1366
- mobile: Optional[bool] = None,
1367
- skip_tls_verification: Optional[bool] = None,
1368
- remove_base64_images: Optional[bool] = None,
1369
- block_ads: Optional[bool] = None,
1370
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1371
- extract: Optional[JsonConfig] = None,
1372
- json_options: Optional[JsonConfig] = None,
1373
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1374
- agent: Optional[AgentOptions] = None,
1375
- max_concurrency: Optional[int] = None,
1376
- idempotency_key: Optional[str] = None,
1377
- **kwargs
1378
- ) -> BatchScrapeResponse:
1379
- """
1380
- Initiate a batch scrape job asynchronously.
1381
-
1382
- Args:
1383
- urls (List[str]): URLs to scrape
1384
- formats (Optional[List[Literal]]): Content formats to retrieve
1385
- headers (Optional[Dict[str, str]]): Custom HTTP headers
1386
- include_tags (Optional[List[str]]): HTML tags to include
1387
- exclude_tags (Optional[List[str]]): HTML tags to exclude
1388
- only_main_content (Optional[bool]): Extract main content only
1389
- wait_for (Optional[int]): Wait time in milliseconds
1390
- timeout (Optional[int]): Request timeout in milliseconds
1391
- location (Optional[LocationConfig]): Location configuration
1392
- mobile (Optional[bool]): Use mobile user agent
1393
- skip_tls_verification (Optional[bool]): Skip TLS verification
1394
- remove_base64_images (Optional[bool]): Remove base64 encoded images
1395
- block_ads (Optional[bool]): Block advertisements
1396
- proxy (Optional[Literal]): Proxy type to use
1397
- extract (Optional[JsonConfig]): Content extraction config
1398
- json_options (Optional[JsonConfig]): JSON extraction config
1399
- actions (Optional[List[Union]]): Actions to perform
1400
- agent (Optional[AgentOptions]): Agent configuration
1401
- max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1402
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1403
- **kwargs: Additional parameters to pass to the API
1404
-
1405
- Returns:
1406
- BatchScrapeResponse with:
1407
- * success - Whether job started successfully
1408
- * id - Unique identifier for the job
1409
- * url - Status check URL
1410
- * error - Error message if start failed
1411
-
1412
- Raises:
1413
- Exception: If job initiation fails
1414
- """
1415
- # Validate any additional kwargs
1416
- self._validate_kwargs(kwargs, "async_batch_scrape_urls")
1417
-
1418
- scrape_params = {}
1419
-
1420
- # Add individual parameters
1421
- if formats is not None:
1422
- scrape_params['formats'] = formats
1423
- if headers is not None:
1424
- scrape_params['headers'] = headers
1425
- if include_tags is not None:
1426
- scrape_params['includeTags'] = include_tags
1427
- if exclude_tags is not None:
1428
- scrape_params['excludeTags'] = exclude_tags
1429
- if only_main_content is not None:
1430
- scrape_params['onlyMainContent'] = only_main_content
1431
- if wait_for is not None:
1432
- scrape_params['waitFor'] = wait_for
1433
- if timeout is not None:
1434
- scrape_params['timeout'] = timeout
1435
- if location is not None:
1436
- scrape_params['location'] = location.dict(exclude_none=True)
1437
- if mobile is not None:
1438
- scrape_params['mobile'] = mobile
1439
- if skip_tls_verification is not None:
1440
- scrape_params['skipTlsVerification'] = skip_tls_verification
1441
- if remove_base64_images is not None:
1442
- scrape_params['removeBase64Images'] = remove_base64_images
1443
- if block_ads is not None:
1444
- scrape_params['blockAds'] = block_ads
1445
- if proxy is not None:
1446
- scrape_params['proxy'] = proxy
1447
- if extract is not None:
1448
- extract = self._ensure_schema_dict(extract)
1449
- if isinstance(extract, dict) and "schema" in extract:
1450
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
1451
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1452
- if json_options is not None:
1453
- json_options = self._ensure_schema_dict(json_options)
1454
- if isinstance(json_options, dict) and "schema" in json_options:
1455
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1456
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1457
- if actions is not None:
1458
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1459
- if agent is not None:
1460
- scrape_params['agent'] = agent.dict(exclude_none=True)
1461
- if max_concurrency is not None:
1462
- scrape_params['maxConcurrency'] = max_concurrency
1463
-
1464
- # Add any additional kwargs
1465
- scrape_params.update(kwargs)
1466
-
1467
- # Create final params object
1468
- final_params = ScrapeParams(**scrape_params)
1469
- params_dict = final_params.dict(exclude_none=True)
1470
- params_dict['urls'] = urls
1471
- params_dict['origin'] = f"python-sdk@{version}"
1472
-
1473
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1474
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1475
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1476
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1477
-
1478
- # Make request
1479
- headers = self._prepare_headers(idempotency_key)
1480
- response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1481
-
1482
- if response.status_code == 200:
1483
- try:
1484
- return BatchScrapeResponse(**response.json())
1485
- except:
1486
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1487
- else:
1488
- self._handle_error(response, 'start batch scrape job')
1489
-
1490
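
A sketch of the fire-and-poll pattern built from async_batch_scrape_urls above plus check_batch_scrape_status defined below; app is the FirecrawlApp from the previous sketch, and the URLs and 2-second interval are placeholders:

import time

job = app.async_batch_scrape_urls(
    ["https://example.com/a", "https://example.com/b"],  # placeholder URLs
    formats=["markdown"],
)
if not job.success:
    raise RuntimeError(job.error or "batch scrape did not start")

# Poll the returned job id until the API reports completion.
while True:
    status = app.check_batch_scrape_status(job.id)
    if status.status == "completed":
        break
    time.sleep(2)
print(f"{status.completed}/{status.total} pages scraped")
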
- def batch_scrape_urls_and_watch(
1491
- self,
1492
- urls: List[str],
1493
- *,
1494
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1495
- headers: Optional[Dict[str, str]] = None,
1496
- include_tags: Optional[List[str]] = None,
1497
- exclude_tags: Optional[List[str]] = None,
1498
- only_main_content: Optional[bool] = None,
1499
- wait_for: Optional[int] = None,
1500
- timeout: Optional[int] = None,
1501
- location: Optional[LocationConfig] = None,
1502
- mobile: Optional[bool] = None,
1503
- skip_tls_verification: Optional[bool] = None,
1504
- remove_base64_images: Optional[bool] = None,
1505
- block_ads: Optional[bool] = None,
1506
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1507
- extract: Optional[JsonConfig] = None,
1508
- json_options: Optional[JsonConfig] = None,
1509
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1510
- agent: Optional[AgentOptions] = None,
1511
- max_concurrency: Optional[int] = None,
1512
- idempotency_key: Optional[str] = None,
1513
- **kwargs
1514
- ) -> 'CrawlWatcher':
1515
- """
1516
- Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
1517
-
1518
- Args:
1519
- urls (List[str]): URLs to scrape
1520
- formats (Optional[List[Literal]]): Content formats to retrieve
1521
- headers (Optional[Dict[str, str]]): Custom HTTP headers
1522
- include_tags (Optional[List[str]]): HTML tags to include
1523
- exclude_tags (Optional[List[str]]): HTML tags to exclude
1524
- only_main_content (Optional[bool]): Extract main content only
1525
- wait_for (Optional[int]): Wait time in milliseconds
1526
- timeout (Optional[int]): Request timeout in milliseconds
1527
- location (Optional[LocationConfig]): Location configuration
1528
- mobile (Optional[bool]): Use mobile user agent
1529
- skip_tls_verification (Optional[bool]): Skip TLS verification
1530
- remove_base64_images (Optional[bool]): Remove base64 encoded images
1531
- block_ads (Optional[bool]): Block advertisements
1532
- proxy (Optional[Literal]): Proxy type to use
1533
- extract (Optional[JsonConfig]): Content extraction config
1534
- json_options (Optional[JsonConfig]): JSON extraction config
1535
- actions (Optional[List[Union]]): Actions to perform
1536
- agent (Optional[AgentOptions]): Agent configuration
1537
- max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1538
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1539
- **kwargs: Additional parameters to pass to the API
1540
-
1541
- Returns:
1542
- CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
1543
-
1544
- Raises:
1545
- Exception: If batch scrape job fails to start
1546
- """
1547
- # Validate any additional kwargs
1548
- self._validate_kwargs(kwargs, "batch_scrape_urls_and_watch")
1549
-
1550
- scrape_params = {}
1551
-
1552
- # Add individual parameters
1553
- if formats is not None:
1554
- scrape_params['formats'] = formats
1555
- if headers is not None:
1556
- scrape_params['headers'] = headers
1557
- if include_tags is not None:
1558
- scrape_params['includeTags'] = include_tags
1559
- if exclude_tags is not None:
1560
- scrape_params['excludeTags'] = exclude_tags
1561
- if only_main_content is not None:
1562
- scrape_params['onlyMainContent'] = only_main_content
1563
- if wait_for is not None:
1564
- scrape_params['waitFor'] = wait_for
1565
- if timeout is not None:
1566
- scrape_params['timeout'] = timeout
1567
- if location is not None:
1568
- scrape_params['location'] = location.dict(exclude_none=True)
1569
- if mobile is not None:
1570
- scrape_params['mobile'] = mobile
1571
- if skip_tls_verification is not None:
1572
- scrape_params['skipTlsVerification'] = skip_tls_verification
1573
- if remove_base64_images is not None:
1574
- scrape_params['removeBase64Images'] = remove_base64_images
1575
- if block_ads is not None:
1576
- scrape_params['blockAds'] = block_ads
1577
- if proxy is not None:
1578
- scrape_params['proxy'] = proxy
1579
- if extract is not None:
1580
- extract = self._ensure_schema_dict(extract)
1581
- if isinstance(extract, dict) and "schema" in extract:
1582
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
1583
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1584
- if json_options is not None:
1585
- json_options = self._ensure_schema_dict(json_options)
1586
- if isinstance(json_options, dict) and "schema" in json_options:
1587
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1588
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1589
- if actions is not None:
1590
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1591
- if agent is not None:
1592
- scrape_params['agent'] = agent.dict(exclude_none=True)
1593
- if max_concurrency is not None:
1594
- scrape_params['maxConcurrency'] = max_concurrency
1595
-
1596
- # Add any additional kwargs
1597
- scrape_params.update(kwargs)
1598
-
1599
- # Create final params object
1600
- final_params = ScrapeParams(**scrape_params)
1601
- params_dict = final_params.dict(exclude_none=True)
1602
- params_dict['urls'] = urls
1603
- params_dict['origin'] = f"python-sdk@{version}"
1604
-
1605
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1606
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1607
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1608
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1609
-
1610
- # Make request
1611
- headers = self._prepare_headers(idempotency_key)
1612
- response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1613
-
1614
- if response.status_code == 200:
1615
- try:
1616
- crawl_response = BatchScrapeResponse(**response.json())
1617
- if crawl_response.success and crawl_response.id:
1618
- return CrawlWatcher(crawl_response.id, self)
1619
- else:
1620
- raise Exception("Batch scrape job failed to start")
1621
- except:
1622
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1623
- else:
1624
- self._handle_error(response, 'start batch scrape job')
1625
-
1626
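
A sketch of consuming the CrawlWatcher returned by batch_scrape_urls_and_watch; the event names and payload keys come from the CrawlWatcher class defined near the end of this file, and the URL is a placeholder:

import asyncio

def on_document(detail):
    # Each 'document' event carries one scraped page under detail["data"].
    print("document for job", detail["id"])

def on_done(detail):
    print("batch finished with", len(detail["data"]), "documents")

watcher = app.batch_scrape_urls_and_watch(
    ["https://example.com"],  # placeholder URL
    formats=["markdown"],
)
watcher.add_event_listener("document", on_document)
watcher.add_event_listener("done", on_done)
asyncio.run(watcher.connect())  # runs until the WebSocket closes after 'done' or 'error'
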
- def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
1627
- """
1628
- Check the status of a batch scrape job using the Firecrawl API.
1629
-
1630
- Args:
1631
- id (str): The ID of the batch scrape job.
1632
-
1633
- Returns:
1634
- BatchScrapeStatusResponse: The status of the batch scrape job.
1635
-
1636
- Raises:
1637
- Exception: If the status check request fails.
1638
- """
1639
- endpoint = f'/v1/batch/scrape/{id}'
1640
-
1641
- headers = self._prepare_headers()
1642
- response = self._get_request(f'{self.api_url}{endpoint}', headers)
1643
- if response.status_code == 200:
1644
- try:
1645
- status_data = response.json()
1646
- except:
1647
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1648
- if status_data['status'] == 'completed':
1649
- if 'data' in status_data:
1650
- data = status_data['data']
1651
- while 'next' in status_data:
1652
- if len(status_data['data']) == 0:
1653
- break
1654
- next_url = status_data.get('next')
1655
- if not next_url:
1656
- logger.warning("Expected 'next' URL is missing.")
1657
- break
1658
- try:
1659
- status_response = self._get_request(next_url, headers)
1660
- if status_response.status_code != 200:
1661
- logger.error(f"Failed to fetch next page: {status_response.status_code}")
1662
- break
1663
- try:
1664
- next_data = status_response.json()
1665
- except:
1666
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1667
- data.extend(next_data.get('data', []))
1668
- status_data = next_data
1669
- except Exception as e:
1670
- logger.error(f"Error during pagination request: {e}")
1671
- break
1672
- status_data['data'] = data
1673
-
1674
- return BatchScrapeStatusResponse(**{
1675
- 'success': False if 'error' in status_data else True,
1676
- 'status': status_data.get('status'),
1677
- 'total': status_data.get('total'),
1678
- 'completed': status_data.get('completed'),
1679
- 'creditsUsed': status_data.get('creditsUsed'),
1680
- 'expiresAt': status_data.get('expiresAt'),
1681
- 'data': status_data.get('data'),
1682
- 'next': status_data.get('next'),
1683
- 'error': status_data.get('error')
1684
- })
1685
- else:
1686
- self._handle_error(response, 'check batch scrape status')
1687
-
1688
- def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
1689
- """
1690
- Returns information about batch scrape errors.
1691
-
1692
- Args:
1693
- id (str): The ID of the crawl job.
1694
-
1695
- Returns:
1696
- CrawlErrorsResponse containing:
1697
- * errors (List[Dict[str, str]]): List of errors with fields:
1698
- * id (str): Error ID
1699
- * timestamp (str): When the error occurred
1700
- * url (str): URL that caused the error
1701
- * error (str): Error message
1702
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1703
-
1704
- Raises:
1705
- Exception: If the error check request fails
1706
- """
1707
- headers = self._prepare_headers()
1708
- response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
1709
- if response.status_code == 200:
1710
- try:
1711
- return CrawlErrorsResponse(**response.json())
1712
- except:
1713
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1714
- else:
1715
- self._handle_error(response, "check batch scrape errors")
1716
-
1717
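
A sketch combining the two inspection helpers above for a previously started job; the job id is a placeholder and the field names follow the docstrings:

job_id = "00000000-0000-0000-0000-000000000000"  # placeholder batch scrape id

status = app.check_batch_scrape_status(job_id)
print(status.status, f"{status.completed}/{status.total}")

# Surface per-URL failures and robots.txt blocks, if any.
errors = app.check_batch_scrape_errors(job_id)
for err in errors.errors:
    print(err)  # each entry has id, timestamp, url and error per the docstring
print("blocked by robots.txt:", errors.robotsBlocked)
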
- def extract(
1718
- self,
1719
- urls: Optional[List[str]] = None,
1720
- *,
1721
- prompt: Optional[str] = None,
1722
- schema: Optional[Any] = None,
1723
- system_prompt: Optional[str] = None,
1724
- allow_external_links: Optional[bool] = False,
1725
- enable_web_search: Optional[bool] = False,
1726
- show_sources: Optional[bool] = False,
1727
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1728
- """
1729
- Extract structured information from URLs.
1730
-
1731
- Args:
1732
- urls (Optional[List[str]]): URLs to extract from
1733
- prompt (Optional[str]): Custom extraction prompt
1734
- schema (Optional[Any]): JSON schema/Pydantic model
1735
- system_prompt (Optional[str]): System context
1736
- allow_external_links (Optional[bool]): Follow external links
1737
- enable_web_search (Optional[bool]): Enable web search
1738
- show_sources (Optional[bool]): Include source URLs
1739
- agent (Optional[Dict[str, Any]]): Agent configuration
1740
-
1741
- Returns:
1742
- ExtractResponse[Any] with:
1743
- * success (bool): Whether request succeeded
1744
- * data (Optional[Any]): Extracted data matching schema
1745
- * error (Optional[str]): Error message if any
1746
-
1747
- Raises:
1748
- ValueError: If prompt/schema missing or extraction fails
1749
- """
1750
- headers = self._prepare_headers()
1751
-
1752
- if not prompt and not schema:
1753
- raise ValueError("Either prompt or schema is required")
1754
-
1755
- if not urls and not prompt:
1756
- raise ValueError("Either urls or prompt is required")
1757
-
1758
- if schema:
1759
- schema = self._ensure_schema_dict(schema)
1760
-
1761
- request_data = {
1762
- 'urls': urls or [],
1763
- 'allowExternalLinks': allow_external_links,
1764
- 'enableWebSearch': enable_web_search,
1765
- 'showSources': show_sources,
1766
- 'schema': schema,
1767
- 'origin': f'python-sdk@{get_version()}'
1768
- }
1769
-
1770
- # Only add prompt and systemPrompt if they exist
1771
- if prompt:
1772
- request_data['prompt'] = prompt
1773
- if system_prompt:
1774
- request_data['systemPrompt'] = system_prompt
1775
-
1776
- if agent:
1777
- request_data['agent'] = agent
1778
-
1779
- try:
1780
- # Send the initial extract request
1781
- response = self._post_request(
1782
- f'{self.api_url}/v1/extract',
1783
- request_data,
1784
- headers
1785
- )
1786
- if response.status_code == 200:
1787
- try:
1788
- data = response.json()
1789
- except:
1790
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1791
- if data['success']:
1792
- job_id = data.get('id')
1793
- if not job_id:
1794
- raise Exception('Job ID not returned from extract request.')
1795
-
1796
- # Poll for the extract status
1797
- while True:
1798
- status_response = self._get_request(
1799
- f'{self.api_url}/v1/extract/{job_id}',
1800
- headers
1801
- )
1802
- if status_response.status_code == 200:
1803
- try:
1804
- status_data = status_response.json()
1805
- except:
1806
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1807
- if status_data['status'] == 'completed':
1808
- return ExtractResponse(**status_data)
1809
- elif status_data['status'] in ['failed', 'cancelled']:
1810
- raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
1811
- else:
1812
- self._handle_error(status_response, "extract-status")
1813
-
1814
- time.sleep(2) # Polling interval
1815
- else:
1816
- raise Exception(f'Failed to extract. Error: {data["error"]}')
1817
- else:
1818
- self._handle_error(response, "extract")
1819
- except Exception as e:
1820
- raise ValueError(str(e), 500)
1821
-
1822
- return ExtractResponse(success=False, error="Internal server error.")
1823
-
1824
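
A sketch of extract with a Pydantic model as the schema; _ensure_schema_dict (defined further down) converts the model class into a plain JSON schema dict before the request is sent. The model and the URL are illustrative only:

from pydantic import BaseModel

class ArticleSummary(BaseModel):
    title: str
    summary: str

result = app.extract(
    ["https://example.com/blog/post"],  # placeholder URL
    prompt="Summarise the article",
    schema=ArticleSummary,  # the class is converted to a JSON schema dict
)
if result.success:
    print(result.data)  # shaped according to ArticleSummary's JSON schema
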
- def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
1825
- """
1826
- Retrieve the status of an extract job.
1827
-
1828
- Args:
1829
- job_id (str): The ID of the extract job.
1830
-
1831
- Returns:
1832
- ExtractResponse[Any]: The status of the extract job.
1833
-
1834
- Raises:
1835
- ValueError: If there is an error retrieving the status.
1836
- """
1837
- headers = self._prepare_headers()
1838
- try:
1839
- response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
1840
- if response.status_code == 200:
1841
- try:
1842
- return ExtractResponse(**response.json())
1843
- except:
1844
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1845
- else:
1846
- self._handle_error(response, "get extract status")
1847
- except Exception as e:
1848
- raise ValueError(str(e), 500)
1849
-
1850
- def async_extract(
1851
- self,
1852
- urls: Optional[List[str]] = None,
1853
- *,
1854
- prompt: Optional[str] = None,
1855
- schema: Optional[Any] = None,
1856
- system_prompt: Optional[str] = None,
1857
- allow_external_links: Optional[bool] = False,
1858
- enable_web_search: Optional[bool] = False,
1859
- show_sources: Optional[bool] = False,
1860
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1861
- """
1862
- Initiate an asynchronous extract job.
1863
-
1864
- Args:
1865
- urls (List[str]): URLs to extract information from
1866
- prompt (Optional[str]): Custom extraction prompt
1867
- schema (Optional[Any]): JSON schema/Pydantic model
1868
- system_prompt (Optional[str]): System context
1869
- allow_external_links (Optional[bool]): Follow external links
1870
- enable_web_search (Optional[bool]): Enable web search
1871
- show_sources (Optional[bool]): Include source URLs
1872
- agent (Optional[Dict[str, Any]]): Agent configuration
1873

-
1875
- Returns:
1876
- ExtractResponse[Any] with:
1877
- * success (bool): Whether request succeeded
1878
- * data (Optional[Any]): Extracted data matching schema
1879
- * error (Optional[str]): Error message if any
1880
-
1881
- Raises:
1882
- ValueError: If job initiation fails
1883
- """
1884
- headers = self._prepare_headers()
1885
-
1886
- if schema:
1888
- schema = self._ensure_schema_dict(schema)
1889
-
1890
- request_data = {
1891
- 'urls': urls,
1892
- 'allowExternalLinks': allow_external_links,
1893
- 'enableWebSearch': enable_web_search,
1894
- 'showSources': show_sources,
1895
- 'schema': schema,
1896
- 'origin': f'python-sdk@{version}'
1897
- }
1898
-
1899
- if prompt:
1900
- request_data['prompt'] = prompt
1901
- if system_prompt:
1902
- request_data['systemPrompt'] = system_prompt
1903
- if agent:
1904
- request_data['agent'] = agent
1905
-
1906
- try:
1907
- response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
1908
- if response.status_code == 200:
1909
- try:
1910
- return ExtractResponse(**response.json())
1911
- except:
1912
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1913
- else:
1914
- self._handle_error(response, "async extract")
1915
- except Exception as e:
1916
- raise ValueError(str(e), 500)
1917
-
1918
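
A sketch of the non-blocking variant: start a job with async_extract, then read it back later with get_extract_status. The id and status fields on ExtractResponse are assumed here; the blocking extract method above polls the same job id against /v1/extract/{job_id}:

started = app.async_extract(
    ["https://example.com/pricing"],  # placeholder URL
    prompt="List the plan names and monthly prices",
)
if started.success:
    # Later, possibly from another process ('id' field assumed on ExtractResponse):
    current = app.get_extract_status(started.id)
    print(current.status, current.data)  # 'status' field likewise assumed
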
- def generate_llms_text(
1919
- self,
1920
- url: str,
1921
- *,
1922
- max_urls: Optional[int] = None,
1923
- show_full_text: Optional[bool] = None,
1924
- cache: Optional[bool] = None,
1925
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
1926
- """
1927
- Generate LLMs.txt for a given URL and poll until completion.
1928
-
1929
- Args:
1930
- url (str): Target URL to generate LLMs.txt from
1931
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
1932
- show_full_text (Optional[bool]): Include full text in output (default: False)
1933
- cache (Optional[bool]): Whether to use cached content if available (default: True)
1934
- experimental_stream (Optional[bool]): Enable experimental streaming
1935
-
1936
- Returns:
1937
- GenerateLLMsTextStatusResponse with:
1938
- * Generated LLMs.txt content
1939
- * Full version if requested
1940
- * Generation status
1941
- * Success/error information
1942
-
1943
- Raises:
1944
- Exception: If generation fails
1945
- """
1946
- params = GenerateLLMsTextParams(
1947
- maxUrls=max_urls,
1948
- showFullText=show_full_text,
1949
- cache=cache,
1950
- __experimental_stream=experimental_stream
1951
- )
1952
-
1953
- response = self.async_generate_llms_text(
1954
- url,
1955
- max_urls=max_urls,
1956
- show_full_text=show_full_text,
1957
- cache=cache,
1958
- experimental_stream=experimental_stream
1959
- )
1960
-
1961
- if not response.success or not response.id:
1962
- return GenerateLLMsTextStatusResponse(
1963
- success=False,
1964
- error='Failed to start LLMs.txt generation',
1965
- status='failed',
1966
- expiresAt=''
1967
- )
1968
-
1969
- job_id = response.id
1970
- while True:
1971
- status = self.check_generate_llms_text_status(job_id)
1972
-
1973
- if status.status == 'completed':
1974
- return status
1975
- elif status.status == 'failed':
1976
- return status
1977
- elif status.status != 'processing':
1978
- return GenerateLLMsTextStatusResponse(
1979
- success=False,
1980
- error='LLMs.txt generation job terminated unexpectedly',
1981
- status='failed',
1982
- expiresAt=''
1983
- )
1984
-
1985
- time.sleep(2) # Polling interval
1986
-
1987
- def async_generate_llms_text(
1988
- self,
1989
- url: str,
1990
- *,
1991
- max_urls: Optional[int] = None,
1992
- show_full_text: Optional[bool] = None,
1993
- cache: Optional[bool] = None,
1994
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
1995
- """
1996
- Initiate an asynchronous LLMs.txt generation operation.
1997
-
1998
- Args:
1999
- url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
2000
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
2001
- show_full_text (Optional[bool]): Include full text in output (default: False)
2002
- cache (Optional[bool]): Whether to use cached content if available (default: True)
2003
- experimental_stream (Optional[bool]): Enable experimental streaming
2004
-
2005
- Returns:
2006
- GenerateLLMsTextResponse: A response containing:
2007
- * success (bool): Whether the generation initiation was successful
2008
- * id (str): The unique identifier for the generation job
2009
- * error (str, optional): Error message if initiation failed
2010
-
2011
- Raises:
2012
- Exception: If the generation job initiation fails.
2013
- """
2014
- params = GenerateLLMsTextParams(
2015
- maxUrls=max_urls,
2016
- showFullText=show_full_text,
2017
- cache=cache,
2018
- __experimental_stream=experimental_stream
2019
- )
2020
-
2021
- headers = self._prepare_headers()
2022
- json_data = {'url': url, **params.dict(exclude_none=True)}
2023
- json_data['origin'] = f"python-sdk@{version}"
2024
-
2025
- try:
2026
- req = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
2027
- response = req.json()
2028
- print("json_data", json_data)
2029
- print("response", response)
2030
- if response.get('success'):
2031
- try:
2032
- return GenerateLLMsTextResponse(**response)
2033
- except:
2034
- raise Exception('Failed to parse Firecrawl response as JSON.')
2035
- else:
2036
- self._handle_error(response, 'start LLMs.txt generation')
2037
- except Exception as e:
2038
- raise ValueError(str(e))
2039
-
2040
- return GenerateLLMsTextResponse(
2041
- success=False,
2042
- error='Internal server error'
2043
- )
2044
-
2045
- def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
2046
- """
2047
- Check the status of a LLMs.txt generation operation.
2048
-
2049
- Args:
2050
- id (str): The unique identifier of the LLMs.txt generation job to check status for.
2051
-
2052
- Returns:
2053
- GenerateLLMsTextStatusResponse: A response containing:
2054
- * success (bool): Whether the generation was successful
2055
- * status (str): Status of generation ("processing", "completed", "failed")
2056
- * data (Dict[str, str], optional): Generated text with fields:
2057
- * llmstxt (str): Generated LLMs.txt content
2058
- * llmsfulltxt (str, optional): Full version if requested
2059
- * error (str, optional): Error message if generation failed
2060
- * expiresAt (str): When the generated data expires
2061
-
2062
- Raises:
2063
- Exception: If the status check fails.
2064
- """
2065
- headers = self._prepare_headers()
2066
- try:
2067
- response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers)
2068
- if response.status_code == 200:
2069
- try:
2070
- json_data = response.json()
2071
- return GenerateLLMsTextStatusResponse(**json_data)
2072
- except Exception as e:
2073
- raise Exception(f'Failed to parse Firecrawl response as GenerateLLMsTextStatusResponse: {str(e)}')
2074
- elif response.status_code == 404:
2075
- raise Exception('LLMs.txt generation job not found')
2076
- else:
2077
- self._handle_error(response, 'check LLMs.txt generation status')
2078
- except Exception as e:
2079
- raise ValueError(str(e))
2080
-
2081
- return GenerateLLMsTextStatusResponse(success=False, error='Internal server error', status='failed', expiresAt='')
2082
-
2083
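
A sketch of the blocking LLMs.txt helper above; the URL is a placeholder, and the data keys follow the check_generate_llms_text_status docstring, which describes data as a dict with llmstxt and, when requested, llmsfulltxt:

llms = app.generate_llms_text(
    "https://example.com",  # placeholder URL
    max_urls=5,
    show_full_text=True,
)
if llms.success and llms.data:
    print(llms.data.get("llmstxt", "")[:200])      # generated LLMs.txt content
    print("full text included:", "llmsfulltxt" in llms.data)
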
- def _prepare_headers(
2084
- self,
2085
- idempotency_key: Optional[str] = None) -> Dict[str, str]:
2086
- """
2087
- Prepare the headers for API requests.
2088
-
2089
- Args:
2090
- idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
2091
-
2092
- Returns:
2093
- Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
2094
- """
2095
- if idempotency_key:
2096
- return {
2097
- 'Content-Type': 'application/json',
2098
- 'Authorization': f'Bearer {self.api_key}',
2099
- 'x-idempotency-key': idempotency_key
2100
- }
2101
-
2102
- return {
2103
- 'Content-Type': 'application/json',
2104
- 'Authorization': f'Bearer {self.api_key}',
2105
- }
2106
-
2107
- def _post_request(
2108
- self,
2109
- url: str,
2110
- data: Dict[str, Any],
2111
- headers: Dict[str, str],
2112
- retries: int = 3,
2113
- backoff_factor: float = 0.5) -> requests.Response:
2114
- """
2115
- Make a POST request with retries.
2116
-
2117
- Args:
2118
- url (str): The URL to send the POST request to.
2119
- data (Dict[str, Any]): The JSON data to include in the POST request.
2120
- headers (Dict[str, str]): The headers to include in the POST request.
2121
- retries (int): Number of retries for the request.
2122
- backoff_factor (float): Backoff factor for retries.
2123
-
2124
- Returns:
2125
- requests.Response: The response from the POST request.
2126
-
2127
- Raises:
2128
- requests.RequestException: If the request fails after the specified retries.
2129
- """
2130
- for attempt in range(retries):
2131
- response = requests.post(url, headers=headers, json=data, timeout=((data["timeout"] + 5000) if "timeout" in data else None))
2132
- if response.status_code == 502:
2133
- time.sleep(backoff_factor * (2 ** attempt))
2134
- else:
2135
- return response
2136
- return response
2137
-
2138
- def _get_request(
2139
- self,
2140
- url: str,
2141
- headers: Dict[str, str],
2142
- retries: int = 3,
2143
- backoff_factor: float = 0.5) -> requests.Response:
2144
- """
2145
- Make a GET request with retries.
2146
-
2147
- Args:
2148
- url (str): The URL to send the GET request to.
2149
- headers (Dict[str, str]): The headers to include in the GET request.
2150
- retries (int): Number of retries for the request.
2151
- backoff_factor (float): Backoff factor for retries.
2152
-
2153
- Returns:
2154
- requests.Response: The response from the GET request.
2155
-
2156
- Raises:
2157
- requests.RequestException: If the request fails after the specified retries.
2158
- """
2159
- for attempt in range(retries):
2160
- response = requests.get(url, headers=headers)
2161
- if response.status_code == 502:
2162
- time.sleep(backoff_factor * (2 ** attempt))
2163
- else:
2164
- return response
2165
- return response
2166
-
2167
- def _delete_request(
2168
- self,
2169
- url: str,
2170
- headers: Dict[str, str],
2171
- retries: int = 3,
2172
- backoff_factor: float = 0.5) -> requests.Response:
2173
- """
2174
- Make a DELETE request with retries.
2175
-
2176
- Args:
2177
- url (str): The URL to send the DELETE request to.
2178
- headers (Dict[str, str]): The headers to include in the DELETE request.
2179
- retries (int): Number of retries for the request.
2180
- backoff_factor (float): Backoff factor for retries.
2181
-
2182
- Returns:
2183
- requests.Response: The response from the DELETE request.
2184
-
2185
- Raises:
2186
- requests.RequestException: If the request fails after the specified retries.
2187
- """
2188
- for attempt in range(retries):
2189
- response = requests.delete(url, headers=headers)
2190
- if response.status_code == 502:
2191
- time.sleep(backoff_factor * (2 ** attempt))
2192
- else:
2193
- return response
2194
- return response
2195
-
2196
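
The three request helpers above share one retry rule: only HTTP 502 responses are retried, sleeping backoff_factor * 2 ** attempt seconds before each retry, and the final response is returned as-is if every attempt hits a 502. A standalone sketch of that schedule:

def backoff_schedule(retries=3, backoff_factor=0.5):
    # Mirrors the sleep used by _post_request, _get_request and _delete_request on HTTP 502.
    return [backoff_factor * (2 ** attempt) for attempt in range(retries)]

print(backoff_schedule())  # [0.5, 1.0, 2.0]
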
- def _monitor_job_status(
2197
- self,
2198
- id: str,
2199
- headers: Dict[str, str],
2200
- poll_interval: int) -> CrawlStatusResponse:
2201
- """
2202
- Monitor the status of a crawl job until completion.
2203
-
2204
- Args:
2205
- id (str): The ID of the crawl job.
2206
- headers (Dict[str, str]): The headers to include in the status check requests.
2207
- poll_interval (int): Seconds between status checks.
2208
-
2209
- Returns:
2210
- CrawlStatusResponse: The crawl results if the job is completed successfully.
2211
-
2212
- Raises:
2213
- Exception: If the job fails or an error occurs during status checks.
2214
- """
2215
- while True:
2216
- api_url = f'{self.api_url}/v1/crawl/{id}'
2217
-
2218
- status_response = self._get_request(api_url, headers)
2219
- if status_response.status_code == 200:
2220
- try:
2221
- status_data = status_response.json()
2222
- except:
2223
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
2224
- if status_data['status'] == 'completed':
2225
- if 'data' in status_data:
2226
- data = status_data['data']
2227
- while 'next' in status_data:
2228
- if len(status_data['data']) == 0:
2229
- break
2230
- status_response = self._get_request(status_data['next'], headers)
2231
- try:
2232
- status_data = status_response.json()
2233
- except:
2234
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
2235
- data.extend(status_data.get('data', []))
2236
- status_data['data'] = data
2237
- return CrawlStatusResponse(**status_data)
2238
- else:
2239
- raise Exception('Crawl job completed but no data was returned')
2240
- elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
2241
- poll_interval=max(poll_interval,2)
2242
- time.sleep(poll_interval) # Wait for the specified interval before checking again
2243
- else:
2244
- raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
2245
- else:
2246
- self._handle_error(status_response, 'check crawl status')
2247
-
2248
- def _handle_error(
2249
- self,
2250
- response: requests.Response,
2251
- action: str) -> None:
2252
- """
2253
- Handle errors from API responses.
2254
-
2255
- Args:
2256
- response (requests.Response): The response object from the API request.
2257
- action (str): Description of the action that was being performed.
2258
-
2259
- Raises:
2260
- Exception: An exception with a message containing the status code and error details from the response.
2261
- """
2262
- try:
2263
- error_message = response.json().get('error', 'No error message provided.')
2264
- error_details = response.json().get('details', 'No additional error details provided.')
2265
- except:
2266
- raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
2267
-
2268
- message = self._get_error_message(response.status_code, action, error_message, error_details)
2269
-
2270
- # Raise an HTTPError with the custom message and attach the response
2271
- raise requests.exceptions.HTTPError(message, response=response)
2272
-
2273
- def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2274
- """
2275
- Generate a standardized error message based on HTTP status code.
2276
-
2277
- Args:
2278
- status_code (int): The HTTP status code from the response
2279
- action (str): Description of the action that was being performed
2280
- error_message (str): The error message from the API response
2281
- error_details (str): Additional error details from the API response
2282
-
2283
- Returns:
2284
- str: A formatted error message
2285
- """
2286
- if status_code == 402:
2287
- return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
2288
- elif status_code == 403:
2289
- message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
2290
- elif status_code == 408:
2291
- return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
2292
- elif status_code == 409:
2293
- return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
2294
- elif status_code == 500:
2295
- return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
2296
- else:
2297
- return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
2298
-
2299
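
Because _handle_error raises requests.exceptions.HTTPError with the formatted message and the original response attached, callers can branch on the status code. A sketch (the URL is a placeholder; scrape_url is the regular single-page method of this class):

import requests

try:
    app.scrape_url("https://example.com/paywalled")  # placeholder URL
except requests.exceptions.HTTPError as exc:
    # _handle_error attaches the offending response object.
    if exc.response is not None and exc.response.status_code == 402:
        print("Out of credits:", exc)
    else:
        raise
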
- def deep_research(
2300
- self,
2301
- query: str,
2302
- *,
2303
- max_depth: Optional[int] = None,
2304
- time_limit: Optional[int] = None,
2305
- max_urls: Optional[int] = None,
2306
- analysis_prompt: Optional[str] = None,
2307
- system_prompt: Optional[str] = None,
2308
- __experimental_stream_steps: Optional[bool] = None,
2309
- on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
2310
- on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
2311
- """
2312
- Initiates a deep research operation on a given query and polls until completion.
2313
-
2314
- Args:
2315
- query (str): Research query or topic to investigate
2316
- max_depth (Optional[int]): Maximum depth of research exploration
2317
- time_limit (Optional[int]): Time limit in seconds for research
2318
- max_urls (Optional[int]): Maximum number of URLs to process
2319
- analysis_prompt (Optional[str]): Custom prompt for analysis
2320
- system_prompt (Optional[str]): Custom system prompt
2321
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2322
- on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
2323
- on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
2324
-
2325
- Returns:
2326
- DeepResearchStatusResponse containing:
2327
- * success (bool): Whether research completed successfully
2328
- * status (str): Current state (processing/completed/failed)
2329
- * error (Optional[str]): Error message if failed
2330
- * id (str): Unique identifier for the research job
2331
- * data (Any): Research findings and analysis
2332
- * sources (List[Dict]): List of discovered sources
2333
- * activities (List[Dict]): Research progress log
2334
- * summaries (List[str]): Generated research summaries
2335
-
2336
- Raises:
2337
- Exception: If research fails
2338
- """
2339
- research_params = {}
2340
- if max_depth is not None:
2341
- research_params['maxDepth'] = max_depth
2342
- if time_limit is not None:
2343
- research_params['timeLimit'] = time_limit
2344
- if max_urls is not None:
2345
- research_params['maxUrls'] = max_urls
2346
- if analysis_prompt is not None:
2347
- research_params['analysisPrompt'] = analysis_prompt
2348
- if system_prompt is not None:
2349
- research_params['systemPrompt'] = system_prompt
2350
- if __experimental_stream_steps is not None:
2351
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
2352
- research_params = DeepResearchParams(**research_params)
2353
-
2354
- response = self.async_deep_research(
2355
- query,
2356
- max_depth=max_depth,
2357
- time_limit=time_limit,
2358
- max_urls=max_urls,
2359
- analysis_prompt=analysis_prompt,
2360
- system_prompt=system_prompt
2361
- )
2362
- if not response.get('success') or 'id' not in response:
2363
- return response
2364
-
2365
- job_id = response['id']
2366
- last_activity_count = 0
2367
- last_source_count = 0
2368
-
2369
- while True:
2370
- status = self.check_deep_research_status(job_id)
2371
-
2372
- if on_activity and 'activities' in status:
2373
- new_activities = status['activities'][last_activity_count:]
2374
- for activity in new_activities:
2375
- on_activity(activity)
2376
- last_activity_count = len(status['activities'])
2377
-
2378
- if on_source and 'sources' in status:
2379
- new_sources = status['sources'][last_source_count:]
2380
- for source in new_sources:
2381
- on_source(source)
2382
- last_source_count = len(status['sources'])
2383
-
2384
- if status['status'] == 'completed':
2385
- return status
2386
- elif status['status'] == 'failed':
2387
- raise Exception(f'Deep research failed. Error: {status.get("error")}')
2388
- elif status['status'] != 'processing':
2389
- break
2390
-
2391
- time.sleep(2) # Polling interval
2392
-
2393
- return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
2394
-
2395
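
A sketch of the blocking deep_research call with both progress callbacks; the method handles the status payload as a plain dict, so the result is read with dict access as well. The query and limits are placeholders:

def log_activity(activity):
    print(f"[depth {activity.get('depth')}] {activity.get('message')}")

def log_source(source):
    print("source:", source.get("url"))

research = app.deep_research(
    "What are current best practices for LLM observability?",  # placeholder query
    max_depth=3,
    time_limit=120,
    on_activity=log_activity,
    on_source=log_source,
)
if research.get("status") == "completed":
    print(research.get("data"))
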
- def async_deep_research(
2396
- self,
2397
- query: str,
2398
- *,
2399
- max_depth: Optional[int] = None,
2400
- time_limit: Optional[int] = None,
2401
- max_urls: Optional[int] = None,
2402
- analysis_prompt: Optional[str] = None,
2403
- system_prompt: Optional[str] = None,
2404
- __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
2405
- """
2406
- Initiates an asynchronous deep research operation.
2407
-
2408
- Args:
2409
- query (str): Research query or topic to investigate
2410
- max_depth (Optional[int]): Maximum depth of research exploration
2411
- time_limit (Optional[int]): Time limit in seconds for research
2412
- max_urls (Optional[int]): Maximum number of URLs to process
2413
- analysis_prompt (Optional[str]): Custom prompt for analysis
2414
- system_prompt (Optional[str]): Custom system prompt
2415
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2416
-
2417
- Returns:
2418
- Dict[str, Any]: A response containing:
2419
- * success (bool): Whether the research initiation was successful
2420
- * id (str): The unique identifier for the research job
2421
- * error (str, optional): Error message if initiation failed
2422
-
2423
- Raises:
2424
- Exception: If the research initiation fails.
2425
- """
2426
- research_params = {}
2427
- if max_depth is not None:
2428
- research_params['maxDepth'] = max_depth
2429
- if time_limit is not None:
2430
- research_params['timeLimit'] = time_limit
2431
- if max_urls is not None:
2432
- research_params['maxUrls'] = max_urls
2433
- if analysis_prompt is not None:
2434
- research_params['analysisPrompt'] = analysis_prompt
2435
- if system_prompt is not None:
2436
- research_params['systemPrompt'] = system_prompt
2437
- if __experimental_stream_steps is not None:
2438
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
2439
- research_params = DeepResearchParams(**research_params)
2440
-
2441
- headers = self._prepare_headers()
2442
-
2443
- json_data = {'query': query, **research_params.dict(exclude_none=True)}
2444
- json_data['origin'] = f"python-sdk@{version}"
2445
-
2446
- # Handle json options schema if present
2447
- if 'jsonOptions' in json_data:
2448
- json_opts = json_data['jsonOptions']
2449
- if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
2450
- json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
2451
-
2452
- try:
2453
- response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
2454
- if response.status_code == 200:
2455
- try:
2456
- return response.json()
2457
- except:
2458
- raise Exception('Failed to parse Firecrawl response as JSON.')
2459
- else:
2460
- self._handle_error(response, 'start deep research')
2461
- except Exception as e:
2462
- raise ValueError(str(e))
2463
-
2464
- return {'success': False, 'error': 'Internal server error'}
2465
-
2466
- def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
2467
- """
2468
- Check the status of a deep research operation.
2469
-
2470
- Args:
2471
- id (str): The ID of the deep research operation.
2472
-
2473
- Returns:
2474
- DeepResearchStatusResponse containing:
2475
-
2476
- Status:
2477
- * success - Whether research completed successfully
2478
- * status - Current state (processing/completed/failed)
2479
- * error - Error message if failed
2480
-
2481
- Results:
2482
- * id - Unique identifier for the research job
2483
- * data - Research findings and analysis
2484
- * sources - List of discovered sources
2485
- * activities - Research progress log
2486
- * summaries - Generated research summaries
2487
-
2488
- Raises:
2489
- Exception: If the status check fails.
2490
- """
2491
- headers = self._prepare_headers()
2492
- try:
2493
- response = self._get_request(f'{self.api_url}/v1/deep-research/{id}', headers)
2494
- if response.status_code == 200:
2495
- try:
2496
- return response.json()
2497
- except:
2498
- raise Exception('Failed to parse Firecrawl response as JSON.')
2499
- elif response.status_code == 404:
2500
- raise Exception('Deep research job not found')
2501
- else:
2502
- self._handle_error(response, 'check deep research status')
2503
- except Exception as e:
2504
- raise ValueError(str(e))
2505
-
2506
- return {'success': False, 'error': 'Internal server error'}
2507
-
2508
- def _validate_kwargs(self, kwargs: Dict[str, Any], method_name: str) -> None:
2509
- """
2510
- Validate additional keyword arguments before they are passed to the API.
2511
- This provides early validation before the Pydantic model validation.
2512
-
2513
- Args:
2514
- kwargs (Dict[str, Any]): Additional keyword arguments to validate
2515
- method_name (str): Name of the method these kwargs are for
2516
-
2517
- Raises:
2518
- ValueError: If kwargs contain invalid or unsupported parameters
2519
- """
2520
- if not kwargs:
2521
- return
2522
-
2523
- # Known parameter mappings for each method
2524
- method_params = {
2525
- "scrape_url": {"formats", "include_tags", "exclude_tags", "only_main_content", "wait_for",
2526
- "timeout", "location", "mobile", "skip_tls_verification", "remove_base64_images",
2527
- "block_ads", "proxy", "extract", "json_options", "actions", "change_tracking_options"},
2528
- "search": {"limit", "tbs", "filter", "lang", "country", "location", "timeout", "scrape_options"},
2529
- "crawl_url": {"include_paths", "exclude_paths", "max_depth", "max_discovery_depth", "limit",
2530
- "allow_backward_links", "allow_external_links", "ignore_sitemap", "scrape_options",
2531
- "webhook", "deduplicate_similar_urls", "ignore_query_parameters", "regex_on_full_url"},
2532
- "map_url": {"search", "ignore_sitemap", "include_subdomains", "sitemap_only", "limit", "timeout"},
2533
- "batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2534
- "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2535
- "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2536
- "actions", "agent", "webhook"},
2537
- "async_batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2538
- "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2539
- "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2540
- "actions", "agent", "webhook"},
2541
- "batch_scrape_urls_and_watch": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2542
- "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2543
- "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2544
- "actions", "agent", "webhook"}
2545
- }
2546
-
2547
- # Get allowed parameters for this method
2548
- allowed_params = method_params.get(method_name, set())
2549
-
2550
- # Check for unknown parameters
2551
- unknown_params = set(kwargs.keys()) - allowed_params
2552
- if unknown_params:
2553
- raise ValueError(f"Unsupported parameter(s) for {method_name}: {', '.join(unknown_params)}. Please refer to the API documentation for the correct parameters.")
2554
-
2555
- # Additional type validation can be added here if needed
2556
- # For now, we rely on Pydantic models for detailed type validation
2557
-
2558
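
The allow-list above makes unexpected keyword arguments fail fast with a ValueError instead of being silently dropped or rejected server-side, e.g.:

try:
    # 'formatz' is a deliberate typo; it is not in the allow-list for this method.
    app.batch_scrape_urls(["https://example.com"], formatz=["markdown"])
except ValueError as exc:
    print(exc)  # Unsupported parameter(s) for batch_scrape_urls: formatz. ...
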
- def _ensure_schema_dict(self, schema):
2559
- """
2560
- Utility to ensure a schema is a dict, not a Pydantic model class. Recursively checks dicts and lists.
2561
- """
2562
- if schema is None:
2563
- return schema
2564
- if isinstance(schema, type):
2565
- # Pydantic v1/v2 model class
2566
- if hasattr(schema, 'model_json_schema'):
2567
- return schema.model_json_schema()
2568
- elif hasattr(schema, 'schema'):
2569
- return schema.schema()
2570
- if isinstance(schema, dict):
2571
- return {k: self._ensure_schema_dict(v) for k, v in schema.items()}
2572
- if isinstance(schema, (list, tuple)):
2573
- return [self._ensure_schema_dict(v) for v in schema]
2574
- return schema
2575
-
2576
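
A quick illustration of what _ensure_schema_dict normalises: Pydantic model classes (v1 or v2) anywhere inside a nested structure are replaced by their JSON schema dicts, while plain values pass through unchanged. The private helper is called directly here purely for illustration:

from pydantic import BaseModel

class Product(BaseModel):
    name: str
    price: float

normalised = app._ensure_schema_dict({"schema": Product, "limit": 5})
print(normalised["limit"])                 # 5, untouched
print(normalised["schema"]["properties"])  # JSON schema generated from the model
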
- class CrawlWatcher:
2577
- """
2578
- A class to watch and handle crawl job events via WebSocket connection.
2579
-
2580
- Attributes:
2581
- id (str): The ID of the crawl job to watch
2582
- app (FirecrawlApp): The FirecrawlApp instance
2583
- data (List[Dict[str, Any]]): List of crawled documents/data
2584
- status (str): Current status of the crawl job
2585
- ws_url (str): WebSocket URL for the crawl job
2586
- event_handlers (dict): Dictionary of event type to list of handler functions
2587
- """
2588
- def __init__(self, id: str, app: FirecrawlApp):
2589
- self.id = id
2590
- self.app = app
2591
- self.data: List[Dict[str, Any]] = []
2592
- self.status = "scraping"
2593
- self.ws_url = f"{app.api_url.replace('http', 'ws')}/v1/crawl/{id}"
2594
- self.event_handlers = {
2595
- 'done': [],
2596
- 'error': [],
2597
- 'document': []
2598
- }
2599
-
2600
- async def connect(self) -> None:
2601
- """
2602
- Establishes WebSocket connection and starts listening for messages.
2603
- """
2604
- async with websockets.connect(
2605
- self.ws_url,
2606
- max_size=None,
2607
- additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
2608
- ) as websocket:
2609
- await self._listen(websocket)
2610
-
2611
- async def _listen(self, websocket) -> None:
2612
- """
2613
- Listens for incoming WebSocket messages and handles them.
2614
-
2615
- Args:
2616
- websocket: The WebSocket connection object
2617
- """
2618
- async for message in websocket:
2619
- msg = json.loads(message)
2620
- await self._handle_message(msg)
2621
-
2622
- def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
2623
- """
2624
- Adds an event handler function for a specific event type.
2625
-
2626
- Args:
2627
- event_type (str): Type of event to listen for ('done', 'error', or 'document')
2628
- handler (Callable): Function to handle the event
2629
- """
2630
- if event_type in self.event_handlers:
2631
- self.event_handlers[event_type].append(handler)
2632
-
2633
- def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
2634
- """
2635
- Dispatches an event to all registered handlers for that event type.
2636
-
2637
- Args:
2638
- event_type (str): Type of event to dispatch
2639
- detail (Dict[str, Any]): Event details/data to pass to handlers
2640
- """
2641
- if event_type in self.event_handlers:
2642
- for handler in self.event_handlers[event_type]:
2643
- handler(detail)
2644
-
2645
- async def _handle_message(self, msg: Dict[str, Any]) -> None:
2646
- """
2647
- Handles incoming WebSocket messages based on their type.
2648
-
2649
- Args:
2650
- msg (Dict[str, Any]): The message to handle
2651
- """
2652
- if msg['type'] == 'done':
2653
- self.status = 'completed'
2654
- self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
2655
- elif msg['type'] == 'error':
2656
- self.status = 'failed'
2657
- self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
2658
- elif msg['type'] == 'catchup':
2659
- self.status = msg['data']['status']
2660
- self.data.extend(msg['data'].get('data', []))
2661
- for doc in self.data:
2662
- self.dispatch_event('document', {'data': doc, 'id': self.id})
2663
- elif msg['type'] == 'document':
2664
- self.data.append(msg['data'])
2665
- self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
2666
-
2667
- class AsyncFirecrawlApp(FirecrawlApp):
2668
- """
2669
- Asynchronous version of FirecrawlApp that implements async methods using aiohttp.
2670
- Provides non-blocking alternatives to all FirecrawlApp operations.
2671
- """
2672
-
2673
- async def _async_request(
2674
- self,
2675
- method: str,
2676
- url: str,
2677
- headers: Dict[str, str],
2678
- data: Optional[Dict[str, Any]] = None,
2679
- retries: int = 3,
2680
- backoff_factor: float = 0.5) -> Dict[str, Any]:
2681
- """
2682
- Generic async request method with exponential backoff retry logic.
2683
-
2684
- Args:
2685
- method (str): The HTTP method to use (e.g., "GET" or "POST").
2686
- url (str): The URL to send the request to.
2687
- headers (Dict[str, str]): Headers to include in the request.
2688
- data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
2689
- retries (int): Maximum number of retry attempts (default: 3).
2690
- backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2691
- Delay will be backoff_factor * (2 ** retry_count).
2692
-
2693
- Returns:
2694
- Dict[str, Any]: The parsed JSON response from the server.
2695
-
2696
- Raises:
2697
- aiohttp.ClientError: If the request fails after all retries.
2698
- Exception: If max retries are exceeded or other errors occur.
2699
- """
2700
- async with aiohttp.ClientSession() as session:
2701
- for attempt in range(retries):
2702
- try:
2703
- async with session.request(
2704
- method=method, url=url, headers=headers, json=data
2705
- ) as response:
2706
- if response.status == 502:
2707
- await asyncio.sleep(backoff_factor * (2 ** attempt))
2708
- continue
2709
- if response.status >= 300:
2710
- await self._handle_error(response, f"make {method} request")
2711
- return await response.json()
2712
- except aiohttp.ClientError as e:
2713
- if attempt == retries - 1:
2714
- raise e
2715
- await asyncio.sleep(backoff_factor * (2 ** attempt))
2716
- raise Exception("Max retries exceeded")
2717
-
2718
- async def _async_post_request(
2719
- self, url: str, data: Dict[str, Any], headers: Dict[str, str],
2720
- retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2721
- """
2722
- Make an async POST request with exponential backoff retry logic.
2723
-
2724
- Args:
2725
- url (str): The URL to send the POST request to.
2726
- data (Dict[str, Any]): The JSON data to include in the request body.
2727
- headers (Dict[str, str]): Headers to include in the request.
2728
- retries (int): Maximum number of retry attempts (default: 3).
2729
- backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2730
- Delay will be backoff_factor * (2 ** retry_count).
2731
-
2732
- Returns:
2733
- Dict[str, Any]: The parsed JSON response from the server.
2734
-
2735
- Raises:
2736
- aiohttp.ClientError: If the request fails after all retries.
2737
- Exception: If max retries are exceeded or other errors occur.
2738
- """
2739
- return await self._async_request("POST", url, headers, data, retries, backoff_factor)
2740
-
2741
- async def _async_get_request(
2742
- self, url: str, headers: Dict[str, str],
2743
- retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2744
- """
2745
- Make an async GET request with exponential backoff retry logic.
2746
-
2747
- Args:
2748
- url (str): The URL to send the GET request to.
2749
- headers (Dict[str, str]): Headers to include in the request.
2750
- retries (int): Maximum number of retry attempts (default: 3).
2751
- backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2752
- Delay will be backoff_factor * (2 ** retry_count).
2753
-
2754
- Returns:
2755
- Dict[str, Any]: The parsed JSON response from the server.
2756
-
2757
- Raises:
2758
- aiohttp.ClientError: If the request fails after all retries.
2759
- Exception: If max retries are exceeded or other errors occur.
2760
- """
2761
- return await self._async_request("GET", url, headers, None, retries, backoff_factor)
2762
-
2763
- async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
2764
- """
2765
- Handle errors from async API responses with detailed error messages.
2766
-
2767
- Args:
2768
- response (aiohttp.ClientResponse): The response object from the failed request
2769
- action (str): Description of the action that was being attempted
2770
-
2771
- Raises:
2772
- aiohttp.ClientError: With a detailed error message based on the response status:
2773
- - 402: Payment Required
2774
- - 408: Request Timeout
2775
- - 409: Conflict
2776
- - 500: Internal Server Error
2777
- - Other: Unexpected error with status code
2778
- """
2779
- try:
2780
- error_data = await response.json()
2781
- error_message = error_data.get('error', 'No error message provided.')
2782
- error_details = error_data.get('details', 'No additional error details provided.')
2783
- except:
2784
- raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
2785
-
2786
- message = await self._get_async_error_message(response.status, action, error_message, error_details)
2787
-
2788
- raise aiohttp.ClientError(message)
2789
-
2790
- async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2791
- """
2792
- Generate a standardized error message based on HTTP status code for async operations.
2793
-
2794
- Args:
2795
- status_code (int): The HTTP status code from the response
2796
- action (str): Description of the action that was being performed
2797
- error_message (str): The error message from the API response
2798
- error_details (str): Additional error details from the API response
2799
-
2800
- Returns:
2801
- str: A formatted error message
2802
- """
2803
- return self._get_error_message(status_code, action, error_message, error_details)
2804
-
2805
- async def crawl_url_and_watch(
2806
- self,
2807
- url: str,
2808
- params: Optional[CrawlParams] = None,
2809
- idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2810
- """
2811
- Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket.
2812
-
2813
- Args:
2814
- url (str): Target URL to start crawling from
2815
- params (Optional[CrawlParams]): See CrawlParams model for configuration:
2816
- URL Discovery:
2817
- * includePaths - Patterns of URLs to include
2818
- * excludePaths - Patterns of URLs to exclude
2819
- * maxDepth - Maximum crawl depth
2820
- * maxDiscoveryDepth - Maximum depth for finding new URLs
2821
- * limit - Maximum pages to crawl
2822
-
2823
- Link Following:
2824
- * allowBackwardLinks - DEPRECATED: Use crawlEntireDomain instead
2825
- * crawlEntireDomain - Follow parent directory links
2826
- * allowExternalLinks - Follow external domain links
2827
- * ignoreSitemap - Skip sitemap.xml processing
2828
-
2829
- Advanced:
2830
- * scrapeOptions - Page scraping configuration
2831
- * webhook - Notification webhook settings
2832
- * deduplicateSimilarURLs - Remove similar URLs
2833
- * ignoreQueryParameters - Ignore URL parameters
2834
- * regexOnFullURL - Apply regex to full URLs
2835
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2836
-
2837
- Returns:
2838
- AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket
2839
-
2840
- Raises:
2841
- Exception: If crawl job fails to start
2842
- """
2843
- crawl_response = await self.async_crawl_url(url, idempotency_key=idempotency_key, **(params.dict(exclude_none=True) if params else {}))
2844
- if crawl_response.get('success') and 'id' in crawl_response:
2845
- return AsyncCrawlWatcher(crawl_response['id'], self)
2846
- else:
2847
- raise Exception("Crawl job failed to start")
2848
-
2849
- async def batch_scrape_urls_and_watch(
2850
- self,
2851
- urls: List[str],
2852
- params: Optional[ScrapeParams] = None,
2853
- idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2854
- """
2855
- Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress.
2856
-
2857
- Args:
2858
- urls (List[str]): List of URLs to scrape
2859
- params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
2860
-
2861
- Content Options:
2862
- * formats - Content formats to retrieve
2863
- * includeTags - HTML tags to include
2864
- * excludeTags - HTML tags to exclude
2865
- * onlyMainContent - Extract main content only
2866
-
2867
- Request Options:
2868
- * headers - Custom HTTP headers
2869
- * timeout - Request timeout (ms)
2870
- * mobile - Use mobile user agent
2871
- * proxy - Proxy type
2872
-
2873
- Extraction Options:
2874
- * extract - Content extraction config
2875
- * jsonOptions - JSON extraction config
2876
- * actions - Actions to perform
2877
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2878
-
2879
- Returns:
2880
- AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket
2881
-
2882
- Raises:
2883
- Exception: If batch scrape job fails to start
2884
- """
2885
- batch_response = await self.async_batch_scrape_urls(urls, idempotency_key=idempotency_key, **(params.dict(exclude_none=True) if params else {}))
2886
- if batch_response.get('success') and 'id' in batch_response:
2887
- return AsyncCrawlWatcher(batch_response['id'], self)
2888
- else:
2889
- raise Exception("Batch scrape job failed to start")
2890
-
2891
- async def scrape_url(
2892
- self,
2893
- url: str,
2894
- *,
2895
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
2896
- include_tags: Optional[List[str]] = None,
2897
- exclude_tags: Optional[List[str]] = None,
2898
- only_main_content: Optional[bool] = None,
2899
- wait_for: Optional[int] = None,
2900
- timeout: Optional[int] = None,
2901
- location: Optional[LocationConfig] = None,
2902
- mobile: Optional[bool] = None,
2903
- skip_tls_verification: Optional[bool] = None,
2904
- remove_base64_images: Optional[bool] = None,
2905
- block_ads: Optional[bool] = None,
2906
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
2907
- extract: Optional[JsonConfig] = None,
2908
- json_options: Optional[JsonConfig] = None,
2909
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2910
- **kwargs) -> ScrapeResponse[Any]:
2911
- """
2912
- Scrape a single URL asynchronously.
2913
-
2914
- Args:
2915
- url (str): Target URL to scrape
2916
- formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]]): Content types to retrieve (markdown/html/etc)
2917
- include_tags (Optional[List[str]]): HTML tags to include
2918
- exclude_tags (Optional[List[str]]): HTML tags to exclude
2919
- only_main_content (Optional[bool]): Extract main content only
2920
- wait_for (Optional[int]): Wait time in milliseconds before scraping
2921
- timeout (Optional[int]): Request timeout (ms)
2922
- location (Optional[LocationConfig]): Location configuration
2923
- mobile (Optional[bool]): Use mobile user agent
2924
- skip_tls_verification (Optional[bool]): Skip TLS verification
2925
- remove_base64_images (Optional[bool]): Remove base64 images
2926
- block_ads (Optional[bool]): Block ads
2927
- proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth/auto)
2928
- extract (Optional[JsonConfig]): Content extraction settings
2929
- json_options (Optional[JsonConfig]): JSON extraction settings
2930
- actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
2931
- **kwargs: Additional parameters to pass to the API
2932
-
2933
- Returns:
2934
- ScrapeResponse with:
2935
- * success - Whether scrape was successful
2936
- * markdown - Markdown content if requested
2937
- * html - HTML content if requested
2938
- * rawHtml - Raw HTML content if requested
2939
- * links - Extracted links if requested
2940
- * screenshot - Screenshot if requested
2941
- * extract - Extracted data if requested
2942
- * json - JSON data if requested
2943
- * error - Error message if scrape failed
2944
-
2945
- Raises:
2946
- Exception: If scraping fails
2947
- """
2948
- # Validate any additional kwargs
2949
- self._validate_kwargs(kwargs, "scrape_url")
2950
-
2951
- headers = self._prepare_headers()
2952
-
2953
- # Build scrape parameters
2954
- scrape_params = {
2955
- 'url': url,
2956
- 'origin': f"python-sdk@{version}"
2957
- }
2958
-
2959
- # Add optional parameters if provided and not None
2960
- if formats:
2961
- scrape_params['formats'] = formats
2962
- if include_tags:
2963
- scrape_params['includeTags'] = include_tags
2964
- if exclude_tags:
2965
- scrape_params['excludeTags'] = exclude_tags
2966
- if only_main_content is not None:
2967
- scrape_params['onlyMainContent'] = only_main_content
2968
- if wait_for:
2969
- scrape_params['waitFor'] = wait_for
2970
- if timeout:
2971
- scrape_params['timeout'] = timeout
2972
- if location:
2973
- scrape_params['location'] = location.dict(exclude_none=True)
2974
- if mobile is not None:
2975
- scrape_params['mobile'] = mobile
2976
- if skip_tls_verification is not None:
2977
- scrape_params['skipTlsVerification'] = skip_tls_verification
2978
- if remove_base64_images is not None:
2979
- scrape_params['removeBase64Images'] = remove_base64_images
2980
- if block_ads is not None:
2981
- scrape_params['blockAds'] = block_ads
2982
- if proxy:
2983
- scrape_params['proxy'] = proxy
2984
- if extract is not None:
2985
- extract = self._ensure_schema_dict(extract)
2986
- if isinstance(extract, dict) and "schema" in extract:
2987
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
2988
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
2989
- if json_options is not None:
2990
- json_options = self._ensure_schema_dict(json_options)
2991
- if isinstance(json_options, dict) and "schema" in json_options:
2992
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
2993
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
2994
- if actions:
2995
- scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
2996
-
2997
- if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
2998
- scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
2999
- if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
3000
- scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
3001
-
3002
- # Make async request
3003
- endpoint = f'/v1/scrape'
3004
- response = await self._async_post_request(
3005
- f'{self.api_url}{endpoint}',
3006
- scrape_params,
3007
- headers
3008
- )
3009
-
3010
- if response.get('success') and 'data' in response:
3011
- return ScrapeResponse(**response['data'])
3012
- elif "error" in response:
3013
- raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
3014
- else:
3015
- # Use the response content directly if possible, otherwise a generic message
3016
- error_content = response.get('error', str(response))
3017
- raise Exception(f'Failed to scrape URL. Error: {error_content}')
3018
-
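A minimal usage sketch for the async scrape_url above. It assumes the enclosing class is exported as AsyncFirecrawlApp and accepts an api_key, as in current firecrawl-py releases; the URL and key are placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    # Request markdown plus the extracted links, keeping only the main content.
    doc = await app.scrape_url(
        "https://example.com",
        formats=["markdown", "links"],
        only_main_content=True,
    )
    print((doc.markdown or "")[:200])
    print(doc.links)

asyncio.run(main())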
3019
- async def batch_scrape_urls(
3020
- self,
3021
- urls: List[str],
3022
- *,
3023
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3024
- headers: Optional[Dict[str, str]] = None,
3025
- include_tags: Optional[List[str]] = None,
3026
- exclude_tags: Optional[List[str]] = None,
3027
- only_main_content: Optional[bool] = None,
3028
- wait_for: Optional[int] = None,
3029
- timeout: Optional[int] = None,
3030
- location: Optional[LocationConfig] = None,
3031
- mobile: Optional[bool] = None,
3032
- skip_tls_verification: Optional[bool] = None,
3033
- remove_base64_images: Optional[bool] = None,
3034
- block_ads: Optional[bool] = None,
3035
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
3036
- extract: Optional[JsonConfig] = None,
3037
- json_options: Optional[JsonConfig] = None,
3038
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
3039
- agent: Optional[AgentOptions] = None,
3040
- poll_interval: Optional[int] = 2,
3041
- idempotency_key: Optional[str] = None,
3042
- **kwargs
3043
- ) -> BatchScrapeStatusResponse:
3044
- """
3045
- Asynchronously scrape multiple URLs and monitor until completion.
3046
-
3047
- Args:
3048
- urls (List[str]): URLs to scrape
3049
- formats (Optional[List[Literal]]): Content formats to retrieve
3050
- headers (Optional[Dict[str, str]]): Custom HTTP headers
3051
- include_tags (Optional[List[str]]): HTML tags to include
3052
- exclude_tags (Optional[List[str]]): HTML tags to exclude
3053
- only_main_content (Optional[bool]): Extract main content only
3054
- wait_for (Optional[int]): Wait time in milliseconds
3055
- timeout (Optional[int]): Request timeout in milliseconds
3056
- location (Optional[LocationConfig]): Location configuration
3057
- mobile (Optional[bool]): Use mobile user agent
3058
- skip_tls_verification (Optional[bool]): Skip TLS verification
3059
- remove_base64_images (Optional[bool]): Remove base64 encoded images
3060
- block_ads (Optional[bool]): Block advertisements
3061
- proxy (Optional[Literal]): Proxy type to use
3062
- extract (Optional[JsonConfig]): Content extraction config
3063
- json_options (Optional[JsonConfig]): JSON extraction config
3064
- actions (Optional[List[Union]]): Actions to perform
3065
- agent (Optional[AgentOptions]): Agent configuration
3066
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
3067
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3068
- **kwargs: Additional parameters to pass to the API
3069
-
3070
- Returns:
3071
- BatchScrapeStatusResponse with:
3072
- * Scraping status and progress
3073
- * Scraped content for each URL
3074
- * Success/error information
3075
-
3076
- Raises:
3077
- Exception: If batch scrape fails
3078
- """
3079
- # Validate any additional kwargs
3080
- self._validate_kwargs(kwargs, "batch_scrape_urls")
3081
-
3082
- scrape_params = {}
3083
-
3084
- # Add individual parameters
3085
- if formats is not None:
3086
- scrape_params['formats'] = formats
3087
- if headers is not None:
3088
- scrape_params['headers'] = headers
3089
- if include_tags is not None:
3090
- scrape_params['includeTags'] = include_tags
3091
- if exclude_tags is not None:
3092
- scrape_params['excludeTags'] = exclude_tags
3093
- if only_main_content is not None:
3094
- scrape_params['onlyMainContent'] = only_main_content
3095
- if wait_for is not None:
3096
- scrape_params['waitFor'] = wait_for
3097
- if timeout is not None:
3098
- scrape_params['timeout'] = timeout
3099
- if location is not None:
3100
- scrape_params['location'] = location.dict(exclude_none=True)
3101
- if mobile is not None:
3102
- scrape_params['mobile'] = mobile
3103
- if skip_tls_verification is not None:
3104
- scrape_params['skipTlsVerification'] = skip_tls_verification
3105
- if remove_base64_images is not None:
3106
- scrape_params['removeBase64Images'] = remove_base64_images
3107
- if block_ads is not None:
3108
- scrape_params['blockAds'] = block_ads
3109
- if proxy is not None:
3110
- scrape_params['proxy'] = proxy
3111
- if extract is not None:
3112
- extract = self._ensure_schema_dict(extract)
3113
- if isinstance(extract, dict) and "schema" in extract:
3114
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
3115
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3116
- if json_options is not None:
3117
- json_options = self._ensure_schema_dict(json_options)
3118
- if isinstance(json_options, dict) and "schema" in json_options:
3119
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3120
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3121
- if actions is not None:
3122
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3123
- if agent is not None:
3124
- scrape_params['agent'] = agent.dict(exclude_none=True)
3125
-
3126
- # Add any additional kwargs
3127
- scrape_params.update(kwargs)
3128
-
3129
- # Create final params object
3130
- final_params = ScrapeParams(**scrape_params)
3131
- params_dict = final_params.dict(exclude_none=True)
3132
- params_dict['urls'] = urls
3133
- params_dict['origin'] = f"python-sdk@{version}"
3134
-
3135
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3136
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3137
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3138
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3139
-
3140
- # Make request
3141
- headers = self._prepare_headers(idempotency_key)
3142
- response = await self._async_post_request(
3143
- f'{self.api_url}/v1/batch/scrape',
3144
- params_dict,
3145
- headers
3146
- )
3147
-
3148
- if response.get('success'):
3149
- job_id = response.get('id')
3150
- if not job_id:
3151
- raise Exception('Batch scrape job ID missing from Firecrawl response.')
3152
- return await self._async_monitor_job_status(job_id, headers, poll_interval)
3153
- else:
3154
- error_msg = response.get('error', 'Unknown error')
3155
- raise Exception(f'Failed to start batch scrape job. Error: {error_msg}')
3156
-
3157
-
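A hedged sketch of the waiting variant above, which starts the batch job and polls until it finishes; the class name, key and URLs are placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    # Scrape both URLs as markdown and poll every 5 seconds until completion.
    job = await app.batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
        poll_interval=5,
    )
    print(job.status, f"{job.completed}/{job.total} pages scraped")

asyncio.run(main())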
3158
- async def async_batch_scrape_urls(
3159
- self,
3160
- urls: List[str],
3161
- *,
3162
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3163
- headers: Optional[Dict[str, str]] = None,
3164
- include_tags: Optional[List[str]] = None,
3165
- exclude_tags: Optional[List[str]] = None,
3166
- only_main_content: Optional[bool] = None,
3167
- wait_for: Optional[int] = None,
3168
- timeout: Optional[int] = None,
3169
- location: Optional[LocationConfig] = None,
3170
- mobile: Optional[bool] = None,
3171
- skip_tls_verification: Optional[bool] = None,
3172
- remove_base64_images: Optional[bool] = None,
3173
- block_ads: Optional[bool] = None,
3174
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
3175
- extract: Optional[JsonConfig] = None,
3176
- json_options: Optional[JsonConfig] = None,
3177
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
3178
- agent: Optional[AgentOptions] = None,
3179
- idempotency_key: Optional[str] = None,
3180
- **kwargs
3181
- ) -> BatchScrapeResponse:
3182
- """
3183
- Initiate a batch scrape job asynchronously.
3184
-
3185
- Args:
3186
- urls (List[str]): URLs to scrape
3187
- formats (Optional[List[Literal]]): Content formats to retrieve
3188
- headers (Optional[Dict[str, str]]): Custom HTTP headers
3189
- include_tags (Optional[List[str]]): HTML tags to include
3190
- exclude_tags (Optional[List[str]]): HTML tags to exclude
3191
- only_main_content (Optional[bool]): Extract main content only
3192
- wait_for (Optional[int]): Wait time in milliseconds
3193
- timeout (Optional[int]): Request timeout in milliseconds
3194
- location (Optional[LocationConfig]): Location configuration
3195
- mobile (Optional[bool]): Use mobile user agent
3196
- skip_tls_verification (Optional[bool]): Skip TLS verification
3197
- remove_base64_images (Optional[bool]): Remove base64 encoded images
3198
- block_ads (Optional[bool]): Block advertisements
3199
- proxy (Optional[Literal]): Proxy type to use
3200
- extract (Optional[JsonConfig]): Content extraction config
3201
- json_options (Optional[JsonConfig]): JSON extraction config
3202
- actions (Optional[List[Union]]): Actions to perform
3203
- agent (Optional[AgentOptions]): Agent configuration
3204
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3205
- **kwargs: Additional parameters to pass to the API
3206
-
3207
- Returns:
3208
- BatchScrapeResponse with:
3209
- * success - Whether job started successfully
3210
- * id - Unique identifier for the job
3211
- * url - Status check URL
3212
- * error - Error message if start failed
3213
-
3214
- Raises:
3215
- Exception: If job initiation fails
3216
- """
3217
- # Validate any additional kwargs
3218
- self._validate_kwargs(kwargs, "async_batch_scrape_urls")
3219
-
3220
- scrape_params = {}
3221
-
3222
- # Add individual parameters
3223
- if formats is not None:
3224
- scrape_params['formats'] = formats
3225
- if headers is not None:
3226
- scrape_params['headers'] = headers
3227
- if include_tags is not None:
3228
- scrape_params['includeTags'] = include_tags
3229
- if exclude_tags is not None:
3230
- scrape_params['excludeTags'] = exclude_tags
3231
- if only_main_content is not None:
3232
- scrape_params['onlyMainContent'] = only_main_content
3233
- if wait_for is not None:
3234
- scrape_params['waitFor'] = wait_for
3235
- if timeout is not None:
3236
- scrape_params['timeout'] = timeout
3237
- if location is not None:
3238
- scrape_params['location'] = location.dict(exclude_none=True)
3239
- if mobile is not None:
3240
- scrape_params['mobile'] = mobile
3241
- if skip_tls_verification is not None:
3242
- scrape_params['skipTlsVerification'] = skip_tls_verification
3243
- if remove_base64_images is not None:
3244
- scrape_params['removeBase64Images'] = remove_base64_images
3245
- if block_ads is not None:
3246
- scrape_params['blockAds'] = block_ads
3247
- if proxy is not None:
3248
- scrape_params['proxy'] = proxy
3249
- if extract is not None:
3250
- extract = self._ensure_schema_dict(extract)
3251
- if isinstance(extract, dict) and "schema" in extract:
3252
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
3253
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3254
- if json_options is not None:
3255
- json_options = self._ensure_schema_dict(json_options)
3256
- if isinstance(json_options, dict) and "schema" in json_options:
3257
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3258
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3259
- if actions is not None:
3260
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3261
- if agent is not None:
3262
- scrape_params['agent'] = agent.dict(exclude_none=True)
3263
-
3264
- # Add any additional kwargs
3265
- scrape_params.update(kwargs)
3266
-
3267
- # Create final params object
3268
- final_params = ScrapeParams(**scrape_params)
3269
- params_dict = final_params.dict(exclude_none=True)
3270
- params_dict['urls'] = urls
3271
- params_dict['origin'] = f"python-sdk@{version}"
3272
-
3273
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3274
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3275
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3276
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3277
-
3278
- # Make request
3279
- headers = self._prepare_headers(idempotency_key)
3280
- response = await self._async_post_request(
3281
- f'{self.api_url}/v1/batch/scrape',
3282
- params_dict,
3283
- headers
3284
- )
3285
-
3286
- if response.get('success'):
3287
- try:
3288
- return BatchScrapeResponse(**response)
3289
- except Exception:
3290
- raise Exception('Failed to parse Firecrawl response into BatchScrapeResponse.')
3291
- else:
3292
- raise Exception(f'Failed to start batch scrape job. Error: {response.get("error", response)}')
3293
-
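For comparison, the fire-and-forget variant above only returns the job handle; you check on it yourself later. A sketch under the same placeholder assumptions as the earlier examples:

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    started = await app.async_batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
    )
    await asyncio.sleep(10)  # do other work, then check on the job
    status = await app.check_batch_scrape_status(started.id)
    print(status.status, f"{status.completed}/{status.total}")

asyncio.run(main())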
3294
- async def crawl_url(
3295
- self,
3296
- url: str,
3297
- *,
3298
- include_paths: Optional[List[str]] = None,
3299
- exclude_paths: Optional[List[str]] = None,
3300
- max_depth: Optional[int] = None,
3301
- max_discovery_depth: Optional[int] = None,
3302
- limit: Optional[int] = None,
3303
- allow_backward_links: Optional[bool] = None,
3304
- crawl_entire_domain: Optional[bool] = None,
3305
- allow_external_links: Optional[bool] = None,
3306
- ignore_sitemap: Optional[bool] = None,
3307
- scrape_options: Optional[ScrapeOptions] = None,
3308
- webhook: Optional[Union[str, WebhookConfig]] = None,
3309
- deduplicate_similar_urls: Optional[bool] = None,
3310
- ignore_query_parameters: Optional[bool] = None,
3311
- regex_on_full_url: Optional[bool] = None,
3312
- delay: Optional[int] = None,
3313
- poll_interval: Optional[int] = 2,
3314
- idempotency_key: Optional[str] = None,
3315
- **kwargs
3316
- ) -> CrawlStatusResponse:
3317
- """
3318
- Crawl a website starting from a URL.
3319
-
3320
- Args:
3321
- url (str): Target URL to start crawling from
3322
- include_paths (Optional[List[str]]): Patterns of URLs to include
3323
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3324
- max_depth (Optional[int]): Maximum crawl depth
3325
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3326
- limit (Optional[int]): Maximum pages to crawl
3327
- allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
3328
- crawl_entire_domain (Optional[bool]): Follow parent directory links
3329
- allow_external_links (Optional[bool]): Follow external domain links
3330
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3331
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3332
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3333
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3334
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
3335
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
3336
- delay (Optional[int]): Delay in seconds between scrapes
3337
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
3338
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3339
- **kwargs: Additional parameters to pass to the API
3340
-
3341
- Returns:
3342
- CrawlStatusResponse with:
3343
- * Crawling status and progress
3344
- * Crawled page contents
3345
- * Success/error information
3346
-
3347
- Raises:
3348
- Exception: If crawl fails
3349
- """
3350
- # Validate any additional kwargs
3351
- self._validate_kwargs(kwargs, "crawl_url")
3352
-
3353
- crawl_params = {}
3354
-
3355
- # Add individual parameters
3356
- if include_paths is not None:
3357
- crawl_params['includePaths'] = include_paths
3358
- if exclude_paths is not None:
3359
- crawl_params['excludePaths'] = exclude_paths
3360
- if max_depth is not None:
3361
- crawl_params['maxDepth'] = max_depth
3362
- if max_discovery_depth is not None:
3363
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3364
- if limit is not None:
3365
- crawl_params['limit'] = limit
3366
- if crawl_entire_domain is not None:
3367
- crawl_params['crawlEntireDomain'] = crawl_entire_domain
3368
- elif allow_backward_links is not None:
3369
- crawl_params['allowBackwardLinks'] = allow_backward_links
3370
- if allow_external_links is not None:
3371
- crawl_params['allowExternalLinks'] = allow_external_links
3372
- if ignore_sitemap is not None:
3373
- crawl_params['ignoreSitemap'] = ignore_sitemap
3374
- if scrape_options is not None:
3375
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3376
- if webhook is not None:
3377
- crawl_params['webhook'] = webhook
3378
- if deduplicate_similar_urls is not None:
3379
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3380
- if ignore_query_parameters is not None:
3381
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3382
- if regex_on_full_url is not None:
3383
- crawl_params['regexOnFullURL'] = regex_on_full_url
3384
- if delay is not None:
3385
- crawl_params['delay'] = delay
3386
-
3387
- # Add any additional kwargs
3388
- crawl_params.update(kwargs)
3389
-
3390
- # Create final params object
3391
- final_params = CrawlParams(**crawl_params)
3392
- params_dict = final_params.dict(exclude_none=True)
3393
- params_dict['url'] = url
3394
- params_dict['origin'] = f"python-sdk@{version}"
3395
- # Make request
3396
- headers = self._prepare_headers(idempotency_key)
3397
- response = await self._async_post_request(
3398
- f'{self.api_url}/v1/crawl', params_dict, headers)
3399
-
3400
- if response.get('success'):
3401
- job_id = response.get('id')
3402
- if not job_id:
3403
- raise Exception('Crawl job ID missing from Firecrawl response.')
3404
- return await self._async_monitor_job_status(job_id, headers, poll_interval)
3405
- else:
3406
- error_msg = response.get('error', 'Unknown error')
3407
- raise Exception(f'Failed to start crawl job. Error: {error_msg}')
3408
-
3409
-
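A usage sketch for the blocking crawl above, restricted to a path pattern and a page limit; class name, key and URL are placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    # Crawl only the /blog section, at most 25 pages, polling every 5 seconds.
    result = await app.crawl_url(
        "https://example.com",
        include_paths=["/blog/.*"],
        limit=25,
        poll_interval=5,
    )
    print(result.status, f"{result.completed} pages crawled")

asyncio.run(main())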
3410
- async def async_crawl_url(
3411
- self,
3412
- url: str,
3413
- *,
3414
- include_paths: Optional[List[str]] = None,
3415
- exclude_paths: Optional[List[str]] = None,
3416
- max_depth: Optional[int] = None,
3417
- max_discovery_depth: Optional[int] = None,
3418
- limit: Optional[int] = None,
3419
- allow_backward_links: Optional[bool] = None,
3420
- crawl_entire_domain: Optional[bool] = None,
3421
- allow_external_links: Optional[bool] = None,
3422
- ignore_sitemap: Optional[bool] = None,
3423
- scrape_options: Optional[ScrapeOptions] = None,
3424
- webhook: Optional[Union[str, WebhookConfig]] = None,
3425
- deduplicate_similar_urls: Optional[bool] = None,
3426
- ignore_query_parameters: Optional[bool] = None,
3427
- regex_on_full_url: Optional[bool] = None,
3428
- delay: Optional[int] = None,
3429
- poll_interval: Optional[int] = 2,
3430
- idempotency_key: Optional[str] = None,
3431
- **kwargs
3432
- ) -> CrawlResponse:
3433
- """
3434
- Start an asynchronous crawl job.
3435
-
3436
- Args:
3437
- url (str): Target URL to start crawling from
3438
- include_paths (Optional[List[str]]): Patterns of URLs to include
3439
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3440
- max_depth (Optional[int]): Maximum crawl depth
3441
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3442
- limit (Optional[int]): Maximum pages to crawl
3443
- allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
3444
- crawl_entire_domain (Optional[bool]): Follow parent directory links
3445
- allow_external_links (Optional[bool]): Follow external domain links
3446
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3447
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3448
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3449
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3450
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
3451
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
3452
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3453
- **kwargs: Additional parameters to pass to the API
3454
-
3455
- Returns:
3456
- CrawlResponse with:
3457
- * success - Whether crawl started successfully
3458
- * id - Unique identifier for the crawl job
3459
- * url - Status check URL for the crawl
3460
- * error - Error message if start failed
3461
-
3462
- Raises:
3463
- Exception: If crawl initiation fails
3464
- """
3465
- crawl_params = {}
3466
-
3467
- # Add individual parameters
3468
- if include_paths is not None:
3469
- crawl_params['includePaths'] = include_paths
3470
- if exclude_paths is not None:
3471
- crawl_params['excludePaths'] = exclude_paths
3472
- if max_depth is not None:
3473
- crawl_params['maxDepth'] = max_depth
3474
- if max_discovery_depth is not None:
3475
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3476
- if limit is not None:
3477
- crawl_params['limit'] = limit
3478
- if crawl_entire_domain is not None:
3479
- crawl_params['crawlEntireDomain'] = crawl_entire_domain
3480
- elif allow_backward_links is not None:
3481
- crawl_params['allowBackwardLinks'] = allow_backward_links
3482
- if allow_external_links is not None:
3483
- crawl_params['allowExternalLinks'] = allow_external_links
3484
- if ignore_sitemap is not None:
3485
- crawl_params['ignoreSitemap'] = ignore_sitemap
3486
- if scrape_options is not None:
3487
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3488
- if webhook is not None:
3489
- crawl_params['webhook'] = webhook
3490
- if deduplicate_similar_urls is not None:
3491
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3492
- if ignore_query_parameters is not None:
3493
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3494
- if regex_on_full_url is not None:
3495
- crawl_params['regexOnFullURL'] = regex_on_full_url
3496
- if delay is not None:
3497
- crawl_params['delay'] = delay
3498
-
3499
- # Add any additional kwargs
3500
- crawl_params.update(kwargs)
3501
-
3502
- # Create final params object
3503
- final_params = CrawlParams(**crawl_params)
3504
- params_dict = final_params.dict(exclude_none=True)
3505
- params_dict['url'] = url
3506
- params_dict['origin'] = f"python-sdk@{version}"
3507
-
3508
- # Make request
3509
- headers = self._prepare_headers(idempotency_key)
3510
- response = await self._async_post_request(
3511
- f'{self.api_url}/v1/crawl',
3512
- params_dict,
3513
- headers
3514
- )
3515
-
3516
- if response.get('success'):
3517
- try:
3518
- return CrawlResponse(**response)
3519
- except Exception:
3520
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
3521
- else:
3522
- raise Exception(f'Failed to start crawl job. Error: {response.get("error", response)}')
3523
-
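The non-blocking variant pairs naturally with check_crawl_status and cancel_crawl, both defined further down. A sketch under the same placeholder assumptions:

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    started = await app.async_crawl_url("https://example.com", limit=100)
    print("crawl job id:", started.id)
    # Poll once, then cancel if the crawl is still running.
    status = await app.check_crawl_status(started.id)
    if status.status != "completed":
        print(await app.cancel_crawl(started.id))

asyncio.run(main())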
3524
- async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
3525
- """
3526
- Check the status and results of an asynchronous crawl job.
3527
-
3528
- Args:
3529
- id (str): Unique identifier for the crawl job
3530
-
3531
- Returns:
3532
- CrawlStatusResponse containing:
3533
- Status Information:
3534
- * status - Current state (scraping/completed/failed/cancelled)
3535
- * completed - Number of pages crawled
3536
- * total - Total pages to crawl
3537
- * creditsUsed - API credits consumed
3538
- * expiresAt - Data expiration timestamp
3539
-
3540
- Results:
3541
- * data - List of crawled documents
3542
- * next - URL for next page of results (if paginated)
3543
- * success - Whether status check succeeded
3544
- * error - Error message if failed
3545
-
3546
- Raises:
3547
- Exception: If status check fails
3548
- """
3549
- headers = self._prepare_headers()
3550
- endpoint = f'/v1/crawl/{id}'
3551
-
3552
- status_data = await self._async_get_request(
3553
- f'{self.api_url}{endpoint}',
3554
- headers
3555
- )
3556
-
3557
- if status_data.get('status') == 'completed':
3558
- if 'data' in status_data:
3559
- data = status_data['data']
3560
- while 'next' in status_data:
3561
- if len(status_data['data']) == 0:
3562
- break
3563
- next_url = status_data.get('next')
3564
- if not next_url:
3565
- logger.warning("Expected 'next' URL is missing.")
3566
- break
3567
- next_data = await self._async_get_request(next_url, headers)
3568
- data.extend(next_data.get('data', []))
3569
- status_data = next_data
3570
- status_data['data'] = data
3571
- # Create CrawlStatusResponse object from status data
3572
- response = CrawlStatusResponse(
3573
- status=status_data.get('status'),
3574
- total=status_data.get('total'),
3575
- completed=status_data.get('completed'),
3576
- creditsUsed=status_data.get('creditsUsed'),
3577
- expiresAt=status_data.get('expiresAt'),
3578
- data=status_data.get('data'),
3579
- success=False if 'error' in status_data else True
3580
- )
3581
-
3582
- if 'error' in status_data:
3583
- response.error = status_data.get('error')
3584
-
3585
- if 'next' in status_data:
3586
- response.next = status_data.get('next')
3587
-
3588
- return response
3589
-
3590
- async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
3591
- """
3592
- Monitor the status of an asynchronous job until completion.
3593
-
3594
- Args:
3595
- id (str): The ID of the job to monitor
3596
- headers (Dict[str, str]): Headers to include in status check requests
3597
- poll_interval (int): Seconds between status checks (default: 2)
3598
-
3599
- Returns:
3600
- CrawlStatusResponse: The job results if completed successfully
3601
-
3602
- Raises:
3603
- Exception: If the job fails or an error occurs during status checks
3604
- """
3605
- while True:
3606
- status_data = await self._async_get_request(
3607
- f'{self.api_url}/v1/crawl/{id}',
3608
- headers
3609
- )
3610
-
3611
- if status_data.get('status') == 'completed':
3612
- if 'data' in status_data:
3613
- data = status_data['data']
3614
- while 'next' in status_data:
3615
- if len(status_data['data']) == 0:
3616
- break
3617
- next_url = status_data.get('next')
3618
- if not next_url:
3619
- logger.warning("Expected 'next' URL is missing.")
3620
- break
3621
- next_data = await self._async_get_request(next_url, headers)
3622
- data.extend(next_data.get('data', []))
3623
- status_data = next_data
3624
- status_data['data'] = data
3625
- return CrawlStatusResponse(**status_data)
3626
- else:
3627
- raise Exception('Job completed but no data was returned')
3628
- elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
3629
- await asyncio.sleep(max(poll_interval, 2))
3630
- else:
3631
- raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
3632
-
3633
- async def map_url(
3634
- self,
3635
- url: str,
3636
- *,
3637
- search: Optional[str] = None,
3638
- ignore_sitemap: Optional[bool] = None,
3639
- include_subdomains: Optional[bool] = None,
3640
- sitemap_only: Optional[bool] = None,
3641
- limit: Optional[int] = None,
3642
- timeout: Optional[int] = None,
3643
- params: Optional[MapParams] = None) -> MapResponse:
3644
- """
3645
- Asynchronously map and discover links from a URL.
3646
-
3647
- Args:
3648
- url (str): Target URL to map
3649
- params (Optional[MapParams]): See MapParams model:
3650
- Discovery Options:
3651
- * search - Filter pattern for URLs
3652
- * ignoreSitemap - Skip sitemap.xml
3653
- * includeSubdomains - Include subdomain links
3654
- * sitemapOnly - Only use sitemap.xml
3655
-
3656
- Limits:
3657
- * limit - Max URLs to return
3658
- * timeout - Request timeout (ms)
3659
-
3660
- Returns:
3661
- MapResponse with:
3662
- * Discovered URLs
3663
- * Success/error status
3664
-
3665
- Raises:
3666
- Exception: If mapping fails
3667
- """
3668
- map_params = {}
3669
- if params:
3670
- map_params.update(params.dict(exclude_none=True))
3671
-
3672
- # Add individual parameters
3673
- if search is not None:
3674
- map_params['search'] = search
3675
- if ignore_sitemap is not None:
3676
- map_params['ignoreSitemap'] = ignore_sitemap
3677
- if include_subdomains is not None:
3678
- map_params['includeSubdomains'] = include_subdomains
3679
- if sitemap_only is not None:
3680
- map_params['sitemapOnly'] = sitemap_only
3681
- if limit is not None:
3682
- map_params['limit'] = limit
3683
- if timeout is not None:
3684
- map_params['timeout'] = timeout
3685
-
3686
- # Create final params object
3687
- final_params = MapParams(**map_params)
3688
- params_dict = final_params.dict(exclude_none=True)
3689
- params_dict['url'] = url
3690
- params_dict['origin'] = f"python-sdk@{version}"
3691
-
3692
- # Make request
3693
- endpoint = f'/v1/map'
3694
- response = await self._async_post_request(
3695
- f'{self.api_url}{endpoint}',
3696
- params_dict,
3697
- headers={"Authorization": f"Bearer {self.api_key}"}
3698
- )
3699
-
3700
- if response.get('success') and 'links' in response:
3701
- return MapResponse(**response)
3702
- elif 'error' in response:
3703
- raise Exception(f'Failed to map URL. Error: {response["error"]}')
3704
- else:
3705
- raise Exception(f'Failed to map URL. Error: {response}')
3706
-
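A sketch of link discovery with map_url above, filtered by a search term and capped at 50 results; placeholders as before.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    result = await app.map_url("https://example.com", search="docs", limit=50)
    for link in result.links or []:
        print(link)

asyncio.run(main())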
3707
- async def extract(
3708
- self,
3709
- urls: Optional[List[str]] = None,
3710
- *,
3711
- prompt: Optional[str] = None,
3712
- schema: Optional[Any] = None,
3713
- system_prompt: Optional[str] = None,
3714
- allow_external_links: Optional[bool] = False,
3715
- enable_web_search: Optional[bool] = False,
3716
- show_sources: Optional[bool] = False,
3717
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3718
-
3719
- """
3720
- Asynchronously extract structured information from URLs.
3721
-
3722
- Args:
3723
- urls (Optional[List[str]]): URLs to extract from
3724
- prompt (Optional[str]): Custom extraction prompt
3725
- schema (Optional[Any]): JSON schema/Pydantic model
3726
- system_prompt (Optional[str]): System context
3727
- allow_external_links (Optional[bool]): Follow external links
3728
- enable_web_search (Optional[bool]): Enable web search
3729
- show_sources (Optional[bool]): Include source URLs
3730
- agent (Optional[Dict[str, Any]]): Agent configuration
3731
-
3732
- Returns:
3733
- ExtractResponse with:
3734
- * Structured data matching schema
3735
- * Source information if requested
3736
- * Success/error status
3737
-
3738
- Raises:
3739
- ValueError: If prompt/schema missing or extraction fails
3740
- """
3741
- headers = self._prepare_headers()
3742
-
3743
- if not prompt and not schema:
3744
- raise ValueError("Either prompt or schema is required")
3745
-
3746
- if not urls and not prompt:
3747
- raise ValueError("Either urls or prompt is required")
3748
-
3749
- if schema:
3750
- schema = self._ensure_schema_dict(schema)
3751
-
3752
- request_data = {
3753
- 'urls': urls or [],
3754
- 'allowExternalLinks': allow_external_links,
3755
- 'enableWebSearch': enable_web_search,
3756
- 'showSources': show_sources,
3757
- 'schema': schema,
3758
- 'origin': f'python-sdk@{version}'
3759
- }
3760
-
3761
- # Only add prompt and systemPrompt if they exist
3762
- if prompt:
3763
- request_data['prompt'] = prompt
3764
- if system_prompt:
3765
- request_data['systemPrompt'] = system_prompt
3766
-
3767
- if agent:
3768
- request_data['agent'] = agent
3769
-
3770
- response = await self._async_post_request(
3771
- f'{self.api_url}/v1/extract',
3772
- request_data,
3773
- headers
3774
- )
3775
-
3776
- if response.get('success'):
3777
- job_id = response.get('id')
3778
- if not job_id:
3779
- raise Exception('Job ID not returned from extract request.')
3780
-
3781
- while True:
3782
- status_data = await self._async_get_request(
3783
- f'{self.api_url}/v1/extract/{job_id}',
3784
- headers
3785
- )
3786
-
3787
- if status_data['status'] == 'completed':
3788
- return ExtractResponse(**status_data)
3789
- elif status_data['status'] in ['failed', 'cancelled']:
3790
- raise Exception(f'Extract job {status_data["status"]}. Error: {status_data.get("error")}')
3791
-
3792
- await asyncio.sleep(2)
3793
- else:
3794
- raise Exception(f'Failed to extract. Error: {response.get("error")}')
3795
-
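Because extract passes the schema through _ensure_schema_dict, a Pydantic model can be supplied directly. A hedged sketch; the model, prompt and URL are invented for illustration:

import asyncio
from typing import List
from pydantic import BaseModel
from firecrawl import AsyncFirecrawlApp  # assumed export name

class PricingPlan(BaseModel):
    name: str
    monthly_price: str

class PricingPage(BaseModel):
    plans: List[PricingPlan]

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    result = await app.extract(
        ["https://example.com/pricing"],
        prompt="List every pricing plan and its monthly price.",
        schema=PricingPage,
    )
    print(result.data)

asyncio.run(main())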
3796
- async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
3797
- """
3798
- Check the status of an asynchronous batch scrape job.
3799
-
3800
- Args:
3801
- id (str): The ID of the batch scrape job
3802
-
3803
- Returns:
3804
- BatchScrapeStatusResponse containing:
3805
- Status Information:
3806
- * status - Current state (scraping/completed/failed/cancelled)
3807
- * completed - Number of URLs scraped
3808
- * total - Total URLs to scrape
3809
- * creditsUsed - API credits consumed
3810
- * expiresAt - Data expiration timestamp
3811
-
3812
- Results:
3813
- * data - List of scraped documents
3814
- * next - URL for next page of results (if paginated)
3815
- * success - Whether status check succeeded
3816
- * error - Error message if failed
3817
-
3818
- Raises:
3819
- Exception: If status check fails
3820
- """
3821
- headers = self._prepare_headers()
3822
- endpoint = f'/v1/batch/scrape/{id}'
3823
-
3824
- status_data = await self._async_get_request(
3825
- f'{self.api_url}{endpoint}',
3826
- headers
3827
- )
3828
-
3829
- if status_data['status'] == 'completed':
3830
- if 'data' in status_data:
3831
- data = status_data['data']
3832
- while 'next' in status_data:
3833
- if len(status_data['data']) == 0:
3834
- break
3835
- next_url = status_data.get('next')
3836
- if not next_url:
3837
- logger.warning("Expected 'next' URL is missing.")
3838
- break
3839
- next_data = await self._async_get_request(next_url, headers)
3840
- data.extend(next_data.get('data', []))
3841
- status_data = next_data
3842
- status_data['data'] = data
3843
-
3844
- response = BatchScrapeStatusResponse(
3845
- status=status_data.get('status'),
3846
- total=status_data.get('total'),
3847
- completed=status_data.get('completed'),
3848
- creditsUsed=status_data.get('creditsUsed'),
3849
- expiresAt=status_data.get('expiresAt'),
3850
- data=status_data.get('data')
3851
- )
3852
-
3853
- if 'error' in status_data:
3854
- response.error = status_data['error']
3855
- if 'next' in status_data:
3856
- response.next = status_data['next']
3857
- # Pydantic models do not support item assignment or dict unpacking,
3858
- # so set the remaining fields as attributes and return the model itself.
3859
- response.success = 'error' not in status_data
3860
- return response
3861
- **response
3862
- }
3863
-
3864
- async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
3865
- """
3866
- Get information about errors from an asynchronous batch scrape job.
3867
-
3868
- Args:
3869
- id (str): The ID of the batch scrape job
3870
-
3871
- Returns:
3872
- CrawlErrorsResponse containing:
3873
- errors (List[Dict[str, str]]): List of errors with fields:
3874
- * id (str): Error ID
3875
- * timestamp (str): When the error occurred
3876
- * url (str): URL that caused the error
3877
- * error (str): Error message
3878
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3879
-
3880
- Raises:
3881
- Exception: If error check fails
3882
- """
3883
- headers = self._prepare_headers()
3884
- return await self._async_get_request(
3885
- f'{self.api_url}/v1/batch/scrape/{id}/errors',
3886
- headers
3887
- )
3888
-
3889
- async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
3890
- """
3891
- Get information about errors from an asynchronous crawl job.
3892
-
3893
- Args:
3894
- id (str): The ID of the crawl job
3895
-
3896
- Returns:
3897
- CrawlErrorsResponse containing:
3898
- * errors (List[Dict[str, str]]): List of errors with fields:
3899
- - id (str): Error ID
3900
- - timestamp (str): When the error occurred
3901
- - url (str): URL that caused the error
3902
- - error (str): Error message
3903
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3904
-
3905
- Raises:
3906
- Exception: If error check fails
3907
- """
3908
- headers = self._prepare_headers()
3909
- return await self._async_get_request(
3910
- f'{self.api_url}/v1/crawl/{id}/errors',
3911
- headers
3912
- )
3913
-
3914
- async def cancel_crawl(self, id: str) -> Dict[str, Any]:
3915
- """
3916
- Cancel an asynchronous crawl job.
3917
-
3918
- Args:
3919
- id (str): The ID of the crawl job to cancel
3920
-
3921
- Returns:
3922
- Dict[str, Any] containing:
3923
- * success (bool): Whether cancellation was successful
3924
- * error (str, optional): Error message if cancellation failed
3925
-
3926
- Raises:
3927
- Exception: If cancellation fails
3928
- """
3929
- headers = self._prepare_headers()
3930
- async with aiohttp.ClientSession() as session:
3931
- async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
3932
- return await response.json()
3933
-
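A short sketch combining the error and cancellation helpers above for an already-running crawl; the job id is a placeholder.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    job_id = "an-existing-crawl-id"  # placeholder
    print(await app.check_crawl_errors(job_id))
    # Stop the job if it is no longer needed.
    print(await app.cancel_crawl(job_id))

asyncio.run(main())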
3934
- async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
3935
- """
3936
- Check the status of an asynchronous extraction job.
3937
-
3938
- Args:
3939
- job_id (str): The ID of the extraction job
3940
-
3941
- Returns:
3942
- ExtractResponse[Any] with:
3943
- * success (bool): Whether request succeeded
3944
- * data (Optional[Any]): Extracted data matching schema
3945
- * error (Optional[str]): Error message if any
3946
- * warning (Optional[str]): Warning message if any
3947
- * sources (Optional[List[str]]): Source URLs if requested
3948
-
3949
- Raises:
3950
- ValueError: If status check fails
3951
- """
3952
- headers = self._prepare_headers()
3953
- try:
3954
- return await self._async_get_request(
3955
- f'{self.api_url}/v1/extract/{job_id}',
3956
- headers
3957
- )
3958
- except Exception as e:
3959
- raise ValueError(str(e))
3960
-
3961
- async def async_extract(
3962
- self,
3963
- urls: Optional[List[str]] = None,
3964
- *,
3965
- prompt: Optional[str] = None,
3966
- schema: Optional[Any] = None,
3967
- system_prompt: Optional[str] = None,
3968
- allow_external_links: Optional[bool] = False,
3969
- enable_web_search: Optional[bool] = False,
3970
- show_sources: Optional[bool] = False,
3971
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3972
- """
3973
- Initiate an asynchronous extraction job without waiting for completion.
3974
-
3975
- Args:
3976
- urls (Optional[List[str]]): URLs to extract from
3977
- prompt (Optional[str]): Custom extraction prompt
3978
- schema (Optional[Any]): JSON schema/Pydantic model
3979
- system_prompt (Optional[str]): System context
3980
- allow_external_links (Optional[bool]): Follow external links
3981
- enable_web_search (Optional[bool]): Enable web search
3982
- show_sources (Optional[bool]): Include source URLs
3983
- agent (Optional[Dict[str, Any]]): Agent configuration
3984
3985
-
3986
- Returns:
3987
- ExtractResponse[Any] with:
3988
- * success (bool): Whether request succeeded
3989
- * data (Optional[Any]): Extracted data matching schema
3990
- * error (Optional[str]): Error message if any
3991
-
3992
- Raises:
3993
- ValueError: If job initiation fails
3994
- """
3995
- headers = self._prepare_headers()
3996
-
3997
- if not prompt and not schema:
3998
- raise ValueError("Either prompt or schema is required")
3999
-
4000
- if not urls and not prompt:
4001
- raise ValueError("Either urls or prompt is required")
4002
-
4003
- if schema:
4004
- schema = self._ensure_schema_dict(schema)
4005
-
4006
- request_data = {
4007
- 'urls': urls or [],
4008
- 'allowExternalLinks': allow_external_links,
4009
- 'enableWebSearch': enable_web_search,
4010
- 'showSources': show_sources,
4011
- 'schema': schema,
4012
- 'origin': f'python-sdk@{version}'
4013
- }
4014
-
4015
- if prompt:
4016
- request_data['prompt'] = prompt
4017
- if system_prompt:
4018
- request_data['systemPrompt'] = system_prompt
4019
- if agent:
4020
- request_data['agent'] = agent
4021
-
4022
- try:
4023
- return await self._async_post_request(
4024
- f'{self.api_url}/v1/extract',
4025
- request_data,
4026
- headers
4027
- )
4028
- except Exception as e:
4029
- raise ValueError(str(e))
4030
-
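async_extract only submits the job; pair it with get_extract_status. The sketch below tolerates either a dict or a model being returned, since the submit call passes the parsed JSON straight through; placeholders as before.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    started = await app.async_extract(
        ["https://example.com"],
        prompt="Summarise what this company sells.",
    )
    job_id = started["id"] if isinstance(started, dict) else started.id
    await asyncio.sleep(10)
    print(await app.get_extract_status(job_id))

asyncio.run(main())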
4031
- async def generate_llms_text(
4032
- self,
4033
- url: str,
4034
- *,
4035
- max_urls: Optional[int] = None,
4036
- show_full_text: Optional[bool] = None,
4037
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
4038
- """
4039
- Generate LLMs.txt for a given URL and monitor until completion.
4040
-
4041
- Args:
4042
- url (str): Target URL to generate LLMs.txt from
4043
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
4044
- show_full_text (Optional[bool]): Include full text in output (default: False)
4045
- experimental_stream (Optional[bool]): Enable experimental streaming
4046
-
4047
- Returns:
4048
- GenerateLLMsTextStatusResponse containing:
4049
- * success (bool): Whether generation completed successfully
4050
- * status (str): Status of generation (processing/completed/failed)
4051
- * data (Dict[str, str], optional): Generated text with fields:
4052
- - llmstxt (str): Generated LLMs.txt content
4053
- - llmsfulltxt (str, optional): Full version if requested
4054
- * error (str, optional): Error message if generation failed
4055
- * expiresAt (str): When the generated data expires
4056
-
4057
- Raises:
4058
- Exception: If generation fails
4059
- """
4060
- params = {}
4061
- if max_urls is not None:
4062
- params['maxUrls'] = max_urls
4063
- if show_full_text is not None:
4064
- params['showFullText'] = show_full_text
4065
- if experimental_stream is not None:
4066
- params['__experimental_stream'] = experimental_stream
4067
-
4068
- response = await self.async_generate_llms_text(
4069
- url,
4070
- max_urls=max_urls,
4071
- show_full_text=show_full_text,
4072
- # `cache` is not a parameter of this wrapper, so it is not forwarded here
4073
- experimental_stream=experimental_stream
4074
- )
4075
- if not response.get('success') or 'id' not in response:
4076
- return response
4077
-
4078
- job_id = response['id']
4079
- while True:
4080
- status = await self.check_generate_llms_text_status(job_id)
4081
-
4082
- if status['status'] == 'completed':
4083
- return status
4084
- elif status['status'] == 'failed':
4085
- raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
4086
- elif status['status'] != 'processing':
4087
- break
4088
-
4089
- await asyncio.sleep(2)
4090
-
4091
- return GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly')
4092
-
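A sketch of the blocking LLMs.txt generator above; on success the returned payload carries the generated text under data.llmstxt. Placeholders as before.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    result = await app.generate_llms_text(
        "https://example.com",
        max_urls=20,
        show_full_text=False,
    )
    print(result)  # inspect result["data"]["llmstxt"] on success

asyncio.run(main())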
4093
- async def async_generate_llms_text(
4094
- self,
4095
- url: str,
4096
- *,
4097
- max_urls: Optional[int] = None,
4098
- show_full_text: Optional[bool] = None,
4099
- cache: Optional[bool] = None,
4100
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
4101
- """
4102
- Initiate an asynchronous LLMs.txt generation job without waiting for completion.
4103
-
4104
- Args:
4105
- url (str): Target URL to generate LLMs.txt from
4106
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
4107
- show_full_text (Optional[bool]): Include full text in output (default: False)
4108
- cache (Optional[bool]): Whether to use cached content if available (default: True)
4109
- experimental_stream (Optional[bool]): Enable experimental streaming
4110
-
4111
- Returns:
4112
- GenerateLLMsTextResponse containing:
4113
- * success (bool): Whether job started successfully
4114
- * id (str): Unique identifier for the job
4115
- * error (str, optional): Error message if start failed
4116
-
4117
- Raises:
4118
- ValueError: If job initiation fails
4119
- """
4120
- params = {}
4121
- if max_urls is not None:
4122
- params['maxUrls'] = max_urls
4123
- if show_full_text is not None:
4124
- params['showFullText'] = show_full_text
4125
- if experimental_stream is not None:
4126
- params['__experimental_stream'] = experimental_stream
4127
-
4128
- params = GenerateLLMsTextParams(
4129
- maxUrls=max_urls,
4130
- showFullText=show_full_text,
4131
- cache=cache,
4132
- __experimental_stream=experimental_stream
4133
- )
4134
-
4135
- headers = self._prepare_headers()
4136
- json_data = {'url': url, **params.dict(exclude_none=True)}
4137
- json_data['origin'] = f"python-sdk@{version}"
4138
-
4139
- try:
4140
- return await self._async_post_request(
4141
- f'{self.api_url}/v1/llmstxt',
4142
- json_data,
4143
- headers
4144
- )
4145
- except Exception as e:
4146
- raise ValueError(str(e))
4147
-
4148
- async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
4149
- """
4150
- Check the status of an asynchronous LLMs.txt generation job.
4151
-
4152
- Args:
4153
- id (str): The ID of the generation job
4154
-
4155
- Returns:
4156
- GenerateLLMsTextStatusResponse containing:
4157
- * success (bool): Whether generation completed successfully
4158
- * status (str): Status of generation (processing/completed/failed)
4159
- * data (Dict[str, str], optional): Generated text with fields:
4160
- - llmstxt (str): Generated LLMs.txt content
4161
- - llmsfulltxt (str, optional): Full version if requested
4162
- * error (str, optional): Error message if generation failed
4163
- * expiresAt (str): When the generated data expires
4164
-
4165
- Raises:
4166
- ValueError: If status check fails
4167
- """
4168
- headers = self._prepare_headers()
4169
- try:
4170
- return await self._async_get_request(
4171
- f'{self.api_url}/v1/llmstxt/{id}',
4172
- headers
4173
- )
4174
- except Exception as e:
4175
- raise ValueError(str(e))
4176
-
4177
- async def deep_research(
- self,
- query: str,
- *,
- max_depth: Optional[int] = None,
- time_limit: Optional[int] = None,
- max_urls: Optional[int] = None,
- analysis_prompt: Optional[str] = None,
- system_prompt: Optional[str] = None,
- __experimental_stream_steps: Optional[bool] = None,
- on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
- on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
- """
- Initiates a deep research operation on a given query and polls until completion.
-
- Args:
- query (str): Research query or topic to investigate
- max_depth (Optional[int]): Maximum depth of research exploration
- time_limit (Optional[int]): Time limit in seconds for research
- max_urls (Optional[int]): Maximum number of URLs to process
- analysis_prompt (Optional[str]): Custom prompt for analysis
- system_prompt (Optional[str]): Custom system prompt
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
- on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
- on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
-
- Returns:
- DeepResearchStatusResponse containing:
- * success (bool): Whether research completed successfully
- * status (str): Current state (processing/completed/failed)
- * error (Optional[str]): Error message if failed
- * id (str): Unique identifier for the research job
- * data (Any): Research findings and analysis
- * sources (List[Dict]): List of discovered sources
- * activities (List[Dict]): Research progress log
- * summaries (List[str]): Generated research summaries
-
- Raises:
- Exception: If research fails
- """
- research_params = {}
- if max_depth is not None:
- research_params['maxDepth'] = max_depth
- if time_limit is not None:
- research_params['timeLimit'] = time_limit
- if max_urls is not None:
- research_params['maxUrls'] = max_urls
- if analysis_prompt is not None:
- research_params['analysisPrompt'] = analysis_prompt
- if system_prompt is not None:
- research_params['systemPrompt'] = system_prompt
- if __experimental_stream_steps is not None:
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
- research_params = DeepResearchParams(**research_params)
-
- response = await self.async_deep_research(
- query,
- max_depth=max_depth,
- time_limit=time_limit,
- max_urls=max_urls,
- analysis_prompt=analysis_prompt,
- system_prompt=system_prompt
- )
- if not response.get('success') or 'id' not in response:
- return response
-
- job_id = response['id']
- last_activity_count = 0
- last_source_count = 0
-
- while True:
- status = await self.check_deep_research_status(job_id)
-
- if on_activity and 'activities' in status:
- new_activities = status['activities'][last_activity_count:]
- for activity in new_activities:
- on_activity(activity)
- last_activity_count = len(status['activities'])
-
- if on_source and 'sources' in status:
- new_sources = status['sources'][last_source_count:]
- for source in new_sources:
- on_source(source)
- last_source_count = len(status['sources'])
-
- if status['status'] == 'completed':
- return status
- elif status['status'] == 'failed':
- raise Exception(f'Deep research failed. Error: {status.get("error")}')
- elif status['status'] != 'processing':
- break
-
- await asyncio.sleep(2)
-
- return DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')
-
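A hedged sketch of how the polling wrapper above might be used with its optional progress callbacks. The API key, query, and import path are placeholders or assumptions; the callback payloads and the parameters used are taken from the docstring above, and the result is read as a dictionary to match the dict-style access used inside the method itself.

import asyncio
from firecrawl import AsyncFirecrawlApp  # import path assumed

def log_activity(activity):
    # Receives {type, status, message, timestamp, depth}, per the docstring above.
    print(f"[depth {activity.get('depth')}] {activity.get('message')}")

def log_source(source):
    # Receives {url, title, description} for each discovered source.
    print("source:", source.get("url"))

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")
    result = await app.deep_research(
        "What are the latest approaches to LLM context compression?",
        max_depth=3,
        time_limit=120,
        on_activity=log_activity,
        on_source=log_source,
    )
    if result.get("status") == "completed":
        print(result.get("data"))

asyncio.run(main())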
- async def async_deep_research(
- self,
- query: str,
- *,
- max_depth: Optional[int] = None,
- time_limit: Optional[int] = None,
- max_urls: Optional[int] = None,
- analysis_prompt: Optional[str] = None,
- system_prompt: Optional[str] = None,
- __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
- """
- Initiates an asynchronous deep research operation.
-
- Args:
- query (str): Research query or topic to investigate
- max_depth (Optional[int]): Maximum depth of research exploration
- time_limit (Optional[int]): Time limit in seconds for research
- max_urls (Optional[int]): Maximum number of URLs to process
- analysis_prompt (Optional[str]): Custom prompt for analysis
- system_prompt (Optional[str]): Custom system prompt
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
-
- Returns:
- Dict[str, Any]: A response containing:
- * success (bool): Whether the research initiation was successful
- * id (str): The unique identifier for the research job
- * error (str, optional): Error message if initiation failed
-
- Raises:
- ValueError: If the research initiation fails.
- """
- research_params = {}
- if max_depth is not None:
- research_params['maxDepth'] = max_depth
- if time_limit is not None:
- research_params['timeLimit'] = time_limit
- if max_urls is not None:
- research_params['maxUrls'] = max_urls
- if analysis_prompt is not None:
- research_params['analysisPrompt'] = analysis_prompt
- if system_prompt is not None:
- research_params['systemPrompt'] = system_prompt
- if __experimental_stream_steps is not None:
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
- research_params = DeepResearchParams(**research_params)
-
- headers = self._prepare_headers()
-
- json_data = {'query': query, **research_params.dict(exclude_none=True)}
- json_data['origin'] = f"python-sdk@{version}"
-
- try:
- return await self._async_post_request(
- f'{self.api_url}/v1/deep-research',
- json_data,
- headers
- )
- except Exception as e:
- raise ValueError(str(e))
-
- async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
- """
- Check the status of a deep research operation.
-
- Args:
- id (str): The ID of the deep research operation.
-
- Returns:
- DeepResearchStatusResponse containing:
-
- Status:
- * success - Whether research completed successfully
- * status - Current state (processing/completed/failed)
- * error - Error message if failed
-
- Results:
- * id - Unique identifier for the research job
- * data - Research findings and analysis
- * sources - List of discovered sources
- * activities - Research progress log
- * summaries - Generated research summaries
-
- Raises:
- ValueError: If the status check fails.
- """
- headers = self._prepare_headers()
- try:
- return await self._async_get_request(
- f'{self.api_url}/v1/deep-research/{id}',
- headers
- )
- except Exception as e:
- raise ValueError(str(e))
-
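For callers that prefer to manage polling themselves, here is a minimal sketch combining async_deep_research and check_deep_research_status from above. The key and query are placeholders, the import path is assumed, and responses are treated as dictionaries, matching the access pattern used in deep_research.

import asyncio
from firecrawl import AsyncFirecrawlApp  # import path assumed

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")

    # Kick off the job; on success the response carries the job 'id'.
    started = await app.async_deep_research("Open-source vector databases in 2025", max_urls=20)
    if not started.get("success"):
        raise RuntimeError(started.get("error", "failed to start deep research"))

    # Poll until the job leaves the 'processing' state.
    while True:
        status = await app.check_deep_research_status(started["id"])
        if status["status"] != "processing":
            break
        await asyncio.sleep(2)

    print(status["status"], "-", len(status.get("sources", [])), "sources found")

asyncio.run(main())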
- async def search(
- self,
- query: str,
- *,
- limit: Optional[int] = None,
- tbs: Optional[str] = None,
- filter: Optional[str] = None,
- lang: Optional[str] = None,
- country: Optional[str] = None,
- location: Optional[str] = None,
- timeout: Optional[int] = None,
- scrape_options: Optional[ScrapeOptions] = None,
- params: Optional[Union[Dict[str, Any], SearchParams]] = None,
- **kwargs) -> SearchResponse:
- """
- Asynchronously search for content using Firecrawl.
-
- Args:
- query (str): Search query string
- limit (Optional[int]): Max results (default: 5)
- tbs (Optional[str]): Time filter (e.g. "qdr:d")
- filter (Optional[str]): Custom result filter
- lang (Optional[str]): Language code (default: "en")
- country (Optional[str]): Country code (default: "us")
- location (Optional[str]): Geo-targeting
- timeout (Optional[int]): Request timeout in milliseconds
- scrape_options (Optional[ScrapeOptions]): Result scraping configuration
- params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
- **kwargs: Additional keyword arguments for future compatibility
-
- Returns:
- SearchResponse: Response containing:
- * success (bool): Whether request succeeded
- * data (List[FirecrawlDocument]): Search results
- * warning (Optional[str]): Warning message if any
- * error (Optional[str]): Error message if any
-
- Raises:
- Exception: If search fails or response cannot be parsed
- """
- # Build search parameters
- search_params = {}
- if params:
- if isinstance(params, dict):
- search_params.update(params)
- else:
- search_params.update(params.dict(exclude_none=True))
-
- # Add individual parameters
- if limit is not None:
- search_params['limit'] = limit
- if tbs is not None:
- search_params['tbs'] = tbs
- if filter is not None:
- search_params['filter'] = filter
- if lang is not None:
- search_params['lang'] = lang
- if country is not None:
- search_params['country'] = country
- if location is not None:
- search_params['location'] = location
- if timeout is not None:
- search_params['timeout'] = timeout
- if scrape_options is not None:
- search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
-
- # Add any additional kwargs
- search_params.update(kwargs)
-
- # Create final params object
- final_params = SearchParams(query=query, **search_params)
- params_dict = final_params.dict(exclude_none=True)
- params_dict['origin'] = f"python-sdk@{version}"
-
- return await self._async_post_request(
- f"{self.api_url}/v1/search",
- params_dict,
- {"Authorization": f"Bearer {self.api_key}"}
- )
-
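A short, hedged example of calling the asynchronous search method above. The key and query are placeholders, the import path is assumed, and the per-result field names ('url', 'title') are assumptions about the search API payload rather than guarantees from this listing.

import asyncio
from firecrawl import AsyncFirecrawlApp  # import path assumed

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")
    # limit, lang and country map directly onto the parameters documented above.
    results = await app.search("firecrawl python sdk", limit=3, lang="en", country="us")
    for doc in results.get("data", []):
        # Field names below are assumed from the search API response shape.
        print(doc.get("url"), "-", doc.get("title"))

asyncio.run(main())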
- class AsyncCrawlWatcher(CrawlWatcher):
- """
- Async version of CrawlWatcher that properly handles async operations.
- """
- def __init__(self, id: str, app: AsyncFirecrawlApp):
- super().__init__(id, app)
-
- async def connect(self) -> None:
- """
- Establishes async WebSocket connection and starts listening for messages.
- """
- async with websockets.connect(
- self.ws_url,
- additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
- ) as websocket:
- await self._listen(websocket)
-
- async def _listen(self, websocket) -> None:
- """
- Listens for incoming WebSocket messages and handles them asynchronously.
-
- Args:
- websocket: The WebSocket connection object
- """
- async for message in websocket:
- msg = json.loads(message)
- await self._handle_message(msg)
-
- async def _handle_message(self, msg: Dict[str, Any]) -> None:
- """
- Handles incoming WebSocket messages based on their type asynchronously.
-
- Args:
- msg (Dict[str, Any]): The message to handle
- """
- if msg['type'] == 'done':
- self.status = 'completed'
- self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
- elif msg['type'] == 'error':
- self.status = 'failed'
- self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
- elif msg['type'] == 'catchup':
- self.status = msg['data']['status']
- self.data.extend(msg['data'].get('data', []))
- for doc in self.data:
- self.dispatch_event('document', {'data': doc, 'id': self.id})
- elif msg['type'] == 'document':
- self.data.append(msg['data'])
- self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
-
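A hedged usage sketch for the watcher class above. It assumes a crawl job id obtained elsewhere (for example from a crawl starter on AsyncFirecrawlApp that is not part of this section) and that the parent CrawlWatcher class exposes an add_event_listener(event_type, handler) helper that feeds dispatch_event; those names and the import paths are assumptions about code outside this diff.

import asyncio
from firecrawl import AsyncFirecrawlApp, AsyncCrawlWatcher  # import paths assumed

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")
    crawl_id = "existing-crawl-job-id"  # placeholder: obtain this from a crawl start call

    watcher = AsyncCrawlWatcher(crawl_id, app)

    # add_event_listener is assumed to be inherited from CrawlWatcher; the detail dicts
    # below match the payloads dispatched by _handle_message above.
    watcher.add_event_listener("document", lambda detail: print("document received for crawl", detail.get("id")))
    watcher.add_event_listener("done", lambda detail: print("crawl finished:", detail.get("status")))

    # connect() returns once the stream ends with a 'done' or 'error' message.
    await watcher.connect()

asyncio.run(main())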
- async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
- """
- Parse the error payload of a failed async API response and raise an aiohttp.ClientError with a standardized message.
- """
- try:
- error_data = await response.json()
- error_message = error_data.get('error', 'No error message provided.')
- error_details = error_data.get('details', 'No additional error details provided.')
- except Exception:
- raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
-
- # Use the app's method to get the error message
- message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
-
- raise aiohttp.ClientError(message)
-
- async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
- """
- Generate a standardized error message based on HTTP status code for async operations.
-
- Args:
- status_code (int): The HTTP status code from the response
- action (str): Description of the action that was being performed
- error_message (str): The error message from the API response
- error_details (str): Additional error details from the API response
-
- Returns:
- str: A formatted error message
- """
- return self._get_error_message(status_code, action, error_message, error_details)