firecrawl-py 2.15.0__py3-none-any.whl → 2.16.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of firecrawl-py might be problematic. Click here for more details.

@@ -1,4610 +0,0 @@
1
- """
2
- FirecrawlApp Module
3
-
4
- This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
5
- It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
6
- and check the status of these jobs. The module uses requests for HTTP communication
7
- and handles retries for certain HTTP status codes.
8
-
9
- Classes:
10
- - FirecrawlApp: Main class for interacting with the Firecrawl API.
11
- """
12
- import logging
13
- import os
14
- import time
15
- from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
16
- import json
17
- from datetime import datetime
18
- import re
19
- import warnings
20
- import requests
21
- import pydantic
22
- import websockets
23
- import aiohttp
24
- import asyncio
25
- from pydantic import Field
26
-
27
- # Suppress Pydantic warnings about attribute shadowing
28
- warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
29
- warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
30
- warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
31
- warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
32
- warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingOptions\" shadows an attribute in parent \"BaseModel\"")
33
-
34
- def get_version():
35
- try:
36
- from pathlib import Path
37
- package_path = os.path.dirname(__file__)
38
- version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
39
- version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
40
- if version_match:
41
- return version_match.group(1).strip()
42
- except Exception:
43
- print("Failed to get version from __init__.py")
44
- return None
45
-
46
- version = get_version()
47
-
48
- logger : logging.Logger = logging.getLogger("firecrawl")
49
-
50
- T = TypeVar('T')
51
-
52
- # class FirecrawlDocumentMetadata(pydantic.BaseModel):
53
- # """Metadata for a Firecrawl document."""
54
- # title: Optional[str] = None
55
- # description: Optional[str] = None
56
- # language: Optional[str] = None
57
- # keywords: Optional[str] = None
58
- # robots: Optional[str] = None
59
- # ogTitle: Optional[str] = None
60
- # ogDescription: Optional[str] = None
61
- # ogUrl: Optional[str] = None
62
- # ogImage: Optional[str] = None
63
- # ogAudio: Optional[str] = None
64
- # ogDeterminer: Optional[str] = None
65
- # ogLocale: Optional[str] = None
66
- # ogLocaleAlternate: Optional[List[str]] = None
67
- # ogSiteName: Optional[str] = None
68
- # ogVideo: Optional[str] = None
69
- # dctermsCreated: Optional[str] = None
70
- # dcDateCreated: Optional[str] = None
71
- # dcDate: Optional[str] = None
72
- # dctermsType: Optional[str] = None
73
- # dcType: Optional[str] = None
74
- # dctermsAudience: Optional[str] = None
75
- # dctermsSubject: Optional[str] = None
76
- # dcSubject: Optional[str] = None
77
- # dcDescription: Optional[str] = None
78
- # dctermsKeywords: Optional[str] = None
79
- # modifiedTime: Optional[str] = None
80
- # publishedTime: Optional[str] = None
81
- # articleTag: Optional[str] = None
82
- # articleSection: Optional[str] = None
83
- # sourceURL: Optional[str] = None
84
- # statusCode: Optional[int] = None
85
- # error: Optional[str] = None
86
-
87
- class AgentOptions(pydantic.BaseModel):
88
- """Configuration for the agent."""
89
- model: Literal["FIRE-1"] = "FIRE-1"
90
- prompt: Optional[str] = None
91
-
92
- class AgentOptionsExtract(pydantic.BaseModel):
93
- """Configuration for the agent in extract operations."""
94
- model: Literal["FIRE-1"] = "FIRE-1"
95
-
96
- class ActionsResult(pydantic.BaseModel):
97
- """Result of actions performed during scraping."""
98
- screenshots: List[str]
99
- pdfs: List[str]
100
-
101
- class ChangeTrackingData(pydantic.BaseModel):
102
- """
103
- Data for the change tracking format.
104
- """
105
- previousScrapeAt: Optional[str] = None
106
- changeStatus: str # "new" | "same" | "changed" | "removed"
107
- visibility: str # "visible" | "hidden"
108
- diff: Optional[Dict[str, Any]] = None
109
- json: Optional[Any] = None
110
-
111
- class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
112
- """Document retrieved or processed by Firecrawl."""
113
- url: Optional[str] = None
114
- markdown: Optional[str] = None
115
- html: Optional[str] = None
116
- rawHtml: Optional[str] = None
117
- links: Optional[List[str]] = None
118
- extract: Optional[T] = None
119
- json: Optional[T] = None
120
- screenshot: Optional[str] = None
121
- metadata: Optional[Any] = None
122
- actions: Optional[ActionsResult] = None
123
- title: Optional[str] = None # v1 search only
124
- description: Optional[str] = None # v1 search only
125
- changeTracking: Optional[ChangeTrackingData] = None
126
-
127
- class LocationConfig(pydantic.BaseModel):
128
- """Location configuration for scraping."""
129
- country: Optional[str] = None
130
- languages: Optional[List[str]] = None
131
-
132
- class WebhookConfig(pydantic.BaseModel):
133
- """Configuration for webhooks."""
134
- url: str
135
- headers: Optional[Dict[str, str]] = None
136
- metadata: Optional[Dict[str, str]] = None
137
- events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
138
-
139
- class ChangeTrackingOptions(pydantic.BaseModel):
140
- """Configuration for change tracking."""
141
- modes: Optional[List[Literal["git-diff", "json"]]] = None
142
- schema: Optional[Any] = None
143
- prompt: Optional[str] = None
144
- tag: Optional[str] = None
145
-
146
- class ScrapeOptions(pydantic.BaseModel):
147
- """Parameters for scraping operations."""
148
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
149
- headers: Optional[Dict[str, str]] = None
150
- includeTags: Optional[List[str]] = None
151
- excludeTags: Optional[List[str]] = None
152
- onlyMainContent: Optional[bool] = None
153
- waitFor: Optional[int] = None
154
- timeout: Optional[int] = None
155
- location: Optional[LocationConfig] = None
156
- mobile: Optional[bool] = None
157
- skipTlsVerification: Optional[bool] = None
158
- removeBase64Images: Optional[bool] = None
159
- blockAds: Optional[bool] = None
160
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None
161
- changeTrackingOptions: Optional[ChangeTrackingOptions] = None
162
- maxAge: Optional[int] = None
163
- storeInCache: Optional[bool] = None
164
- parsePDF: Optional[bool] = None
165
-
166
- class WaitAction(pydantic.BaseModel):
167
- """Wait action to perform during scraping."""
168
- type: Literal["wait"]
169
- milliseconds: Optional[int] = None
170
- selector: Optional[str] = None
171
-
172
- class ScreenshotAction(pydantic.BaseModel):
173
- """Screenshot action to perform during scraping."""
174
- type: Literal["screenshot"]
175
- fullPage: Optional[bool] = None
176
- quality: Optional[int] = None
177
-
178
- class ClickAction(pydantic.BaseModel):
179
- """Click action to perform during scraping."""
180
- type: Literal["click"]
181
- selector: str
182
-
183
- class WriteAction(pydantic.BaseModel):
184
- """Write action to perform during scraping."""
185
- type: Literal["write"]
186
- text: str
187
-
188
- class PressAction(pydantic.BaseModel):
189
- """Press action to perform during scraping."""
190
- type: Literal["press"]
191
- key: str
192
-
193
- class ScrollAction(pydantic.BaseModel):
194
- """Scroll action to perform during scraping."""
195
- type: Literal["scroll"]
196
- direction: Literal["up", "down"]
197
- selector: Optional[str] = None
198
-
199
- class ScrapeAction(pydantic.BaseModel):
200
- """Scrape action to perform during scraping."""
201
- type: Literal["scrape"]
202
-
203
- class ExecuteJavascriptAction(pydantic.BaseModel):
204
- """Execute javascript action to perform during scraping."""
205
- type: Literal["executeJavascript"]
206
- script: str
207
-
208
- class PDFAction(pydantic.BaseModel):
209
- """PDF action to perform during scraping."""
210
- type: Literal["pdf"]
211
- format: Optional[Literal["A0", "A1", "A2", "A3", "A4", "A5", "A6", "Letter", "Legal", "Tabloid", "Ledger"]] = None
212
- landscape: Optional[bool] = None
213
- scale: Optional[float] = None
214
-
215
- class ExtractAgent(pydantic.BaseModel):
216
- """Configuration for the agent in extract operations."""
217
- model: Literal["FIRE-1"] = "FIRE-1"
218
-
219
- class JsonConfig(pydantic.BaseModel):
220
- """Configuration for extraction."""
221
- prompt: Optional[str] = None
222
- schema: Optional[Any] = None
223
- systemPrompt: Optional[str] = None
224
- agent: Optional[ExtractAgent] = None
225
-
226
- class ScrapeParams(ScrapeOptions):
227
- """Parameters for scraping operations."""
228
- extract: Optional[JsonConfig] = None
229
- jsonOptions: Optional[JsonConfig] = None
230
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None
231
- agent: Optional[AgentOptions] = None
232
- webhook: Optional[WebhookConfig] = None
233
-
234
- class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
235
- """Response from scraping operations."""
236
- success: bool = True
237
- warning: Optional[str] = None
238
- error: Optional[str] = None
239
-
240
- class BatchScrapeResponse(pydantic.BaseModel):
241
- """Response from batch scrape operations."""
242
- id: Optional[str] = None
243
- url: Optional[str] = None
244
- success: bool = True
245
- error: Optional[str] = None
246
- invalidURLs: Optional[List[str]] = None
247
-
248
- class BatchScrapeStatusResponse(pydantic.BaseModel):
249
- """Response from batch scrape status checks."""
250
- success: bool = True
251
- status: Literal["scraping", "completed", "failed", "cancelled"]
252
- completed: int
253
- total: int
254
- creditsUsed: int
255
- expiresAt: datetime
256
- next: Optional[str] = None
257
- data: List[FirecrawlDocument]
258
-
259
- class CrawlParams(pydantic.BaseModel):
260
- """Parameters for crawling operations."""
261
- includePaths: Optional[List[str]] = None
262
- excludePaths: Optional[List[str]] = None
263
- maxDepth: Optional[int] = None
264
- maxDiscoveryDepth: Optional[int] = None
265
- limit: Optional[int] = None
266
- allowBackwardLinks: Optional[bool] = None
267
- allowExternalLinks: Optional[bool] = None
268
- ignoreSitemap: Optional[bool] = None
269
- scrapeOptions: Optional[ScrapeOptions] = None
270
- webhook: Optional[Union[str, WebhookConfig]] = None
271
- deduplicateSimilarURLs: Optional[bool] = None
272
- ignoreQueryParameters: Optional[bool] = None
273
- regexOnFullURL: Optional[bool] = None
274
- delay: Optional[int] = None # Delay in seconds between scrapes
275
- maxConcurrency: Optional[int] = None
276
- allowSubdomains: Optional[bool] = None
277
-
278
- class CrawlResponse(pydantic.BaseModel):
279
- """Response from crawling operations."""
280
- id: Optional[str] = None
281
- url: Optional[str] = None
282
- success: bool = True
283
- error: Optional[str] = None
284
-
285
- class CrawlStatusResponse(pydantic.BaseModel):
286
- """Response from crawl status checks."""
287
- success: bool = True
288
- status: Literal["scraping", "completed", "failed", "cancelled"]
289
- completed: int
290
- total: int
291
- creditsUsed: int
292
- expiresAt: datetime
293
- next: Optional[str] = None
294
- data: List[FirecrawlDocument]
295
-
296
- class CrawlErrorsResponse(pydantic.BaseModel):
297
- """Response from crawl/batch scrape error monitoring."""
298
- errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str}
299
- robotsBlocked: List[str]
300
-
301
- class MapParams(pydantic.BaseModel):
302
- """Parameters for mapping operations."""
303
- search: Optional[str] = None
304
- ignoreSitemap: Optional[bool] = None
305
- includeSubdomains: Optional[bool] = None
306
- sitemapOnly: Optional[bool] = None
307
- limit: Optional[int] = None
308
- timeout: Optional[int] = None
309
- useIndex: Optional[bool] = None
310
-
311
- class MapResponse(pydantic.BaseModel):
312
- """Response from mapping operations."""
313
- success: bool = True
314
- links: Optional[List[str]] = None
315
- error: Optional[str] = None
316
-
317
- class ExtractParams(pydantic.BaseModel):
318
- """Parameters for extracting information from URLs."""
319
- prompt: Optional[str] = None
320
- schema: Optional[Any] = None
321
- systemPrompt: Optional[str] = None
322
- allowExternalLinks: Optional[bool] = None
323
- enableWebSearch: Optional[bool] = None
324
- includeSubdomains: Optional[bool] = None
325
- origin: Optional[str] = None
326
- showSources: Optional[bool] = None
327
- scrapeOptions: Optional[ScrapeOptions] = None
328
-
329
- class ExtractResponse(pydantic.BaseModel, Generic[T]):
330
- """Response from extract operations."""
331
- id: Optional[str] = None
332
- status: Optional[Literal["processing", "completed", "failed"]] = None
333
- expiresAt: Optional[datetime] = None
334
- success: bool = True
335
- data: Optional[T] = None
336
- error: Optional[str] = None
337
- warning: Optional[str] = None
338
- sources: Optional[Dict[Any, Any]] = None
339
-
340
- class SearchParams(pydantic.BaseModel):
341
- query: str
342
- limit: Optional[int] = 5
343
- tbs: Optional[str] = None
344
- filter: Optional[str] = None
345
- lang: Optional[str] = "en"
346
- country: Optional[str] = "us"
347
- location: Optional[str] = None
348
- origin: Optional[str] = "api"
349
- timeout: Optional[int] = 60000
350
- scrapeOptions: Optional[ScrapeOptions] = None
351
-
352
- class SearchResponse(pydantic.BaseModel):
353
- """Response from search operations."""
354
- success: bool = True
355
- data: List[FirecrawlDocument]
356
- warning: Optional[str] = None
357
- error: Optional[str] = None
358
-
359
- class GenerateLLMsTextParams(pydantic.BaseModel):
360
- """
361
- Parameters for the LLMs.txt generation operation.
362
- """
363
- maxUrls: Optional[int] = 10
364
- showFullText: Optional[bool] = False
365
- cache: Optional[bool] = True
366
- __experimental_stream: Optional[bool] = None
367
-
368
- class DeepResearchParams(pydantic.BaseModel):
369
- """
370
- Parameters for the deep research operation.
371
- """
372
- maxDepth: Optional[int] = 7
373
- timeLimit: Optional[int] = 270
374
- maxUrls: Optional[int] = 20
375
- analysisPrompt: Optional[str] = None
376
- systemPrompt: Optional[str] = None
377
- __experimental_streamSteps: Optional[bool] = None
378
-
379
- class DeepResearchResponse(pydantic.BaseModel):
380
- """
381
- Response from the deep research operation.
382
- """
383
- success: bool
384
- id: str
385
- error: Optional[str] = None
386
-
387
- class DeepResearchStatusResponse(pydantic.BaseModel):
388
- """
389
- Status response from the deep research operation.
390
- """
391
- success: bool
392
- data: Optional[Dict[str, Any]] = None
393
- status: str
394
- error: Optional[str] = None
395
- expiresAt: str
396
- currentDepth: int
397
- maxDepth: int
398
- activities: List[Dict[str, Any]]
399
- sources: List[Dict[str, Any]]
400
- summaries: List[str]
401
-
402
- class GenerateLLMsTextResponse(pydantic.BaseModel):
403
- """Response from LLMs.txt generation operations."""
404
- success: bool = True
405
- id: str
406
- error: Optional[str] = None
407
-
408
- class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
409
- llmstxt: str
410
- llmsfulltxt: Optional[str] = None
411
-
412
- class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
413
- """Status response from LLMs.txt generation operations."""
414
- success: bool = True
415
- data: Optional[GenerateLLMsTextStatusResponseData] = None
416
- status: Literal["processing", "completed", "failed"]
417
- error: Optional[str] = None
418
- expiresAt: str
419
-
420
- class SearchResponse(pydantic.BaseModel):
421
- """
422
- Response from the search operation.
423
- """
424
- success: bool
425
- data: List[Dict[str, Any]]
426
- warning: Optional[str] = None
427
- error: Optional[str] = None
428
-
429
- class ExtractParams(pydantic.BaseModel):
430
- """
431
- Parameters for the extract operation.
432
- """
433
- prompt: Optional[str] = None
434
- schema: Optional[Any] = pydantic.Field(None, alias='schema')
435
- system_prompt: Optional[str] = None
436
- allow_external_links: Optional[bool] = False
437
- enable_web_search: Optional[bool] = False
438
- # Just for backwards compatibility
439
- enableWebSearch: Optional[bool] = False
440
- show_sources: Optional[bool] = False
441
- agent: Optional[Dict[str, Any]] = None
442
-
443
- class FirecrawlApp:
444
- def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
445
- """
446
- Initialize the FirecrawlApp instance with API key, API URL.
447
-
448
- Args:
449
- api_key (Optional[str]): API key for authenticating with the Firecrawl API.
450
- api_url (Optional[str]): Base URL for the Firecrawl API.
451
- """
452
- self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
453
- self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
454
-
455
- # Only require API key when using cloud service
456
- if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
457
- logger.warning("No API key provided for cloud service")
458
- raise ValueError('No API key provided')
459
-
460
- logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
461
-
462
- def scrape_url(
463
- self,
464
- url: str,
465
- *,
466
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
467
- include_tags: Optional[List[str]] = None,
468
- exclude_tags: Optional[List[str]] = None,
469
- only_main_content: Optional[bool] = None,
470
- wait_for: Optional[int] = None,
471
- timeout: Optional[int] = None,
472
- location: Optional[LocationConfig] = None,
473
- mobile: Optional[bool] = None,
474
- skip_tls_verification: Optional[bool] = None,
475
- remove_base64_images: Optional[bool] = None,
476
- block_ads: Optional[bool] = None,
477
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
478
- parse_pdf: Optional[bool] = None,
479
- extract: Optional[JsonConfig] = None,
480
- json_options: Optional[JsonConfig] = None,
481
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
482
- change_tracking_options: Optional[ChangeTrackingOptions] = None,
483
- max_age: Optional[int] = None,
484
- store_in_cache: Optional[bool] = None,
485
- zero_data_retention: Optional[bool] = None,
486
- **kwargs) -> ScrapeResponse[Any]:
487
- """
488
- Scrape and extract content from a URL.
489
-
490
- Args:
491
- url (str): Target URL to scrape
492
- formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
493
- include_tags (Optional[List[str]]): HTML tags to include
494
- exclude_tags (Optional[List[str]]): HTML tags to exclude
495
- only_main_content (Optional[bool]): Extract main content only
496
- wait_for (Optional[int]): Wait for a specific element to appear
497
- timeout (Optional[int]): Request timeout (ms)
498
- location (Optional[LocationConfig]): Location configuration
499
- mobile (Optional[bool]): Use mobile user agent
500
- skip_tls_verification (Optional[bool]): Skip TLS verification
501
- remove_base64_images (Optional[bool]): Remove base64 images
502
- block_ads (Optional[bool]): Block ads
503
- proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
504
- extract (Optional[JsonConfig]): Content extraction settings
505
- json_options (Optional[JsonConfig]): JSON extraction settings
506
- actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
507
- change_tracking_options (Optional[ChangeTrackingOptions]): Change tracking settings
508
- zero_data_retention (Optional[bool]): Whether to delete data after scrape is done
509
-
510
-
511
- Returns:
512
- ScrapeResponse with:
513
- * Requested content formats
514
- * Page metadata
515
- * Extraction results
516
- * Success/error status
517
-
518
- Raises:
519
- Exception: If scraping fails
520
- """
521
- headers = self._prepare_headers()
522
-
523
- # Build scrape parameters
524
- scrape_params = {
525
- 'url': url,
526
- 'origin': f"python-sdk@{version}"
527
- }
528
-
529
- # Add optional parameters if provided
530
- if formats:
531
- scrape_params['formats'] = formats
532
- if include_tags:
533
- scrape_params['includeTags'] = include_tags
534
- if exclude_tags:
535
- scrape_params['excludeTags'] = exclude_tags
536
- if only_main_content is not None:
537
- scrape_params['onlyMainContent'] = only_main_content
538
- if wait_for:
539
- scrape_params['waitFor'] = wait_for
540
- if timeout:
541
- scrape_params['timeout'] = timeout
542
- if location:
543
- scrape_params['location'] = location.dict(exclude_none=True)
544
- if mobile is not None:
545
- scrape_params['mobile'] = mobile
546
- if skip_tls_verification is not None:
547
- scrape_params['skipTlsVerification'] = skip_tls_verification
548
- if remove_base64_images is not None:
549
- scrape_params['removeBase64Images'] = remove_base64_images
550
- if block_ads is not None:
551
- scrape_params['blockAds'] = block_ads
552
- if proxy:
553
- scrape_params['proxy'] = proxy
554
- if parse_pdf is not None:
555
- scrape_params['parsePDF'] = parse_pdf
556
- if extract is not None:
557
- extract = self._ensure_schema_dict(extract)
558
- if isinstance(extract, dict) and "schema" in extract:
559
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
560
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
561
- if json_options is not None:
562
- json_options = self._ensure_schema_dict(json_options)
563
- if isinstance(json_options, dict) and "schema" in json_options:
564
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
565
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
566
- if actions:
567
- scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
568
- if change_tracking_options:
569
- scrape_params['changeTrackingOptions'] = change_tracking_options if isinstance(change_tracking_options, dict) else change_tracking_options.dict(exclude_none=True)
570
- if max_age is not None:
571
- scrape_params['maxAge'] = max_age
572
- if store_in_cache is not None:
573
- scrape_params['storeInCache'] = store_in_cache
574
- if zero_data_retention is not None:
575
- scrape_params['zeroDataRetention'] = zero_data_retention
576
-
577
- scrape_params.update(kwargs)
578
-
579
- if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
580
- scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
581
- if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
582
- scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
583
-
584
- # Make request
585
- response = requests.post(
586
- f'{self.api_url}/v1/scrape',
587
- headers=headers,
588
- json=scrape_params,
589
- timeout=(timeout + 5000 if timeout else None)
590
- )
591
-
592
- if response.status_code == 200:
593
- try:
594
- response_json = response.json()
595
- if response_json.get('success') and 'data' in response_json:
596
- return ScrapeResponse(**response_json['data'])
597
- elif "error" in response_json:
598
- raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
599
- else:
600
- raise Exception(f'Failed to scrape URL. Error: {response_json}')
601
- except ValueError:
602
- raise Exception('Failed to parse Firecrawl response as JSON.')
603
- else:
604
- self._handle_error(response, 'scrape URL')
605
-
606
- def search(
607
- self,
608
- query: str,
609
- *,
610
- limit: Optional[int] = None,
611
- tbs: Optional[str] = None,
612
- filter: Optional[str] = None,
613
- lang: Optional[str] = None,
614
- country: Optional[str] = None,
615
- location: Optional[str] = None,
616
- timeout: Optional[int] = None,
617
- scrape_options: Optional[ScrapeOptions] = None,
618
- **kwargs) -> SearchResponse:
619
- """
620
- Search for content using Firecrawl.
621
-
622
- Args:
623
- query (str): Search query string
624
- limit (Optional[int]): Max results (default: 5)
625
- tbs (Optional[str]): Time filter (e.g. "qdr:d")
626
- filter (Optional[str]): Custom result filter
627
- lang (Optional[str]): Language code (default: "en")
628
- country (Optional[str]): Country code (default: "us")
629
- location (Optional[str]): Geo-targeting
630
- timeout (Optional[int]): Request timeout in milliseconds
631
- scrape_options (Optional[ScrapeOptions]): Result scraping configuration
632
- **kwargs: Additional keyword arguments for future compatibility
633
-
634
- Returns:
635
- SearchResponse: Response containing:
636
- * success (bool): Whether request succeeded
637
- * data (List[FirecrawlDocument]): Search results
638
- * warning (Optional[str]): Warning message if any
639
- * error (Optional[str]): Error message if any
640
-
641
- Raises:
642
- Exception: If search fails or response cannot be parsed
643
- """
644
- # Validate any additional kwargs
645
- self._validate_kwargs(kwargs, "search")
646
-
647
- # Build search parameters
648
- search_params = {}
649
-
650
- # Add individual parameters
651
- if limit is not None:
652
- search_params['limit'] = limit
653
- if tbs is not None:
654
- search_params['tbs'] = tbs
655
- if filter is not None:
656
- search_params['filter'] = filter
657
- if lang is not None:
658
- search_params['lang'] = lang
659
- if country is not None:
660
- search_params['country'] = country
661
- if location is not None:
662
- search_params['location'] = location
663
- if timeout is not None:
664
- search_params['timeout'] = timeout
665
- if scrape_options is not None:
666
- search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
667
-
668
- # Add any additional kwargs
669
- search_params.update(kwargs)
670
- _integration = search_params.get('integration')
671
-
672
- # Create final params object
673
- final_params = SearchParams(query=query, **search_params)
674
- params_dict = final_params.dict(exclude_none=True)
675
- params_dict['origin'] = f"python-sdk@{version}"
676
-
677
- if _integration:
678
- params_dict['integration'] = _integration
679
-
680
- # Make request
681
- response = requests.post(
682
- f"{self.api_url}/v1/search",
683
- headers={"Authorization": f"Bearer {self.api_key}"},
684
- json=params_dict
685
- )
686
-
687
- if response.status_code == 200:
688
- try:
689
- response_json = response.json()
690
- if response_json.get('success') and 'data' in response_json:
691
- return SearchResponse(**response_json)
692
- elif "error" in response_json:
693
- raise Exception(f'Search failed. Error: {response_json["error"]}')
694
- else:
695
- raise Exception(f'Search failed. Error: {response_json}')
696
- except ValueError:
697
- raise Exception('Failed to parse Firecrawl response as JSON.')
698
- else:
699
- self._handle_error(response, 'search')
700
-
701
- def crawl_url(
702
- self,
703
- url: str,
704
- *,
705
- include_paths: Optional[List[str]] = None,
706
- exclude_paths: Optional[List[str]] = None,
707
- max_depth: Optional[int] = None,
708
- max_discovery_depth: Optional[int] = None,
709
- limit: Optional[int] = None,
710
- allow_backward_links: Optional[bool] = None,
711
- crawl_entire_domain: Optional[bool] = None,
712
- allow_external_links: Optional[bool] = None,
713
- ignore_sitemap: Optional[bool] = None,
714
- scrape_options: Optional[ScrapeOptions] = None,
715
- webhook: Optional[Union[str, WebhookConfig]] = None,
716
- deduplicate_similar_urls: Optional[bool] = None,
717
- ignore_query_parameters: Optional[bool] = None,
718
- regex_on_full_url: Optional[bool] = None,
719
- delay: Optional[int] = None,
720
- allow_subdomains: Optional[bool] = None,
721
- max_concurrency: Optional[int] = None,
722
- zero_data_retention: Optional[bool] = None,
723
- poll_interval: Optional[int] = 2,
724
- idempotency_key: Optional[str] = None,
725
- **kwargs
726
- ) -> CrawlStatusResponse:
727
- """
728
- Crawl a website starting from a URL.
729
-
730
- Args:
731
- url (str): Target URL to start crawling from
732
- include_paths (Optional[List[str]]): Patterns of URLs to include
733
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
734
- max_depth (Optional[int]): Maximum crawl depth
735
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
736
- limit (Optional[int]): Maximum pages to crawl
737
- allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
738
- crawl_entire_domain (Optional[bool]): Follow parent directory links
739
- allow_external_links (Optional[bool]): Follow external domain links
740
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
741
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
742
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
743
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
744
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
745
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
746
- delay (Optional[int]): Delay in seconds between scrapes
747
- allow_subdomains (Optional[bool]): Follow subdomains
748
- max_concurrency (Optional[int]): Maximum number of concurrent scrapes
749
- zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
750
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
751
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
752
- **kwargs: Additional parameters to pass to the API
753
-
754
- Returns:
755
- CrawlStatusResponse with:
756
- * Crawling status and progress
757
- * Crawled page contents
758
- * Success/error information
759
-
760
- Raises:
761
- Exception: If crawl fails
762
- """
763
- # Validate any additional kwargs
764
- self._validate_kwargs(kwargs, "crawl_url")
765
-
766
- crawl_params = {}
767
-
768
- # Add individual parameters
769
- if include_paths is not None:
770
- crawl_params['includePaths'] = include_paths
771
- if exclude_paths is not None:
772
- crawl_params['excludePaths'] = exclude_paths
773
- if max_depth is not None:
774
- crawl_params['maxDepth'] = max_depth
775
- if max_discovery_depth is not None:
776
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
777
- if limit is not None:
778
- crawl_params['limit'] = limit
779
- if crawl_entire_domain is not None:
780
- crawl_params['crawlEntireDomain'] = crawl_entire_domain
781
- elif allow_backward_links is not None:
782
- crawl_params['allowBackwardLinks'] = allow_backward_links
783
- if allow_external_links is not None:
784
- crawl_params['allowExternalLinks'] = allow_external_links
785
- if ignore_sitemap is not None:
786
- crawl_params['ignoreSitemap'] = ignore_sitemap
787
- if scrape_options is not None:
788
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
789
- if webhook is not None:
790
- crawl_params['webhook'] = webhook
791
- if deduplicate_similar_urls is not None:
792
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
793
- if ignore_query_parameters is not None:
794
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
795
- if regex_on_full_url is not None:
796
- crawl_params['regexOnFullURL'] = regex_on_full_url
797
- if delay is not None:
798
- crawl_params['delay'] = delay
799
- if allow_subdomains is not None:
800
- crawl_params['allowSubdomains'] = allow_subdomains
801
- if max_concurrency is not None:
802
- crawl_params['maxConcurrency'] = max_concurrency
803
- if zero_data_retention is not None:
804
- crawl_params['zeroDataRetention'] = zero_data_retention
805
- # Add any additional kwargs
806
- crawl_params.update(kwargs)
807
- _integration = crawl_params.get('integration')
808
-
809
- # Create final params object
810
- final_params = CrawlParams(**crawl_params)
811
- params_dict = final_params.dict(exclude_none=True)
812
- params_dict['url'] = url
813
- params_dict['origin'] = f"python-sdk@{version}"
814
-
815
- if _integration:
816
- params_dict['integration'] = _integration
817
-
818
- # Make request
819
- headers = self._prepare_headers(idempotency_key)
820
- response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
821
-
822
- if response.status_code == 200:
823
- try:
824
- id = response.json().get('id')
825
- except:
826
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
827
- return self._monitor_job_status(id, headers, poll_interval)
828
- else:
829
- self._handle_error(response, 'start crawl job')
830
-
831
- def async_crawl_url(
832
- self,
833
- url: str,
834
- *,
835
- include_paths: Optional[List[str]] = None,
836
- exclude_paths: Optional[List[str]] = None,
837
- max_depth: Optional[int] = None,
838
- max_discovery_depth: Optional[int] = None,
839
- limit: Optional[int] = None,
840
- allow_backward_links: Optional[bool] = None,
841
- crawl_entire_domain: Optional[bool] = None,
842
- allow_external_links: Optional[bool] = None,
843
- ignore_sitemap: Optional[bool] = None,
844
- scrape_options: Optional[ScrapeOptions] = None,
845
- webhook: Optional[Union[str, WebhookConfig]] = None,
846
- deduplicate_similar_urls: Optional[bool] = None,
847
- ignore_query_parameters: Optional[bool] = None,
848
- regex_on_full_url: Optional[bool] = None,
849
- delay: Optional[int] = None,
850
- allow_subdomains: Optional[bool] = None,
851
- max_concurrency: Optional[int] = None,
852
- zero_data_retention: Optional[bool] = None,
853
- idempotency_key: Optional[str] = None,
854
- **kwargs
855
- ) -> CrawlResponse:
856
- """
857
- Start an asynchronous crawl job.
858
-
859
- Args:
860
- url (str): Target URL to start crawling from
861
- include_paths (Optional[List[str]]): Patterns of URLs to include
862
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
863
- max_depth (Optional[int]): Maximum crawl depth
864
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
865
- limit (Optional[int]): Maximum pages to crawl
866
- allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
867
- crawl_entire_domain (Optional[bool]): Follow parent directory links
868
- allow_external_links (Optional[bool]): Follow external domain links
869
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
870
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
871
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
872
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
873
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
874
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
875
- delay (Optional[int]): Delay in seconds between scrapes
876
- allow_subdomains (Optional[bool]): Follow subdomains
877
- max_concurrency (Optional[int]): Maximum number of concurrent scrapes
878
- zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
879
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
880
- **kwargs: Additional parameters to pass to the API
881
-
882
- Returns:
883
- CrawlResponse with:
884
- * success - Whether crawl started successfully
885
- * id - Unique identifier for the crawl job
886
- * url - Status check URL for the crawl
887
- * error - Error message if start failed
888
-
889
- Raises:
890
- Exception: If crawl initiation fails
891
- """
892
- # Validate any additional kwargs
893
- self._validate_kwargs(kwargs, "async_crawl_url")
894
-
895
- crawl_params = {}
896
-
897
- # Add individual parameters
898
- if include_paths is not None:
899
- crawl_params['includePaths'] = include_paths
900
- if exclude_paths is not None:
901
- crawl_params['excludePaths'] = exclude_paths
902
- if max_depth is not None:
903
- crawl_params['maxDepth'] = max_depth
904
- if max_discovery_depth is not None:
905
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
906
- if limit is not None:
907
- crawl_params['limit'] = limit
908
- if crawl_entire_domain is not None:
909
- crawl_params['crawlEntireDomain'] = crawl_entire_domain
910
- elif allow_backward_links is not None:
911
- crawl_params['allowBackwardLinks'] = allow_backward_links
912
- if allow_external_links is not None:
913
- crawl_params['allowExternalLinks'] = allow_external_links
914
- if ignore_sitemap is not None:
915
- crawl_params['ignoreSitemap'] = ignore_sitemap
916
- if scrape_options is not None:
917
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
918
- if webhook is not None:
919
- crawl_params['webhook'] = webhook
920
- if deduplicate_similar_urls is not None:
921
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
922
- if ignore_query_parameters is not None:
923
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
924
- if regex_on_full_url is not None:
925
- crawl_params['regexOnFullURL'] = regex_on_full_url
926
- if delay is not None:
927
- crawl_params['delay'] = delay
928
- if allow_subdomains is not None:
929
- crawl_params['allowSubdomains'] = allow_subdomains
930
- if max_concurrency is not None:
931
- crawl_params['maxConcurrency'] = max_concurrency
932
- if zero_data_retention is not None:
933
- crawl_params['zeroDataRetention'] = zero_data_retention
934
- # Add any additional kwargs
935
- crawl_params.update(kwargs)
936
-
937
- # Create final params object
938
- final_params = CrawlParams(**crawl_params)
939
- params_dict = final_params.dict(exclude_none=True)
940
- params_dict['url'] = url
941
- params_dict['origin'] = f"python-sdk@{version}"
942
-
943
- # Make request
944
- headers = self._prepare_headers(idempotency_key)
945
- response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
946
-
947
- if response.status_code == 200:
948
- try:
949
- return CrawlResponse(**response.json())
950
- except:
951
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
952
- else:
953
- self._handle_error(response, 'start crawl job')
954
-
955
- def check_crawl_status(self, id: str) -> CrawlStatusResponse:
956
- """
957
- Check the status and results of a crawl job.
958
-
959
- Args:
960
- id: Unique identifier for the crawl job
961
-
962
- Returns:
963
- CrawlStatusResponse containing:
964
-
965
- Status Information:
966
- * status - Current state (scraping/completed/failed/cancelled)
967
- * completed - Number of pages crawled
968
- * total - Total pages to crawl
969
- * creditsUsed - API credits consumed
970
- * expiresAt - Data expiration timestamp
971
-
972
- Results:
973
- * data - List of crawled documents
974
- * next - URL for next page of results (if paginated)
975
- * success - Whether status check succeeded
976
- * error - Error message if failed
977
-
978
- Raises:
979
- Exception: If status check fails
980
- """
981
- endpoint = f'/v1/crawl/{id}'
982
-
983
- headers = self._prepare_headers()
984
- response = self._get_request(f'{self.api_url}{endpoint}', headers)
985
- if response.status_code == 200:
986
- try:
987
- status_data = response.json()
988
- except:
989
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
990
- if status_data['status'] == 'completed':
991
- if 'data' in status_data:
992
- data = status_data['data']
993
- while 'next' in status_data:
994
- if len(status_data['data']) == 0:
995
- break
996
- next_url = status_data.get('next')
997
- if not next_url:
998
- logger.warning("Expected 'next' URL is missing.")
999
- break
1000
- try:
1001
- status_response = self._get_request(next_url, headers)
1002
- if status_response.status_code != 200:
1003
- logger.error(f"Failed to fetch next page: {status_response.status_code}")
1004
- break
1005
- try:
1006
- next_data = status_response.json()
1007
- except:
1008
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1009
- data.extend(next_data.get('data', []))
1010
- status_data = next_data
1011
- except Exception as e:
1012
- logger.error(f"Error during pagination request: {e}")
1013
- break
1014
- status_data['data'] = data
1015
-
1016
- response = {
1017
- 'status': status_data.get('status'),
1018
- 'total': status_data.get('total'),
1019
- 'completed': status_data.get('completed'),
1020
- 'creditsUsed': status_data.get('creditsUsed'),
1021
- 'expiresAt': status_data.get('expiresAt'),
1022
- 'data': status_data.get('data')
1023
- }
1024
-
1025
- if 'error' in status_data:
1026
- response['error'] = status_data['error']
1027
-
1028
- if 'next' in status_data:
1029
- response['next'] = status_data['next']
1030
-
1031
- return CrawlStatusResponse(
1032
- success=False if 'error' in status_data else True,
1033
- **response
1034
- )
1035
- else:
1036
- self._handle_error(response, 'check crawl status')
1037
-
1038
- def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
1039
- """
1040
- Returns information about crawl errors.
1041
-
1042
- Args:
1043
- id (str): The ID of the crawl job
1044
-
1045
- Returns:
1046
- CrawlErrorsResponse containing:
1047
- * errors (List[Dict[str, str]]): List of errors with fields:
1048
- - id (str): Error ID
1049
- - timestamp (str): When the error occurred
1050
- - url (str): URL that caused the error
1051
- - error (str): Error message
1052
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1053
-
1054
- Raises:
1055
- Exception: If error check fails
1056
- """
1057
- headers = self._prepare_headers()
1058
- response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
1059
- if response.status_code == 200:
1060
- try:
1061
- return CrawlErrorsResponse(**response.json())
1062
- except:
1063
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1064
- else:
1065
- self._handle_error(response, "check crawl errors")
1066
-
1067
- def cancel_crawl(self, id: str) -> Dict[str, Any]:
1068
- """
1069
- Cancel an asynchronous crawl job.
1070
-
1071
- Args:
1072
- id (str): The ID of the crawl job to cancel
1073
-
1074
- Returns:
1075
- Dict[str, Any] containing:
1076
- * success (bool): Whether cancellation was successful
1077
- * error (str, optional): Error message if cancellation failed
1078
-
1079
- Raises:
1080
- Exception: If cancellation fails
1081
- """
1082
- headers = self._prepare_headers()
1083
- response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
1084
- if response.status_code == 200:
1085
- try:
1086
- return response.json()
1087
- except:
1088
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1089
- else:
1090
- self._handle_error(response, "cancel crawl job")
1091
-
1092
- def crawl_url_and_watch(
1093
- self,
1094
- url: str,
1095
- *,
1096
- include_paths: Optional[List[str]] = None,
1097
- exclude_paths: Optional[List[str]] = None,
1098
- max_depth: Optional[int] = None,
1099
- max_discovery_depth: Optional[int] = None,
1100
- limit: Optional[int] = None,
1101
- allow_backward_links: Optional[bool] = None,
1102
- crawl_entire_domain: Optional[bool] = None,
1103
- allow_external_links: Optional[bool] = None,
1104
- ignore_sitemap: Optional[bool] = None,
1105
- scrape_options: Optional[ScrapeOptions] = None,
1106
- webhook: Optional[Union[str, WebhookConfig]] = None,
1107
- deduplicate_similar_urls: Optional[bool] = None,
1108
- ignore_query_parameters: Optional[bool] = None,
1109
- regex_on_full_url: Optional[bool] = None,
1110
- delay: Optional[int] = None,
1111
- allow_subdomains: Optional[bool] = None,
1112
- max_concurrency: Optional[int] = None,
1113
- zero_data_retention: Optional[bool] = None,
1114
- idempotency_key: Optional[str] = None,
1115
- **kwargs
1116
- ) -> 'CrawlWatcher':
1117
- """
1118
- Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
1119
-
1120
- Args:
1121
- url (str): Target URL to start crawling from
1122
- include_paths (Optional[List[str]]): Patterns of URLs to include
1123
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
1124
- max_depth (Optional[int]): Maximum crawl depth
1125
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
1126
- limit (Optional[int]): Maximum pages to crawl
1127
- allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
1128
- crawl_entire_domain (Optional[bool]): Follow parent directory links
1129
- allow_external_links (Optional[bool]): Follow external domain links
1130
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1131
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
1132
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
1133
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
1134
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
1135
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
1136
- delay (Optional[int]): Delay in seconds between scrapes
1137
- allow_subdomains (Optional[bool]): Follow subdomains
1138
- max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1139
- zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
1140
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1141
- **kwargs: Additional parameters to pass to the API
1142
-
1143
- Returns:
1144
- CrawlWatcher: An instance to monitor the crawl job via WebSocket
1145
-
1146
- Raises:
1147
- Exception: If crawl job fails to start
1148
- """
1149
- crawl_response = self.async_crawl_url(
1150
- url,
1151
- include_paths=include_paths,
1152
- exclude_paths=exclude_paths,
1153
- max_depth=max_depth,
1154
- max_discovery_depth=max_discovery_depth,
1155
- limit=limit,
1156
- allow_backward_links=allow_backward_links,
1157
- allow_external_links=allow_external_links,
1158
- ignore_sitemap=ignore_sitemap,
1159
- scrape_options=scrape_options,
1160
- webhook=webhook,
1161
- deduplicate_similar_urls=deduplicate_similar_urls,
1162
- ignore_query_parameters=ignore_query_parameters,
1163
- regex_on_full_url=regex_on_full_url,
1164
- delay=delay,
1165
- allow_subdomains=allow_subdomains,
1166
- max_concurrency=max_concurrency,
1167
- zero_data_retention=zero_data_retention,
1168
- idempotency_key=idempotency_key,
1169
- **kwargs
1170
- )
1171
- if crawl_response.success and crawl_response.id:
1172
- return CrawlWatcher(crawl_response.id, self)
1173
- else:
1174
- raise Exception("Crawl job failed to start")
1175
-
1176
- def map_url(
1177
- self,
1178
- url: str,
1179
- *,
1180
- search: Optional[str] = None,
1181
- ignore_sitemap: Optional[bool] = None,
1182
- include_subdomains: Optional[bool] = None,
1183
- sitemap_only: Optional[bool] = None,
1184
- limit: Optional[int] = None,
1185
- timeout: Optional[int] = None,
1186
- use_index: Optional[bool] = None,
1187
- **kwargs) -> MapResponse:
1188
- """
1189
- Map and discover links from a URL.
1190
-
1191
- Args:
1192
- url (str): Target URL to map
1193
- search (Optional[str]): Filter pattern for URLs
1194
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1195
- include_subdomains (Optional[bool]): Include subdomain links
1196
- sitemap_only (Optional[bool]): Only use sitemap.xml
1197
- limit (Optional[int]): Maximum URLs to return
1198
- timeout (Optional[int]): Request timeout in milliseconds
1199
- **kwargs: Additional parameters to pass to the API
1200
-
1201
- Returns:
1202
- MapResponse: Response containing:
1203
- * success (bool): Whether request succeeded
1204
- * links (List[str]): Discovered URLs
1205
- * error (Optional[str]): Error message if any
1206
-
1207
- Raises:
1208
- Exception: If mapping fails or response cannot be parsed
1209
- """
1210
- # Validate any additional kwargs
1211
- self._validate_kwargs(kwargs, "map_url")
1212
-
1213
- # Build map parameters
1214
- map_params = {}
1215
-
1216
- # Add individual parameters
1217
- if search is not None:
1218
- map_params['search'] = search
1219
- if ignore_sitemap is not None:
1220
- map_params['ignoreSitemap'] = ignore_sitemap
1221
- if include_subdomains is not None:
1222
- map_params['includeSubdomains'] = include_subdomains
1223
- if sitemap_only is not None:
1224
- map_params['sitemapOnly'] = sitemap_only
1225
- if limit is not None:
1226
- map_params['limit'] = limit
1227
- if timeout is not None:
1228
- map_params['timeout'] = timeout
1229
- if use_index is not None:
1230
- map_params['useIndex'] = use_index
1231
-
1232
- # Add any additional kwargs
1233
- map_params.update(kwargs)
1234
- _integration = map_params.get('integration')
1235
-
1236
- # Create final params object
1237
- final_params = MapParams(**map_params)
1238
- params_dict = final_params.dict(exclude_none=True)
1239
- params_dict['url'] = url
1240
- params_dict['origin'] = f"python-sdk@{version}"
1241
-
1242
- if _integration:
1243
- params_dict['integration'] = _integration
1244
-
1245
- # Make request
1246
- response = requests.post(
1247
- f"{self.api_url}/v1/map",
1248
- headers={"Authorization": f"Bearer {self.api_key}"},
1249
- json=params_dict
1250
- )
1251
-
1252
- if response.status_code == 200:
1253
- try:
1254
- response_json = response.json()
1255
- if response_json.get('success') and 'links' in response_json:
1256
- return MapResponse(**response_json)
1257
- elif "error" in response_json:
1258
- raise Exception(f'Map failed. Error: {response_json["error"]}')
1259
- else:
1260
- raise Exception(f'Map failed. Error: {response_json}')
1261
- except ValueError:
1262
- raise Exception('Failed to parse Firecrawl response as JSON.')
1263
- else:
1264
- self._handle_error(response, 'map')
1265
-
1266
- def batch_scrape_urls(
1267
- self,
1268
- urls: List[str],
1269
- *,
1270
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1271
- headers: Optional[Dict[str, str]] = None,
1272
- include_tags: Optional[List[str]] = None,
1273
- exclude_tags: Optional[List[str]] = None,
1274
- only_main_content: Optional[bool] = None,
1275
- wait_for: Optional[int] = None,
1276
- timeout: Optional[int] = None,
1277
- location: Optional[LocationConfig] = None,
1278
- mobile: Optional[bool] = None,
1279
- skip_tls_verification: Optional[bool] = None,
1280
- remove_base64_images: Optional[bool] = None,
1281
- block_ads: Optional[bool] = None,
1282
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1283
- extract: Optional[JsonConfig] = None,
1284
- json_options: Optional[JsonConfig] = None,
1285
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1286
- agent: Optional[AgentOptions] = None,
1287
- poll_interval: Optional[int] = 2,
1288
- max_concurrency: Optional[int] = None,
1289
- zero_data_retention: Optional[bool] = None,
1290
- idempotency_key: Optional[str] = None,
1291
- **kwargs
1292
- ) -> BatchScrapeStatusResponse:
1293
- """
1294
- Batch scrape multiple URLs and monitor until completion.
1295
-
1296
- Args:
1297
- urls (List[str]): URLs to scrape
1298
- formats (Optional[List[Literal]]): Content formats to retrieve
1299
- headers (Optional[Dict[str, str]]): Custom HTTP headers
1300
- include_tags (Optional[List[str]]): HTML tags to include
1301
- exclude_tags (Optional[List[str]]): HTML tags to exclude
1302
- only_main_content (Optional[bool]): Extract main content only
1303
- wait_for (Optional[int]): Wait time in milliseconds
1304
- timeout (Optional[int]): Request timeout in milliseconds
1305
- location (Optional[LocationConfig]): Location configuration
1306
- mobile (Optional[bool]): Use mobile user agent
1307
- skip_tls_verification (Optional[bool]): Skip TLS verification
1308
- remove_base64_images (Optional[bool]): Remove base64 encoded images
1309
- block_ads (Optional[bool]): Block advertisements
1310
- proxy (Optional[Literal]): Proxy type to use
1311
- extract (Optional[JsonConfig]): Content extraction config
1312
- json_options (Optional[JsonConfig]): JSON extraction config
1313
- actions (Optional[List[Union]]): Actions to perform
1314
- agent (Optional[AgentOptions]): Agent configuration
1315
- max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1316
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
1317
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1318
- **kwargs: Additional parameters to pass to the API
1319
-
1320
- Returns:
1321
- BatchScrapeStatusResponse with:
1322
- * Scraping status and progress
1323
- * Scraped content for each URL
1324
- * Success/error information
1325
-
1326
- Raises:
1327
- Exception: If batch scrape fails
1328
- """
1329
- # Validate any additional kwargs
1330
- self._validate_kwargs(kwargs, "batch_scrape_urls")
1331
-
1332
- scrape_params = {}
1333
-
1334
- # Add individual parameters
1335
- if formats is not None:
1336
- scrape_params['formats'] = formats
1337
- if headers is not None:
1338
- scrape_params['headers'] = headers
1339
- if include_tags is not None:
1340
- scrape_params['includeTags'] = include_tags
1341
- if exclude_tags is not None:
1342
- scrape_params['excludeTags'] = exclude_tags
1343
- if only_main_content is not None:
1344
- scrape_params['onlyMainContent'] = only_main_content
1345
- if wait_for is not None:
1346
- scrape_params['waitFor'] = wait_for
1347
- if timeout is not None:
1348
- scrape_params['timeout'] = timeout
1349
- if location is not None:
1350
- scrape_params['location'] = location.dict(exclude_none=True)
1351
- if mobile is not None:
1352
- scrape_params['mobile'] = mobile
1353
- if skip_tls_verification is not None:
1354
- scrape_params['skipTlsVerification'] = skip_tls_verification
1355
- if remove_base64_images is not None:
1356
- scrape_params['removeBase64Images'] = remove_base64_images
1357
- if block_ads is not None:
1358
- scrape_params['blockAds'] = block_ads
1359
- if proxy is not None:
1360
- scrape_params['proxy'] = proxy
1361
- if extract is not None:
1362
- extract = self._ensure_schema_dict(extract)
1363
- if isinstance(extract, dict) and "schema" in extract:
1364
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
1365
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1366
- if json_options is not None:
1367
- json_options = self._ensure_schema_dict(json_options)
1368
- if isinstance(json_options, dict) and "schema" in json_options:
1369
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1370
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1371
- if actions is not None:
1372
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1373
- if agent is not None:
1374
- scrape_params['agent'] = agent.dict(exclude_none=True)
1375
- if max_concurrency is not None:
1376
- scrape_params['maxConcurrency'] = max_concurrency
1377
- if zero_data_retention is not None:
1378
- scrape_params['zeroDataRetention'] = zero_data_retention
1379
-
1380
- # Add any additional kwargs
1381
- scrape_params.update(kwargs)
1382
-
1383
- # Create final params object
1384
- final_params = ScrapeParams(**scrape_params)
1385
- params_dict = final_params.dict(exclude_none=True)
1386
- params_dict['urls'] = urls
1387
- params_dict['origin'] = f"python-sdk@{version}"
1388
-
1389
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1390
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1391
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1392
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1393
-
1394
- # Make request
1395
- headers = self._prepare_headers(idempotency_key)
1396
- response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1397
-
1398
- if response.status_code == 200:
1399
- try:
1400
- id = response.json().get('id')
1401
- except:
1402
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1403
- return self._monitor_job_status(id, headers, poll_interval)
1404
- else:
1405
- self._handle_error(response, 'start batch scrape job')
1406
-
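A minimal usage sketch for the blocking batch scrape above. The API key and URLs are placeholders, and the attribute names on the returned object follow the BatchScrapeStatusResponse fields populated in check_batch_scrape_status further down.

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

# Scrape two pages and block until the whole batch finishes (status polled every 5 s).
job = app.batch_scrape_urls(
    ["https://example.com", "https://example.com/pricing"],
    formats=["markdown", "links"],
    only_main_content=True,
    poll_interval=5,
)

print(job.status, f"{job.completed}/{job.total} pages scraped")
for doc in job.data or []:
    # Markdown is present because it was requested in `formats`.
    print((doc.markdown or "")[:80])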
1407
- def async_batch_scrape_urls(
1408
- self,
1409
- urls: List[str],
1410
- *,
1411
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1412
- headers: Optional[Dict[str, str]] = None,
1413
- include_tags: Optional[List[str]] = None,
1414
- exclude_tags: Optional[List[str]] = None,
1415
- only_main_content: Optional[bool] = None,
1416
- wait_for: Optional[int] = None,
1417
- timeout: Optional[int] = None,
1418
- location: Optional[LocationConfig] = None,
1419
- mobile: Optional[bool] = None,
1420
- skip_tls_verification: Optional[bool] = None,
1421
- remove_base64_images: Optional[bool] = None,
1422
- block_ads: Optional[bool] = None,
1423
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1424
- extract: Optional[JsonConfig] = None,
1425
- json_options: Optional[JsonConfig] = None,
1426
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1427
- agent: Optional[AgentOptions] = None,
1428
- max_concurrency: Optional[int] = None,
1429
- idempotency_key: Optional[str] = None,
1430
- zero_data_retention: Optional[bool] = None,
1431
- **kwargs
1432
- ) -> BatchScrapeResponse:
1433
- """
1434
- Initiate a batch scrape job asynchronously.
1435
-
1436
- Args:
1437
- urls (List[str]): URLs to scrape
1438
- formats (Optional[List[Literal]]): Content formats to retrieve
1439
- headers (Optional[Dict[str, str]]): Custom HTTP headers
1440
- include_tags (Optional[List[str]]): HTML tags to include
1441
- exclude_tags (Optional[List[str]]): HTML tags to exclude
1442
- only_main_content (Optional[bool]): Extract main content only
1443
- wait_for (Optional[int]): Wait time in milliseconds
1444
- timeout (Optional[int]): Request timeout in milliseconds
1445
- location (Optional[LocationConfig]): Location configuration
1446
- mobile (Optional[bool]): Use mobile user agent
1447
- skip_tls_verification (Optional[bool]): Skip TLS verification
1448
- remove_base64_images (Optional[bool]): Remove base64 encoded images
1449
- block_ads (Optional[bool]): Block advertisements
1450
- proxy (Optional[Literal]): Proxy type to use
1451
- extract (Optional[JsonConfig]): Content extraction config
1452
- json_options (Optional[JsonConfig]): JSON extraction config
1453
- actions (Optional[List[Union]]): Actions to perform
1454
- agent (Optional[AgentOptions]): Agent configuration
1455
- max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1456
- zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
1457
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1458
- **kwargs: Additional parameters to pass to the API
1459
-
1460
- Returns:
1461
- BatchScrapeResponse with:
1462
- * success - Whether job started successfully
1463
- * id - Unique identifier for the job
1464
- * url - Status check URL
1465
- * error - Error message if start failed
1466
-
1467
- Raises:
1468
- Exception: If job initiation fails
1469
- """
1470
- # Validate any additional kwargs
1471
- self._validate_kwargs(kwargs, "async_batch_scrape_urls")
1472
-
1473
- scrape_params = {}
1474
-
1475
- # Add individual parameters
1476
- if formats is not None:
1477
- scrape_params['formats'] = formats
1478
- if headers is not None:
1479
- scrape_params['headers'] = headers
1480
- if include_tags is not None:
1481
- scrape_params['includeTags'] = include_tags
1482
- if exclude_tags is not None:
1483
- scrape_params['excludeTags'] = exclude_tags
1484
- if only_main_content is not None:
1485
- scrape_params['onlyMainContent'] = only_main_content
1486
- if wait_for is not None:
1487
- scrape_params['waitFor'] = wait_for
1488
- if timeout is not None:
1489
- scrape_params['timeout'] = timeout
1490
- if location is not None:
1491
- scrape_params['location'] = location.dict(exclude_none=True)
1492
- if mobile is not None:
1493
- scrape_params['mobile'] = mobile
1494
- if skip_tls_verification is not None:
1495
- scrape_params['skipTlsVerification'] = skip_tls_verification
1496
- if remove_base64_images is not None:
1497
- scrape_params['removeBase64Images'] = remove_base64_images
1498
- if block_ads is not None:
1499
- scrape_params['blockAds'] = block_ads
1500
- if proxy is not None:
1501
- scrape_params['proxy'] = proxy
1502
- if extract is not None:
1503
- extract = self._ensure_schema_dict(extract)
1504
- if isinstance(extract, dict) and "schema" in extract:
1505
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
1506
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1507
- if json_options is not None:
1508
- json_options = self._ensure_schema_dict(json_options)
1509
- if isinstance(json_options, dict) and "schema" in json_options:
1510
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1511
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1512
- if actions is not None:
1513
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1514
- if agent is not None:
1515
- scrape_params['agent'] = agent.dict(exclude_none=True)
1516
- if max_concurrency is not None:
1517
- scrape_params['maxConcurrency'] = max_concurrency
1518
- if zero_data_retention is not None:
1519
- scrape_params['zeroDataRetention'] = zero_data_retention
1520
-
1521
- # Add any additional kwargs
1522
- scrape_params.update(kwargs)
1523
-
1524
- # Create final params object
1525
- final_params = ScrapeParams(**scrape_params)
1526
- params_dict = final_params.dict(exclude_none=True)
1527
- params_dict['urls'] = urls
1528
- params_dict['origin'] = f"python-sdk@{version}"
1529
-
1530
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1531
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1532
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1533
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1534
-
1535
- # Make request
1536
- headers = self._prepare_headers(idempotency_key)
1537
- response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1538
-
1539
- if response.status_code == 200:
1540
- try:
1541
- return BatchScrapeResponse(**response.json())
1542
- except:
1543
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1544
- else:
1545
- self._handle_error(response, 'start batch scrape job')
1546
-
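The async variant only enqueues the job; results are fetched separately with check_batch_scrape_status. A sketch, reusing the placeholder `app` from the previous example:

import time

# Enqueue the batch without waiting for results.
started = app.async_batch_scrape_urls(
    ["https://example.com/blog", "https://example.com/docs"],
    formats=["markdown"],
)

if started.success and started.id:
    # Poll manually instead of relying on the blocking helper above.
    while True:
        status = app.check_batch_scrape_status(started.id)
        print(f"{status.status}: {status.completed}/{status.total}")
        if status.status == "completed":
            break
        if status.status not in ("scraping", "waiting", "queued", "pending", "active", "paused"):
            print("Job ended with status:", status.status)
            break
        time.sleep(5)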
1547
- def batch_scrape_urls_and_watch(
1548
- self,
1549
- urls: List[str],
1550
- *,
1551
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1552
- headers: Optional[Dict[str, str]] = None,
1553
- include_tags: Optional[List[str]] = None,
1554
- exclude_tags: Optional[List[str]] = None,
1555
- only_main_content: Optional[bool] = None,
1556
- wait_for: Optional[int] = None,
1557
- timeout: Optional[int] = None,
1558
- location: Optional[LocationConfig] = None,
1559
- mobile: Optional[bool] = None,
1560
- skip_tls_verification: Optional[bool] = None,
1561
- remove_base64_images: Optional[bool] = None,
1562
- block_ads: Optional[bool] = None,
1563
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1564
- extract: Optional[JsonConfig] = None,
1565
- json_options: Optional[JsonConfig] = None,
1566
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1567
- agent: Optional[AgentOptions] = None,
1568
- max_concurrency: Optional[int] = None,
1569
- zero_data_retention: Optional[bool] = None,
1570
- idempotency_key: Optional[str] = None,
1571
- **kwargs
1572
- ) -> 'CrawlWatcher':
1573
- """
1574
- Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
1575
-
1576
- Args:
1577
- urls (List[str]): URLs to scrape
1578
- formats (Optional[List[Literal]]): Content formats to retrieve
1579
- headers (Optional[Dict[str, str]]): Custom HTTP headers
1580
- include_tags (Optional[List[str]]): HTML tags to include
1581
- exclude_tags (Optional[List[str]]): HTML tags to exclude
1582
- only_main_content (Optional[bool]): Extract main content only
1583
- wait_for (Optional[int]): Wait time in milliseconds
1584
- timeout (Optional[int]): Request timeout in milliseconds
1585
- location (Optional[LocationConfig]): Location configuration
1586
- mobile (Optional[bool]): Use mobile user agent
1587
- skip_tls_verification (Optional[bool]): Skip TLS verification
1588
- remove_base64_images (Optional[bool]): Remove base64 encoded images
1589
- block_ads (Optional[bool]): Block advertisements
1590
- proxy (Optional[Literal]): Proxy type to use
1591
- extract (Optional[JsonConfig]): Content extraction config
1592
- json_options (Optional[JsonConfig]): JSON extraction config
1593
- actions (Optional[List[Union]]): Actions to perform
1594
- agent (Optional[AgentOptions]): Agent configuration
1595
- max_concurrency (Optional[int]): Maximum number of concurrent scrapes
1596
- zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
1597
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1598
- **kwargs: Additional parameters to pass to the API
1599
-
1600
- Returns:
1601
- CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
1602
-
1603
- Raises:
1604
- Exception: If batch scrape job fails to start
1605
- """
1606
- # Validate any additional kwargs
1607
- self._validate_kwargs(kwargs, "batch_scrape_urls_and_watch")
1608
-
1609
- scrape_params = {}
1610
-
1611
- # Add individual parameters
1612
- if formats is not None:
1613
- scrape_params['formats'] = formats
1614
- if headers is not None:
1615
- scrape_params['headers'] = headers
1616
- if include_tags is not None:
1617
- scrape_params['includeTags'] = include_tags
1618
- if exclude_tags is not None:
1619
- scrape_params['excludeTags'] = exclude_tags
1620
- if only_main_content is not None:
1621
- scrape_params['onlyMainContent'] = only_main_content
1622
- if wait_for is not None:
1623
- scrape_params['waitFor'] = wait_for
1624
- if timeout is not None:
1625
- scrape_params['timeout'] = timeout
1626
- if location is not None:
1627
- scrape_params['location'] = location.dict(exclude_none=True)
1628
- if mobile is not None:
1629
- scrape_params['mobile'] = mobile
1630
- if skip_tls_verification is not None:
1631
- scrape_params['skipTlsVerification'] = skip_tls_verification
1632
- if remove_base64_images is not None:
1633
- scrape_params['removeBase64Images'] = remove_base64_images
1634
- if block_ads is not None:
1635
- scrape_params['blockAds'] = block_ads
1636
- if proxy is not None:
1637
- scrape_params['proxy'] = proxy
1638
- if extract is not None:
1639
- extract = self._ensure_schema_dict(extract)
1640
- if isinstance(extract, dict) and "schema" in extract:
1641
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
1642
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
1643
- if json_options is not None:
1644
- json_options = self._ensure_schema_dict(json_options)
1645
- if isinstance(json_options, dict) and "schema" in json_options:
1646
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
1647
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
1648
- if actions is not None:
1649
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1650
- if agent is not None:
1651
- scrape_params['agent'] = agent.dict(exclude_none=True)
1652
- if max_concurrency is not None:
1653
- scrape_params['maxConcurrency'] = max_concurrency
1654
- if zero_data_retention is not None:
1655
- scrape_params['zeroDataRetention'] = zero_data_retention
1656
-
1657
- # Add any additional kwargs
1658
- scrape_params.update(kwargs)
1659
-
1660
- # Create final params object
1661
- final_params = ScrapeParams(**scrape_params)
1662
- params_dict = final_params.dict(exclude_none=True)
1663
- params_dict['urls'] = urls
1664
- params_dict['origin'] = f"python-sdk@{version}"
1665
-
1666
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
1667
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
1668
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
1669
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
1670
-
1671
- # Make request
1672
- headers = self._prepare_headers(idempotency_key)
1673
- response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1674
-
1675
- if response.status_code == 200:
1676
- try:
1677
- crawl_response = BatchScrapeResponse(**response.json())
1678
- if crawl_response.success and crawl_response.id:
1679
- return CrawlWatcher(crawl_response.id, self)
1680
- else:
1681
- raise Exception("Batch scrape job failed to start")
1682
- except:
1683
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1684
- else:
1685
- self._handle_error(response, 'start batch scrape job')
1686
-
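Because the watcher streams events over a WebSocket, it has to be driven from an asyncio event loop. A sketch of wiring up handlers, again reusing the placeholder `app`; the event payload keys follow CrawlWatcher.dispatch_event below:

import asyncio

watcher = app.batch_scrape_urls_and_watch(
    ["https://example.com", "https://example.com/contact"],
    formats=["markdown"],
)

watcher.add_event_listener("document", lambda d: print("got a document for job", d["id"]))
watcher.add_event_listener("done", lambda d: print("finished with", len(d["data"]), "documents"))
watcher.add_event_listener("error", lambda d: print("failed:", d["error"]))

# Runs until the server closes the stream, typically after the 'done' or 'error' event.
asyncio.run(watcher.connect())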
1687
- def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
1688
- """
1689
- Check the status of a batch scrape job using the Firecrawl API.
1690
-
1691
- Args:
1692
- id (str): The ID of the batch scrape job.
1693
-
1694
- Returns:
1695
- BatchScrapeStatusResponse: The status of the batch scrape job.
1696
-
1697
- Raises:
1698
- Exception: If the status check request fails.
1699
- """
1700
- endpoint = f'/v1/batch/scrape/{id}'
1701
-
1702
- headers = self._prepare_headers()
1703
- response = self._get_request(f'{self.api_url}{endpoint}', headers)
1704
- if response.status_code == 200:
1705
- try:
1706
- status_data = response.json()
1707
- except:
1708
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1709
- if status_data['status'] == 'completed':
1710
- if 'data' in status_data:
1711
- data = status_data['data']
1712
- while 'next' in status_data:
1713
- if len(status_data['data']) == 0:
1714
- break
1715
- next_url = status_data.get('next')
1716
- if not next_url:
1717
- logger.warning("Expected 'next' URL is missing.")
1718
- break
1719
- try:
1720
- status_response = self._get_request(next_url, headers)
1721
- if status_response.status_code != 200:
1722
- logger.error(f"Failed to fetch next page: {status_response.status_code}")
1723
- break
1724
- try:
1725
- next_data = status_response.json()
1726
- except:
1727
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1728
- data.extend(next_data.get('data', []))
1729
- status_data = next_data
1730
- except Exception as e:
1731
- logger.error(f"Error during pagination request: {e}")
1732
- break
1733
- status_data['data'] = data
1734
-
1735
- return BatchScrapeStatusResponse(**{
1736
- 'success': False if 'error' in status_data else True,
1737
- 'status': status_data.get('status'),
1738
- 'total': status_data.get('total'),
1739
- 'completed': status_data.get('completed'),
1740
- 'creditsUsed': status_data.get('creditsUsed'),
1741
- 'expiresAt': status_data.get('expiresAt'),
1742
- 'data': status_data.get('data'),
1743
- 'next': status_data.get('next'),
1744
- 'error': status_data.get('error')
1745
- })
1746
- else:
1747
- self._handle_error(response, 'check batch scrape status')
1748
-
1749
- def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
1750
- """
1751
- Returns information about batch scrape errors.
1752
-
1753
- Args:
1754
- id (str): The ID of the crawl job.
1755
-
1756
- Returns:
1757
- CrawlErrorsResponse containing:
1758
- * errors (List[Dict[str, str]]): List of errors with fields:
1759
- * id (str): Error ID
1760
- * timestamp (str): When the error occurred
1761
- * url (str): URL that caused the error
1762
- * error (str): Error message
1763
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1764
-
1765
- Raises:
1766
- Exception: If the error check request fails
1767
- """
1768
- headers = self._prepare_headers()
1769
- response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
1770
- if response.status_code == 200:
1771
- try:
1772
- return CrawlErrorsResponse(**response.json())
1773
- except:
1774
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1775
- else:
1776
- self._handle_error(response, "check batch scrape errors")
1777
-
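A short sketch of inspecting failures once a batch has run. `started.id` is the job id from the earlier async example, and the field names are assumed from the docstring above:

errors = app.check_batch_scrape_errors(started.id)
for err in errors.errors:
    # Each entry carries the failing URL and the reason reported by the API.
    print(err["url"], "->", err["error"])
print("Blocked by robots.txt:", errors.robotsBlocked)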
1778
- def extract(
1779
- self,
1780
- urls: Optional[List[str]] = None,
1781
- *,
1782
- prompt: Optional[str] = None,
1783
- schema: Optional[Any] = None,
1784
- system_prompt: Optional[str] = None,
1785
- allow_external_links: Optional[bool] = False,
1786
- enable_web_search: Optional[bool] = False,
1787
- show_sources: Optional[bool] = False,
1788
- agent: Optional[Dict[str, Any]] = None,
1789
- **kwargs) -> ExtractResponse[Any]:
1790
- """
1791
- Extract structured information from URLs.
1792
-
1793
- Args:
1794
- urls (Optional[List[str]]): URLs to extract from
1795
- prompt (Optional[str]): Custom extraction prompt
1796
- schema (Optional[Any]): JSON schema/Pydantic model
1797
- system_prompt (Optional[str]): System context
1798
- allow_external_links (Optional[bool]): Follow external links
1799
- enable_web_search (Optional[bool]): Enable web search
1800
- show_sources (Optional[bool]): Include source URLs
1801
- agent (Optional[Dict[str, Any]]): Agent configuration
1802
- **kwargs: Additional parameters to pass to the API
1803
-
1804
- Returns:
1805
- ExtractResponse[Any] with:
1806
- * success (bool): Whether request succeeded
1807
- * data (Optional[Any]): Extracted data matching schema
1808
- * error (Optional[str]): Error message if any
1809
-
1810
- Raises:
1811
- ValueError: If prompt/schema missing or extraction fails
1812
- """
1813
- # Validate any additional kwargs
1814
- self._validate_kwargs(kwargs, "extract")
1815
-
1816
- headers = self._prepare_headers()
1817
-
1818
- if not prompt and not schema:
1819
- raise ValueError("Either prompt or schema is required")
1820
-
1821
- if not urls and not prompt:
1822
- raise ValueError("Either urls or prompt is required")
1823
-
1824
- if schema:
1825
- schema = self._ensure_schema_dict(schema)
1826
-
1827
- request_data = {
1828
- 'urls': urls or [],
1829
- 'allowExternalLinks': allow_external_links,
1830
- 'enableWebSearch': enable_web_search,
1831
- 'showSources': show_sources,
1832
- 'schema': schema,
1833
- 'origin': f'python-sdk@{get_version()}'
1834
- }
1835
-
1836
- # Only add prompt and systemPrompt if they exist
1837
- if prompt:
1838
- request_data['prompt'] = prompt
1839
- if system_prompt:
1840
- request_data['systemPrompt'] = system_prompt
1841
-
1842
- if agent:
1843
- request_data['agent'] = agent
1844
-
1845
- # Add any additional kwargs
1846
- request_data.update(kwargs)
1847
-
1848
- try:
1849
- # Send the initial extract request
1850
- response = self._post_request(
1851
- f'{self.api_url}/v1/extract',
1852
- request_data,
1853
- headers
1854
- )
1855
- if response.status_code == 200:
1856
- try:
1857
- data = response.json()
1858
- except:
1859
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1860
- if data['success']:
1861
- job_id = data.get('id')
1862
- if not job_id:
1863
- raise Exception('Job ID not returned from extract request.')
1864
-
1865
- # Poll for the extract status
1866
- while True:
1867
- status_response = self._get_request(
1868
- f'{self.api_url}/v1/extract/{job_id}',
1869
- headers
1870
- )
1871
- if status_response.status_code == 200:
1872
- try:
1873
- status_data = status_response.json()
1874
- except:
1875
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1876
- if status_data['status'] == 'completed':
1877
- return ExtractResponse(**status_data)
1878
- elif status_data['status'] in ['failed', 'cancelled']:
1879
- raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
1880
- else:
1881
- self._handle_error(status_response, "extract-status")
1882
-
1883
- time.sleep(2) # Polling interval
1884
- else:
1885
- raise Exception(f'Failed to extract. Error: {data["error"]}')
1886
- else:
1887
- self._handle_error(response, "extract")
1888
- except Exception as e:
1889
- raise ValueError(str(e), 500)
1890
-
1891
- return ExtractResponse(success=False, error="Internal server error.")
1892
-
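`extract` accepts either a plain JSON schema dict or a Pydantic model class (converted via `_ensure_schema_dict` further down). A hedged sketch with a small illustrative model and a placeholder URL, reusing the placeholder `app`:

from typing import Optional
import pydantic

class ProductInfo(pydantic.BaseModel):
    name: str
    price: Optional[str] = None

result = app.extract(
    urls=["https://example.com/product/123"],
    prompt="Extract the product name and price from the page.",
    schema=ProductInfo,      # converted to a JSON schema dict before the request
)

if result.success:
    print(result.data)       # data shaped like ProductInfo's JSON schema
else:
    print("Extract failed:", result.error)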
1893
- def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
1894
- """
1895
- Retrieve the status of an extract job.
1896
-
1897
- Args:
1898
- job_id (str): The ID of the extract job.
1899
-
1900
- Returns:
1901
- ExtractResponse[Any]: The status of the extract job.
1902
-
1903
- Raises:
1904
- ValueError: If there is an error retrieving the status.
1905
- """
1906
- headers = self._prepare_headers()
1907
- try:
1908
- response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
1909
- if response.status_code == 200:
1910
- try:
1911
- return ExtractResponse(**response.json())
1912
- except:
1913
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1914
- else:
1915
- self._handle_error(response, "get extract status")
1916
- except Exception as e:
1917
- raise ValueError(str(e), 500)
1918
-
1919
- def async_extract(
1920
- self,
1921
- urls: Optional[List[str]] = None,
1922
- *,
1923
- prompt: Optional[str] = None,
1924
- schema: Optional[Any] = None,
1925
- system_prompt: Optional[str] = None,
1926
- allow_external_links: Optional[bool] = False,
1927
- enable_web_search: Optional[bool] = False,
1928
- show_sources: Optional[bool] = False,
1929
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1930
- """
1931
- Initiate an asynchronous extract job.
1932
-
1933
- Args:
1934
- urls (List[str]): URLs to extract information from
1935
- prompt (Optional[str]): Custom extraction prompt
1936
- schema (Optional[Any]): JSON schema/Pydantic model
1937
- system_prompt (Optional[str]): System context
1938
- allow_external_links (Optional[bool]): Follow external links
1939
- enable_web_search (Optional[bool]): Enable web search
1940
- show_sources (Optional[bool]): Include source URLs
1941
- agent (Optional[Dict[str, Any]]): Agent configuration
1943
-
1944
- Returns:
1945
- ExtractResponse[Any] with:
1946
- * success (bool): Whether request succeeded
1947
- * data (Optional[Any]): Extracted data matching schema
1948
- * error (Optional[str]): Error message if any
1949
-
1950
- Raises:
1951
- ValueError: If job initiation fails
1952
- """
1953
- headers = self._prepare_headers()
1954
-
1956
- if schema:
1957
- schema = self._ensure_schema_dict(schema)
1958
-
1959
- request_data = {
1960
- 'urls': urls,
1961
- 'allowExternalLinks': allow_external_links,
1962
- 'enableWebSearch': enable_web_search,
1963
- 'showSources': show_sources,
1964
- 'schema': schema,
1965
- 'origin': f'python-sdk@{version}'
1966
- }
1967
-
1968
- if prompt:
1969
- request_data['prompt'] = prompt
1970
- if system_prompt:
1971
- request_data['systemPrompt'] = system_prompt
1972
- if agent:
1973
- request_data['agent'] = agent
1974
-
1975
- try:
1976
- response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
1977
- if response.status_code == 200:
1978
- try:
1979
- return ExtractResponse(**response.json())
1980
- except:
1981
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1982
- else:
1983
- self._handle_error(response, "async extract")
1984
- except Exception as e:
1985
- raise ValueError(str(e), 500)
1986
-
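Despite its name, `async_extract` is a synchronous call that only starts the job; results arrive later via `get_extract_status`. In this sketch the `id` and `status` attributes are assumptions taken from the endpoint payloads polled in `extract` above, not from model definitions visible here:

import time

started = app.async_extract(
    urls=["https://example.com/product/123"],
    prompt="Extract the product name and price.",
    schema=ProductInfo,                        # model from the extract sketch above
)

job_id = getattr(started, "id", None)          # assumed: the start endpoint returns a job id
while job_id:
    status = app.get_extract_status(job_id)
    state = getattr(status, "status", None)    # assumed field, mirrors the polling in extract()
    if state == "completed":
        print(status.data)
        break
    if state in ("failed", "cancelled"):
        print("Extract failed:", status.error)
        break
    time.sleep(2)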
1987
- def generate_llms_text(
1988
- self,
1989
- url: str,
1990
- *,
1991
- max_urls: Optional[int] = None,
1992
- show_full_text: Optional[bool] = None,
1993
- cache: Optional[bool] = None,
1994
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
1995
- """
1996
- Generate LLMs.txt for a given URL and poll until completion.
1997
-
1998
- Args:
1999
- url (str): Target URL to generate LLMs.txt from
2000
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
2001
- show_full_text (Optional[bool]): Include full text in output (default: False)
2002
- cache (Optional[bool]): Whether to use cached content if available (default: True)
2003
- experimental_stream (Optional[bool]): Enable experimental streaming
2004
-
2005
- Returns:
2006
- GenerateLLMsTextStatusResponse with:
2007
- * Generated LLMs.txt content
2008
- * Full version if requested
2009
- * Generation status
2010
- * Success/error information
2011
-
2012
- Raises:
2013
- Exception: If generation fails
2014
- """
2015
- params = GenerateLLMsTextParams(
2016
- maxUrls=max_urls,
2017
- showFullText=show_full_text,
2018
- cache=cache,
2019
- __experimental_stream=experimental_stream
2020
- )
2021
-
2022
- response = self.async_generate_llms_text(
2023
- url,
2024
- max_urls=max_urls,
2025
- show_full_text=show_full_text,
2026
- cache=cache,
2027
- experimental_stream=experimental_stream
2028
- )
2029
-
2030
- if not response.success or not response.id:
2031
- return GenerateLLMsTextStatusResponse(
2032
- success=False,
2033
- error='Failed to start LLMs.txt generation',
2034
- status='failed',
2035
- expiresAt=''
2036
- )
2037
-
2038
- job_id = response.id
2039
- while True:
2040
- status = self.check_generate_llms_text_status(job_id)
2041
-
2042
- if status.status == 'completed':
2043
- return status
2044
- elif status.status == 'failed':
2045
- return status
2046
- elif status.status != 'processing':
2047
- return GenerateLLMsTextStatusResponse(
2048
- success=False,
2049
- error='LLMs.txt generation job terminated unexpectedly',
2050
- status='failed',
2051
- expiresAt=''
2052
- )
2053
-
2054
- time.sleep(2) # Polling interval
2055
-
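Usage sketch for the blocking LLMs.txt generator; the URL is a placeholder and the shape of `data` follows the docstring of check_generate_llms_text_status below:

result = app.generate_llms_text(
    "https://example.com",
    max_urls=5,
    show_full_text=True,
)

if result.success and result.data:
    # 'llmsfulltxt' is only included because show_full_text=True was requested.
    print(result.data.get("llmstxt", "")[:200])
else:
    print("Generation failed:", result.error)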
2056
- def async_generate_llms_text(
2057
- self,
2058
- url: str,
2059
- *,
2060
- max_urls: Optional[int] = None,
2061
- show_full_text: Optional[bool] = None,
2062
- cache: Optional[bool] = None,
2063
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
2064
- """
2065
- Initiate an asynchronous LLMs.txt generation operation.
2066
-
2067
- Args:
2068
- url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
2069
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
2070
- show_full_text (Optional[bool]): Include full text in output (default: False)
2071
- cache (Optional[bool]): Whether to use cached content if available (default: True)
2072
- experimental_stream (Optional[bool]): Enable experimental streaming
2073
-
2074
- Returns:
2075
- GenerateLLMsTextResponse: A response containing:
2076
- * success (bool): Whether the generation initiation was successful
2077
- * id (str): The unique identifier for the generation job
2078
- * error (str, optional): Error message if initiation failed
2079
-
2080
- Raises:
2081
- Exception: If the generation job initiation fails.
2082
- """
2083
- params = GenerateLLMsTextParams(
2084
- maxUrls=max_urls,
2085
- showFullText=show_full_text,
2086
- cache=cache,
2087
- __experimental_stream=experimental_stream
2088
- )
2089
-
2090
- headers = self._prepare_headers()
2091
- json_data = {'url': url, **params.dict(exclude_none=True)}
2092
- json_data['origin'] = f"python-sdk@{version}"
2093
-
2094
- try:
2095
- req = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
2096
- response = req.json()
2097
- print("json_data", json_data)
2098
- print("response", response)
2099
- if response.get('success'):
2100
- try:
2101
- return GenerateLLMsTextResponse(**response)
2102
- except:
2103
- raise Exception('Failed to parse Firecrawl response as JSON.')
2104
- else:
2105
- self._handle_error(response, 'start LLMs.txt generation')
2106
- except Exception as e:
2107
- raise ValueError(str(e))
2108
-
2109
- return GenerateLLMsTextResponse(
2110
- success=False,
2111
- error='Internal server error'
2112
- )
2113
-
2114
- def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
2115
- """
2116
- Check the status of an LLMs.txt generation operation.
2117
-
2118
- Args:
2119
- id (str): The unique identifier of the LLMs.txt generation job to check status for.
2120
-
2121
- Returns:
2122
- GenerateLLMsTextStatusResponse: A response containing:
2123
- * success (bool): Whether the generation was successful
2124
- * status (str): Status of generation ("processing", "completed", "failed")
2125
- * data (Dict[str, str], optional): Generated text with fields:
2126
- * llmstxt (str): Generated LLMs.txt content
2127
- * llmsfulltxt (str, optional): Full version if requested
2128
- * error (str, optional): Error message if generation failed
2129
- * expiresAt (str): When the generated data expires
2130
-
2131
- Raises:
2132
- Exception: If the status check fails.
2133
- """
2134
- headers = self._prepare_headers()
2135
- try:
2136
- response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers)
2137
- if response.status_code == 200:
2138
- try:
2139
- json_data = response.json()
2140
- return GenerateLLMsTextStatusResponse(**json_data)
2141
- except Exception as e:
2142
- raise Exception(f'Failed to parse Firecrawl response as GenerateLLMsTextStatusResponse: {str(e)}')
2143
- elif response.status_code == 404:
2144
- raise Exception('LLMs.txt generation job not found')
2145
- else:
2146
- self._handle_error(response, 'check LLMs.txt generation status')
2147
- except Exception as e:
2148
- raise ValueError(str(e))
2149
-
2150
- return GenerateLLMsTextStatusResponse(success=False, error='Internal server error', status='failed', expiresAt='')
2151
-
2152
- def _prepare_headers(
2153
- self,
2154
- idempotency_key: Optional[str] = None) -> Dict[str, str]:
2155
- """
2156
- Prepare the headers for API requests.
2157
-
2158
- Args:
2159
- idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
2160
-
2161
- Returns:
2162
- Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
2163
- """
2164
- if idempotency_key:
2165
- return {
2166
- 'Content-Type': 'application/json',
2167
- 'Authorization': f'Bearer {self.api_key}',
2168
- 'x-idempotency-key': idempotency_key
2169
- }
2170
-
2171
- return {
2172
- 'Content-Type': 'application/json',
2173
- 'Authorization': f'Bearer {self.api_key}',
2174
- }
2175
-
2176
- def _post_request(
2177
- self,
2178
- url: str,
2179
- data: Dict[str, Any],
2180
- headers: Dict[str, str],
2181
- retries: int = 3,
2182
- backoff_factor: float = 0.5) -> requests.Response:
2183
- """
2184
- Make a POST request with retries.
2185
-
2186
- Args:
2187
- url (str): The URL to send the POST request to.
2188
- data (Dict[str, Any]): The JSON data to include in the POST request.
2189
- headers (Dict[str, str]): The headers to include in the POST request.
2190
- retries (int): Number of retries for the request.
2191
- backoff_factor (float): Backoff factor for retries.
2192
-
2193
- Returns:
2194
- requests.Response: The response from the POST request.
2195
-
2196
- Raises:
2197
- requests.RequestException: If the request fails after the specified retries.
2198
- """
2199
- for attempt in range(retries):
2200
- response = requests.post(url, headers=headers, json=data, timeout=((data["timeout"] + 5000) if "timeout" in data else None))
2201
- if response.status_code == 502:
2202
- time.sleep(backoff_factor * (2 ** attempt))
2203
- else:
2204
- return response
2205
- return response
2206
-
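The HTTP helpers above retry only on 502 responses and sleep `backoff_factor * 2 ** attempt` seconds between attempts; with the defaults (3 retries, factor 0.5) that is 0.5 s, 1 s and 2 s, after which the last 502 response is returned unchanged. A standalone sketch of the same schedule:

def backoff_delays(retries: int = 3, backoff_factor: float = 0.5):
    # Delay applied after each failed attempt: factor * 2**attempt.
    return [backoff_factor * (2 ** attempt) for attempt in range(retries)]

print(backoff_delays())  # [0.5, 1.0, 2.0]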
2207
- def _get_request(
2208
- self,
2209
- url: str,
2210
- headers: Dict[str, str],
2211
- retries: int = 3,
2212
- backoff_factor: float = 0.5) -> requests.Response:
2213
- """
2214
- Make a GET request with retries.
2215
-
2216
- Args:
2217
- url (str): The URL to send the GET request to.
2218
- headers (Dict[str, str]): The headers to include in the GET request.
2219
- retries (int): Number of retries for the request.
2220
- backoff_factor (float): Backoff factor for retries.
2221
-
2222
- Returns:
2223
- requests.Response: The response from the GET request.
2224
-
2225
- Raises:
2226
- requests.RequestException: If the request fails after the specified retries.
2227
- """
2228
- for attempt in range(retries):
2229
- response = requests.get(url, headers=headers)
2230
- if response.status_code == 502:
2231
- time.sleep(backoff_factor * (2 ** attempt))
2232
- else:
2233
- return response
2234
- return response
2235
-
2236
- def _delete_request(
2237
- self,
2238
- url: str,
2239
- headers: Dict[str, str],
2240
- retries: int = 3,
2241
- backoff_factor: float = 0.5) -> requests.Response:
2242
- """
2243
- Make a DELETE request with retries.
2244
-
2245
- Args:
2246
- url (str): The URL to send the DELETE request to.
2247
- headers (Dict[str, str]): The headers to include in the DELETE request.
2248
- retries (int): Number of retries for the request.
2249
- backoff_factor (float): Backoff factor for retries.
2250
-
2251
- Returns:
2252
- requests.Response: The response from the DELETE request.
2253
-
2254
- Raises:
2255
- requests.RequestException: If the request fails after the specified retries.
2256
- """
2257
- for attempt in range(retries):
2258
- response = requests.delete(url, headers=headers)
2259
- if response.status_code == 502:
2260
- time.sleep(backoff_factor * (2 ** attempt))
2261
- else:
2262
- return response
2263
- return response
2264
-
2265
- def _monitor_job_status(
2266
- self,
2267
- id: str,
2268
- headers: Dict[str, str],
2269
- poll_interval: int) -> CrawlStatusResponse:
2270
- """
2271
- Monitor the status of a crawl job until completion.
2272
-
2273
- Args:
2274
- id (str): The ID of the crawl job.
2275
- headers (Dict[str, str]): The headers to include in the status check requests.
2276
- poll_interval (int): Seconds between status checks.
2277
-
2278
- Returns:
2279
- CrawlStatusResponse: The crawl results if the job is completed successfully.
2280
-
2281
- Raises:
2282
- Exception: If the job fails or an error occurs during status checks.
2283
- """
2284
- while True:
2285
- api_url = f'{self.api_url}/v1/crawl/{id}'
2286
-
2287
- status_response = self._get_request(api_url, headers)
2288
- if status_response.status_code == 200:
2289
- try:
2290
- status_data = status_response.json()
2291
- except:
2292
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
2293
- if status_data['status'] == 'completed':
2294
- if 'data' in status_data:
2295
- data = status_data['data']
2296
- while 'next' in status_data:
2297
- if len(status_data['data']) == 0:
2298
- break
2299
- status_response = self._get_request(status_data['next'], headers)
2300
- try:
2301
- status_data = status_response.json()
2302
- except:
2303
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
2304
- data.extend(status_data.get('data', []))
2305
- status_data['data'] = data
2306
- return CrawlStatusResponse(**status_data)
2307
- else:
2308
- raise Exception('Crawl job completed but no data was returned')
2309
- elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
2310
- poll_interval = max(poll_interval, 2)
2311
- time.sleep(poll_interval) # Wait for the specified interval before checking again
2312
- else:
2313
- raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
2314
- else:
2315
- self._handle_error(status_response, 'check crawl status')
2316
-
2317
- def _handle_error(
2318
- self,
2319
- response: requests.Response,
2320
- action: str) -> None:
2321
- """
2322
- Handle errors from API responses.
2323
-
2324
- Args:
2325
- response (requests.Response): The response object from the API request.
2326
- action (str): Description of the action that was being performed.
2327
-
2328
- Raises:
2329
- Exception: An exception with a message containing the status code and error details from the response.
2330
- """
2331
- try:
2332
- error_message = response.json().get('error', 'No error message provided.')
2333
- error_details = response.json().get('details', 'No additional error details provided.')
2334
- except:
2335
- raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
2336
-
2337
- message = self._get_error_message(response.status_code, action, error_message, error_details)
2338
-
2339
- # Raise an HTTPError with the custom message and attach the response
2340
- raise requests.exceptions.HTTPError(message, response=response)
2341
-
2342
- def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2343
- """
2344
- Generate a standardized error message based on HTTP status code.
2345
-
2346
- Args:
2347
- status_code (int): The HTTP status code from the response
2348
- action (str): Description of the action that was being performed
2349
- error_message (str): The error message from the API response
2350
- error_details (str): Additional error details from the API response
2351
-
2352
- Returns:
2353
- str: A formatted error message
2354
- """
2355
- if status_code == 402:
2356
- return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
2357
- elif status_code == 403:
2358
- message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
2359
- elif status_code == 408:
2360
- return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
2361
- elif status_code == 409:
2362
- return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
2363
- elif status_code == 500:
2364
- return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
2365
- else:
2366
- return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
2367
-
2368
- def deep_research(
2369
- self,
2370
- query: str,
2371
- *,
2372
- max_depth: Optional[int] = None,
2373
- time_limit: Optional[int] = None,
2374
- max_urls: Optional[int] = None,
2375
- analysis_prompt: Optional[str] = None,
2376
- system_prompt: Optional[str] = None,
2377
- __experimental_stream_steps: Optional[bool] = None,
2378
- on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
2379
- on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
2380
- """
2381
- Initiates a deep research operation on a given query and polls until completion.
2382
-
2383
- Args:
2384
- query (str): Research query or topic to investigate
2385
- max_depth (Optional[int]): Maximum depth of research exploration
2386
- time_limit (Optional[int]): Time limit in seconds for research
2387
- max_urls (Optional[int]): Maximum number of URLs to process
2388
- analysis_prompt (Optional[str]): Custom prompt for analysis
2389
- system_prompt (Optional[str]): Custom system prompt
2390
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2391
- on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
2392
- on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
2393
-
2394
- Returns:
2395
- DeepResearchStatusResponse containing:
2396
- * success (bool): Whether research completed successfully
2397
- * status (str): Current state (processing/completed/failed)
2398
- * error (Optional[str]): Error message if failed
2399
- * id (str): Unique identifier for the research job
2400
- * data (Any): Research findings and analysis
2401
- * sources (List[Dict]): List of discovered sources
2402
- * activities (List[Dict]): Research progress log
2403
- * summaries (List[str]): Generated research summaries
2404
-
2405
- Raises:
2406
- Exception: If research fails
2407
- """
2408
- research_params = {}
2409
- if max_depth is not None:
2410
- research_params['maxDepth'] = max_depth
2411
- if time_limit is not None:
2412
- research_params['timeLimit'] = time_limit
2413
- if max_urls is not None:
2414
- research_params['maxUrls'] = max_urls
2415
- if analysis_prompt is not None:
2416
- research_params['analysisPrompt'] = analysis_prompt
2417
- if system_prompt is not None:
2418
- research_params['systemPrompt'] = system_prompt
2419
- if __experimental_stream_steps is not None:
2420
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
2421
- research_params = DeepResearchParams(**research_params)
2422
-
2423
- response = self.async_deep_research(
2424
- query,
2425
- max_depth=max_depth,
2426
- time_limit=time_limit,
2427
- max_urls=max_urls,
2428
- analysis_prompt=analysis_prompt,
2429
- system_prompt=system_prompt
2430
- )
2431
- if not response.get('success') or 'id' not in response:
2432
- return response
2433
-
2434
- job_id = response['id']
2435
- last_activity_count = 0
2436
- last_source_count = 0
2437
-
2438
- while True:
2439
- status = self.check_deep_research_status(job_id)
2440
-
2441
- if on_activity and 'activities' in status:
2442
- new_activities = status['activities'][last_activity_count:]
2443
- for activity in new_activities:
2444
- on_activity(activity)
2445
- last_activity_count = len(status['activities'])
2446
-
2447
- if on_source and 'sources' in status:
2448
- new_sources = status['sources'][last_source_count:]
2449
- for source in new_sources:
2450
- on_source(source)
2451
- last_source_count = len(status['sources'])
2452
-
2453
- if status['status'] == 'completed':
2454
- return status
2455
- elif status['status'] == 'failed':
2456
- raise Exception(f'Deep research failed. Error: {status.get("error")}')
2457
- elif status['status'] != 'processing':
2458
- break
2459
-
2460
- time.sleep(2) # Polling interval
2461
-
2462
- return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
2463
-
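A sketch of the blocking research call with progress callbacks; the query is illustrative, the callback payload keys follow the docstring above, and the return value is the plain status dict produced by check_deep_research_status below:

def log_activity(activity):
    # Called once for each new entry in the job's activity log.
    print(f"[depth {activity.get('depth')}] {activity.get('type')}: {activity.get('message')}")

def log_source(source):
    print("Found source:", source.get("url"))

research = app.deep_research(
    "How do large-scale crawlers handle JavaScript-heavy pages?",
    max_depth=3,
    time_limit=120,
    max_urls=20,
    on_activity=log_activity,
    on_source=log_source,
)

if research.get("success"):
    print("Status:", research.get("status"))
    print("Sources found:", len(research.get("sources", [])))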
2464
- def async_deep_research(
2465
- self,
2466
- query: str,
2467
- *,
2468
- max_depth: Optional[int] = None,
2469
- time_limit: Optional[int] = None,
2470
- max_urls: Optional[int] = None,
2471
- analysis_prompt: Optional[str] = None,
2472
- system_prompt: Optional[str] = None,
2473
- __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
2474
- """
2475
- Initiates an asynchronous deep research operation.
2476
-
2477
- Args:
2478
- query (str): Research query or topic to investigate
2479
- max_depth (Optional[int]): Maximum depth of research exploration
2480
- time_limit (Optional[int]): Time limit in seconds for research
2481
- max_urls (Optional[int]): Maximum number of URLs to process
2482
- analysis_prompt (Optional[str]): Custom prompt for analysis
2483
- system_prompt (Optional[str]): Custom system prompt
2484
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2485
-
2486
- Returns:
2487
- Dict[str, Any]: A response containing:
2488
- * success (bool): Whether the research initiation was successful
2489
- * id (str): The unique identifier for the research job
2490
- * error (str, optional): Error message if initiation failed
2491
-
2492
- Raises:
2493
- Exception: If the research initiation fails.
2494
- """
2495
- research_params = {}
2496
- if max_depth is not None:
2497
- research_params['maxDepth'] = max_depth
2498
- if time_limit is not None:
2499
- research_params['timeLimit'] = time_limit
2500
- if max_urls is not None:
2501
- research_params['maxUrls'] = max_urls
2502
- if analysis_prompt is not None:
2503
- research_params['analysisPrompt'] = analysis_prompt
2504
- if system_prompt is not None:
2505
- research_params['systemPrompt'] = system_prompt
2506
- if __experimental_stream_steps is not None:
2507
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
2508
- research_params = DeepResearchParams(**research_params)
2509
-
2510
- headers = self._prepare_headers()
2511
-
2512
- json_data = {'query': query, **research_params.dict(exclude_none=True)}
2513
- json_data['origin'] = f"python-sdk@{version}"
2514
-
2515
- # Handle json options schema if present
2516
- if 'jsonOptions' in json_data:
2517
- json_opts = json_data['jsonOptions']
2518
- if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
2519
- json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
2520
-
2521
- try:
2522
- response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
2523
- if response.status_code == 200:
2524
- try:
2525
- return response.json()
2526
- except:
2527
- raise Exception('Failed to parse Firecrawl response as JSON.')
2528
- else:
2529
- self._handle_error(response, 'start deep research')
2530
- except Exception as e:
2531
- raise ValueError(str(e))
2532
-
2533
- return {'success': False, 'error': 'Internal server error'}
2534
-
2535
- def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
2536
- """
2537
- Check the status of a deep research operation.
2538
-
2539
- Args:
2540
- id (str): The ID of the deep research operation.
2541
-
2542
- Returns:
2543
- DeepResearchStatusResponse containing:
2544
-
2545
- Status:
2546
- * success - Whether research completed successfully
2547
- * status - Current state (processing/completed/failed)
2548
- * error - Error message if failed
2549
-
2550
- Results:
2551
- * id - Unique identifier for the research job
2552
- * data - Research findings and analysis
2553
- * sources - List of discovered sources
2554
- * activities - Research progress log
2555
- * summaries - Generated research summaries
2556
-
2557
- Raises:
2558
- Exception: If the status check fails.
2559
- """
2560
- headers = self._prepare_headers()
2561
- try:
2562
- response = self._get_request(f'{self.api_url}/v1/deep-research/{id}', headers)
2563
- if response.status_code == 200:
2564
- try:
2565
- return response.json()
2566
- except:
2567
- raise Exception('Failed to parse Firecrawl response as JSON.')
2568
- elif response.status_code == 404:
2569
- raise Exception('Deep research job not found')
2570
- else:
2571
- self._handle_error(response, 'check deep research status')
2572
- except Exception as e:
2573
- raise ValueError(str(e))
2574
-
2575
- return {'success': False, 'error': 'Internal server error'}
2576
-
2577
- def _validate_kwargs(self, kwargs: Dict[str, Any], method_name: str) -> None:
2578
- """
2579
- Validate additional keyword arguments before they are passed to the API.
2580
- This provides early validation before the Pydantic model validation.
2581
-
2582
- Args:
2583
- kwargs (Dict[str, Any]): Additional keyword arguments to validate
2584
- method_name (str): Name of the method these kwargs are for
2585
-
2586
- Raises:
2587
- ValueError: If kwargs contain invalid or unsupported parameters
2588
- """
2589
- if not kwargs:
2590
- return
2591
-
2592
- # Known parameter mappings for each method
2593
- method_params = {
2594
- "scrape_url": {"formats", "include_tags", "exclude_tags", "only_main_content", "wait_for",
2595
- "timeout", "location", "mobile", "skip_tls_verification", "remove_base64_images",
2596
- "block_ads", "proxy", "extract", "json_options", "actions", "change_tracking_options", "integration"},
2597
- "search": {"limit", "tbs", "filter", "lang", "country", "location", "timeout", "scrape_options", "integration"},
2598
- "crawl_url": {"include_paths", "exclude_paths", "max_depth", "max_discovery_depth", "limit",
2599
- "allow_backward_links", "allow_external_links", "ignore_sitemap", "scrape_options",
2600
- "webhook", "deduplicate_similar_urls", "ignore_query_parameters", "regex_on_full_url", "integration"},
2601
- "map_url": {"search", "ignore_sitemap", "include_subdomains", "sitemap_only", "limit", "timeout", "integration"},
2602
- "extract": {"prompt", "schema", "system_prompt", "allow_external_links", "enable_web_search", "show_sources", "agent", "integration"},
2603
- "batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2604
- "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2605
- "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2606
- "actions", "agent", "webhook"},
2607
- "async_batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2608
- "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2609
- "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2610
- "actions", "agent", "webhook"},
2611
- "batch_scrape_urls_and_watch": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2612
- "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2613
- "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2614
- "actions", "agent", "webhook"}
2615
- }
2616
-
2617
- # Get allowed parameters for this method
2618
- allowed_params = method_params.get(method_name, set())
2619
-
2620
- # Check for unknown parameters
2621
- unknown_params = set(kwargs.keys()) - allowed_params
2622
- if unknown_params:
2623
- raise ValueError(f"Unsupported parameter(s) for {method_name}: {', '.join(unknown_params)}. Please refer to the API documentation for the correct parameters.")
2624
-
2625
- # Additional type validation can be added here if needed
2626
- # For now, we rely on Pydantic models for detailed type validation
2627
-
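The kwargs check is purely name-based: anything outside a method's allow-list raises before any request is sent. For example, with a hypothetical typo in a keyword argument:

try:
    app.extract(urls=["https://example.com"], prompt="Extract the title.", shcema={"type": "object"})
except ValueError as exc:
    # "Unsupported parameter(s) for extract: shcema. Please refer to the API documentation ..."
    print(exc)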
2628
- def _ensure_schema_dict(self, schema):
2629
- """
2630
- Utility to ensure a schema is a dict, not a Pydantic model class. Recursively checks dicts and lists.
2631
- """
2632
- if schema is None:
2633
- return schema
2634
- if isinstance(schema, type):
2635
- # Pydantic v1/v2 model class
2636
- if hasattr(schema, 'model_json_schema'):
2637
- return schema.model_json_schema()
2638
- elif hasattr(schema, 'schema'):
2639
- return schema.schema()
2640
- if isinstance(schema, dict):
2641
- return {k: self._ensure_schema_dict(v) for k, v in schema.items()}
2642
- if isinstance(schema, (list, tuple)):
2643
- return [self._ensure_schema_dict(v) for v in schema]
2644
- return schema
2645
-
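What the schema helper does in isolation: a Pydantic model class found anywhere inside a dict or list is replaced by its JSON schema (v2 `model_json_schema()` or v1 `schema()`). Calling the private helper directly, purely for illustration, with the ProductInfo model from the extract sketch above:

converted = app._ensure_schema_dict({"schema": ProductInfo, "nested": [ProductInfo]})
print(list(converted["schema"]["properties"]))   # ['name', 'price']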
2646
- class CrawlWatcher:
2647
- """
2648
- A class to watch and handle crawl job events via WebSocket connection.
2649
-
2650
- Attributes:
2651
- id (str): The ID of the crawl job to watch
2652
- app (FirecrawlApp): The FirecrawlApp instance
2653
- data (List[Dict[str, Any]]): List of crawled documents/data
2654
- status (str): Current status of the crawl job
2655
- ws_url (str): WebSocket URL for the crawl job
2656
- event_handlers (dict): Dictionary of event type to list of handler functions
2657
- """
2658
- def __init__(self, id: str, app: FirecrawlApp):
2659
- self.id = id
2660
- self.app = app
2661
- self.data: List[Dict[str, Any]] = []
2662
- self.status = "scraping"
2663
- self.ws_url = f"{app.api_url.replace('http', 'ws')}/v1/crawl/{id}"
2664
- self.event_handlers = {
2665
- 'done': [],
2666
- 'error': [],
2667
- 'document': []
2668
- }
2669
-
2670
- async def connect(self) -> None:
2671
- """
2672
- Establishes WebSocket connection and starts listening for messages.
2673
- """
2674
- async with websockets.connect(
2675
- self.ws_url,
2676
- max_size=None,
2677
- additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
2678
- ) as websocket:
2679
- await self._listen(websocket)
2680
-
2681
- async def _listen(self, websocket) -> None:
2682
- """
2683
- Listens for incoming WebSocket messages and handles them.
2684
-
2685
- Args:
2686
- websocket: The WebSocket connection object
2687
- """
2688
- async for message in websocket:
2689
- msg = json.loads(message)
2690
- await self._handle_message(msg)
2691
-
2692
- def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
2693
- """
2694
- Adds an event handler function for a specific event type.
2695
-
2696
- Args:
2697
- event_type (str): Type of event to listen for ('done', 'error', or 'document')
2698
- handler (Callable): Function to handle the event
2699
- """
2700
- if event_type in self.event_handlers:
2701
- self.event_handlers[event_type].append(handler)
2702
-
2703
- def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
2704
- """
2705
- Dispatches an event to all registered handlers for that event type.
2706
-
2707
- Args:
2708
- event_type (str): Type of event to dispatch
2709
- detail (Dict[str, Any]): Event details/data to pass to handlers
2710
- """
2711
- if event_type in self.event_handlers:
2712
- for handler in self.event_handlers[event_type]:
2713
- handler(detail)
2714
-
2715
- async def _handle_message(self, msg: Dict[str, Any]) -> None:
2716
- """
2717
- Handles incoming WebSocket messages based on their type.
2718
-
2719
- Args:
2720
- msg (Dict[str, Any]): The message to handle
2721
- """
2722
- if msg['type'] == 'done':
2723
- self.status = 'completed'
2724
- self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
2725
- elif msg['type'] == 'error':
2726
- self.status = 'failed'
2727
- self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
2728
- elif msg['type'] == 'catchup':
2729
- self.status = msg['data']['status']
2730
- self.data.extend(msg['data'].get('data', []))
2731
- for doc in self.data:
2732
- self.dispatch_event('document', {'data': doc, 'id': self.id})
2733
- elif msg['type'] == 'document':
2734
- self.data.append(msg['data'])
2735
- self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
2736
-
2737
- class AsyncFirecrawlApp(FirecrawlApp):
2738
- """
2739
- Asynchronous version of FirecrawlApp that implements async methods using aiohttp.
2740
- Provides non-blocking alternatives to all FirecrawlApp operations.
2741
- """
2742
-
2743
- async def _async_request(
2744
- self,
2745
- method: str,
2746
- url: str,
2747
- headers: Dict[str, str],
2748
- data: Optional[Dict[str, Any]] = None,
2749
- retries: int = 3,
2750
- backoff_factor: float = 0.5) -> Dict[str, Any]:
2751
- """
2752
- Generic async request method with exponential backoff retry logic.
2753
-
2754
- Args:
2755
- method (str): The HTTP method to use (e.g., "GET" or "POST").
2756
- url (str): The URL to send the request to.
2757
- headers (Dict[str, str]): Headers to include in the request.
2758
- data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
2759
- retries (int): Maximum number of retry attempts (default: 3).
2760
- backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2761
- Delay will be backoff_factor * (2 ** retry_count).
2762
-
2763
- Returns:
2764
- Dict[str, Any]: The parsed JSON response from the server.
2765
-
2766
- Raises:
2767
- aiohttp.ClientError: If the request fails after all retries.
2768
- Exception: If max retries are exceeded or other errors occur.
2769
- """
2770
- async with aiohttp.ClientSession() as session:
2771
- for attempt in range(retries):
2772
- try:
2773
- async with session.request(
2774
- method=method, url=url, headers=headers, json=data
2775
- ) as response:
2776
- if response.status == 502:
2777
- await asyncio.sleep(backoff_factor * (2 ** attempt))
2778
- continue
2779
- if response.status >= 300:
2780
- await self._handle_error(response, f"make {method} request")
2781
- return await response.json()
2782
- except aiohttp.ClientError as e:
2783
- if attempt == retries - 1:
2784
- raise e
2785
- await asyncio.sleep(backoff_factor * (2 ** attempt))
2786
- raise Exception("Max retries exceeded")
2787
-
2788
- async def _async_post_request(
2789
- self, url: str, data: Dict[str, Any], headers: Dict[str, str],
2790
- retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2791
- """
2792
- Make an async POST request with exponential backoff retry logic.
2793
-
2794
- Args:
2795
- url (str): The URL to send the POST request to.
2796
- data (Dict[str, Any]): The JSON data to include in the request body.
2797
- headers (Dict[str, str]): Headers to include in the request.
2798
- retries (int): Maximum number of retry attempts (default: 3).
2799
- backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2800
- Delay will be backoff_factor * (2 ** retry_count).
2801
-
2802
- Returns:
2803
- Dict[str, Any]: The parsed JSON response from the server.
2804
-
2805
- Raises:
2806
- aiohttp.ClientError: If the request fails after all retries.
2807
- Exception: If max retries are exceeded or other errors occur.
2808
- """
2809
- return await self._async_request("POST", url, headers, data, retries, backoff_factor)
2810
-
2811
- async def _async_get_request(
2812
- self, url: str, headers: Dict[str, str],
2813
- retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2814
- """
2815
- Make an async GET request with exponential backoff retry logic.
2816
-
2817
- Args:
2818
- url (str): The URL to send the GET request to.
2819
- headers (Dict[str, str]): Headers to include in the request.
2820
- retries (int): Maximum number of retry attempts (default: 3).
2821
- backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2822
- Delay will be backoff_factor * (2 ** retry_count).
2823
-
2824
- Returns:
2825
- Dict[str, Any]: The parsed JSON response from the server.
2826
-
2827
- Raises:
2828
- aiohttp.ClientError: If the request fails after all retries.
2829
- Exception: If max retries are exceeded or other errors occur.
2830
- """
2831
- return await self._async_request("GET", url, headers, None, retries, backoff_factor)
2832
-
2833
- async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
2834
- """
2835
- Handle errors from async API responses with detailed error messages.
2836
-
2837
- Args:
2838
- response (aiohttp.ClientResponse): The response object from the failed request
2839
- action (str): Description of the action that was being attempted
2840
-
2841
- Raises:
2842
- aiohttp.ClientError: With a detailed error message based on the response status:
2843
- - 402: Payment Required
2844
- - 408: Request Timeout
2845
- - 409: Conflict
2846
- - 500: Internal Server Error
2847
- - Other: Unexpected error with status code
2848
- """
2849
- try:
2850
- error_data = await response.json()
2851
- error_message = error_data.get('error', 'No error message provided.')
2852
- error_details = error_data.get('details', 'No additional error details provided.')
2853
- except:
2854
- raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
2855
-
2856
- message = await self._get_async_error_message(response.status, action, error_message, error_details)
2857
-
2858
- raise aiohttp.ClientError(message)
2859
-
2860
- async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2861
- """
2862
- Generate a standardized error message based on HTTP status code for async operations.
2863
-
2864
- Args:
2865
- status_code (int): The HTTP status code from the response
2866
- action (str): Description of the action that was being performed
2867
- error_message (str): The error message from the API response
2868
- error_details (str): Additional error details from the API response
2869
-
2870
- Returns:
2871
- str: A formatted error message
2872
- """
2873
- return self._get_error_message(status_code, action, error_message, error_details)
2874
-
2875
- async def crawl_url_and_watch(
2876
- self,
2877
- url: str,
2878
- params: Optional[CrawlParams] = None,
2879
- idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2880
- """
2881
- Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket.
2882
-
2883
- Args:
2884
- url (str): Target URL to start crawling from
2885
- params (Optional[CrawlParams]): See CrawlParams model for configuration:
2886
- URL Discovery:
2887
- * includePaths - Patterns of URLs to include
2888
- * excludePaths - Patterns of URLs to exclude
2889
- * maxDepth - Maximum crawl depth
2890
- * maxDiscoveryDepth - Maximum depth for finding new URLs
2891
- * limit - Maximum pages to crawl
2892
-
2893
- Link Following:
2894
- * allowBackwardLinks - DEPRECATED: Use crawlEntireDomain instead
2895
- * crawlEntireDomain - Follow parent directory links
2896
- * allowExternalLinks - Follow external domain links
2897
- * ignoreSitemap - Skip sitemap.xml processing
2898
-
2899
- Advanced:
2900
- * scrapeOptions - Page scraping configuration
2901
- * webhook - Notification webhook settings
2902
- * deduplicateSimilarURLs - Remove similar URLs
2903
- * ignoreQueryParameters - Ignore URL parameters
2904
- * regexOnFullURL - Apply regex to full URLs
2905
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2906
-
2907
- Returns:
2908
- AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket
2909
-
2910
- Raises:
2911
- Exception: If crawl job fails to start
2912
- """
2913
- crawl_response = await self.async_crawl_url(url, params, idempotency_key)
2914
- if crawl_response.get('success') and 'id' in crawl_response:
2915
- return AsyncCrawlWatcher(crawl_response['id'], self)
2916
- else:
2917
- raise Exception("Crawl job failed to start")
2918
-
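A minimal usage sketch for the watcher returned above. It assumes AsyncFirecrawlApp is exported from the package root and that its constructor accepts an api_key, as the synchronous client does; only the event_handlers mapping and the 'document'/'done' payload shapes are taken from the code above, and starting the WebSocket loop itself is watcher-specific and not shown in this file.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed top-level export

def on_document(detail):
    # dispatch_event delivers {'data': <document>, 'id': <job id>} for each page
    print("document received for job", detail["id"])

def on_done(detail):
    print("crawl finished:", detail["status"], "with", len(detail["data"]), "documents")

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")                   # placeholder key
    watcher = await app.crawl_url_and_watch("https://example.com")   # placeholder URL
    watcher.event_handlers.setdefault("document", []).append(on_document)
    watcher.event_handlers.setdefault("done", []).append(on_done)
    # The coroutine that opens the WebSocket and feeds _handle_message is not shown in this file.

asyncio.run(main())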
2919
- async def batch_scrape_urls_and_watch(
2920
- self,
2921
- urls: List[str],
2922
- params: Optional[ScrapeParams] = None,
2923
- idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2924
- """
2925
- Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress.
2926
-
2927
- Args:
2928
- urls (List[str]): List of URLs to scrape
2929
- params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
2930
-
2931
- Content Options:
2932
- * formats - Content formats to retrieve
2933
- * includeTags - HTML tags to include
2934
- * excludeTags - HTML tags to exclude
2935
- * onlyMainContent - Extract main content only
2936
-
2937
- Request Options:
2938
- * headers - Custom HTTP headers
2939
- * timeout - Request timeout (ms)
2940
- * mobile - Use mobile user agent
2941
- * proxy - Proxy type
2942
-
2943
- Extraction Options:
2944
- * extract - Content extraction config
2945
- * jsonOptions - JSON extraction config
2946
- * actions - Actions to perform
2947
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2948
-
2949
- Returns:
2950
- AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket
2951
-
2952
- Raises:
2953
- Exception: If batch scrape job fails to start
2954
- """
2955
- batch_response = await self.async_batch_scrape_urls(urls, params, idempotency_key)
2956
- if batch_response.get('success') and 'id' in batch_response:
2957
- return AsyncCrawlWatcher(batch_response['id'], self)
2958
- else:
2959
- raise Exception("Batch scrape job failed to start")
2960
-
2961
- async def scrape_url(
2962
- self,
2963
- url: str,
2964
- *,
2965
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
2966
- include_tags: Optional[List[str]] = None,
2967
- exclude_tags: Optional[List[str]] = None,
2968
- only_main_content: Optional[bool] = None,
2969
- wait_for: Optional[int] = None,
2970
- timeout: Optional[int] = None,
2971
- location: Optional[LocationConfig] = None,
2972
- mobile: Optional[bool] = None,
2973
- skip_tls_verification: Optional[bool] = None,
2974
- remove_base64_images: Optional[bool] = None,
2975
- block_ads: Optional[bool] = None,
2976
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
2977
- parse_pdf: Optional[bool] = None,
2978
- extract: Optional[JsonConfig] = None,
2979
- json_options: Optional[JsonConfig] = None,
2980
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
2981
- **kwargs) -> ScrapeResponse[Any]:
2982
- """
2983
- Scrape a single URL asynchronously.
2984
-
2985
- Args:
2986
- url (str): Target URL to scrape
2987
- formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]]): Content types to retrieve (markdown/html/etc)
2988
- include_tags (Optional[List[str]]): HTML tags to include
2989
- exclude_tags (Optional[List[str]]): HTML tags to exclude
2990
- only_main_content (Optional[bool]): Extract main content only
2991
- wait_for (Optional[int]): Wait time in milliseconds before scraping
2992
- timeout (Optional[int]): Request timeout (ms)
2993
- location (Optional[LocationConfig]): Location configuration
2994
- mobile (Optional[bool]): Use mobile user agent
2995
- skip_tls_verification (Optional[bool]): Skip TLS verification
2996
- remove_base64_images (Optional[bool]): Remove base64 images
2997
- block_ads (Optional[bool]): Block ads
2998
- proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
2999
- extract (Optional[JsonConfig]): Content extraction settings
3000
- json_options (Optional[JsonConfig]): JSON extraction settings
3001
- actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
3002
- **kwargs: Additional parameters to pass to the API
3003
-
3004
- Returns:
3005
- ScrapeResponse with:
3006
- * success - Whether scrape was successful
3007
- * markdown - Markdown content if requested
3008
- * html - HTML content if requested
3009
- * rawHtml - Raw HTML content if requested
3010
- * links - Extracted links if requested
3011
- * screenshot - Screenshot if requested
3012
- * extract - Extracted data if requested
3013
- * json - JSON data if requested
3014
- * error - Error message if scrape failed
3015
-
3016
- Raises:
3017
- Exception: If scraping fails
3018
- """
3019
- # Validate any additional kwargs
3020
- self._validate_kwargs(kwargs, "scrape_url")
3021
-
3022
- headers = self._prepare_headers()
3023
-
3024
- # Build scrape parameters
3025
- scrape_params = {
3026
- 'url': url,
3027
- 'origin': f"python-sdk@{version}"
3028
- }
3029
-
3030
- # Add optional parameters if provided and not None
3031
- if formats:
3032
- scrape_params['formats'] = formats
3033
- if include_tags:
3034
- scrape_params['includeTags'] = include_tags
3035
- if exclude_tags:
3036
- scrape_params['excludeTags'] = exclude_tags
3037
- if only_main_content is not None:
3038
- scrape_params['onlyMainContent'] = only_main_content
3039
- if wait_for:
3040
- scrape_params['waitFor'] = wait_for
3041
- if timeout:
3042
- scrape_params['timeout'] = timeout
3043
- if location:
3044
- scrape_params['location'] = location.dict(exclude_none=True)
3045
- if mobile is not None:
3046
- scrape_params['mobile'] = mobile
3047
- if skip_tls_verification is not None:
3048
- scrape_params['skipTlsVerification'] = skip_tls_verification
3049
- if remove_base64_images is not None:
3050
- scrape_params['removeBase64Images'] = remove_base64_images
3051
- if block_ads is not None:
3052
- scrape_params['blockAds'] = block_ads
3053
- if proxy:
3054
- scrape_params['proxy'] = proxy
3055
- if parse_pdf is not None:
3056
- scrape_params['parsePDF'] = parse_pdf
3057
- if extract is not None:
3058
- extract = self._ensure_schema_dict(extract)
3059
- if isinstance(extract, dict) and "schema" in extract:
3060
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
3061
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3062
- if json_options is not None:
3063
- json_options = self._ensure_schema_dict(json_options)
3064
- if isinstance(json_options, dict) and "schema" in json_options:
3065
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3066
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3067
- if actions:
3068
- scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
3069
-
3070
- if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
3071
- scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
3072
- if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
3073
- scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
3074
-
3075
- # Make async request
3076
- endpoint = f'/v1/scrape'
3077
- response = await self._async_post_request(
3078
- f'{self.api_url}{endpoint}',
3079
- scrape_params,
3080
- headers
3081
- )
3082
-
3083
- if response.get('success') and 'data' in response:
3084
- return ScrapeResponse(**response['data'])
3085
- elif "error" in response:
3086
- raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
3087
- else:
3088
- # Use the response content directly if possible, otherwise a generic message
3089
- error_content = response.get('error', str(response))
3090
- raise Exception(f'Failed to scrape URL. Error: {error_content}')
3091
-
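A short usage sketch for the async scraper, assuming AsyncFirecrawlApp is importable from the package root and takes an api_key like the synchronous client; the key and URL are placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed top-level export

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    result = await app.scrape_url(
        "https://example.com",          # placeholder URL
        formats=["markdown", "links"],
        only_main_content=True,
        timeout=30000,                  # milliseconds
    )
    print((result.markdown or "")[:200])

asyncio.run(main())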
3092
- async def batch_scrape_urls(
3093
- self,
3094
- urls: List[str],
3095
- *,
3096
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3097
- headers: Optional[Dict[str, str]] = None,
3098
- include_tags: Optional[List[str]] = None,
3099
- exclude_tags: Optional[List[str]] = None,
3100
- only_main_content: Optional[bool] = None,
3101
- wait_for: Optional[int] = None,
3102
- timeout: Optional[int] = None,
3103
- location: Optional[LocationConfig] = None,
3104
- mobile: Optional[bool] = None,
3105
- skip_tls_verification: Optional[bool] = None,
3106
- remove_base64_images: Optional[bool] = None,
3107
- block_ads: Optional[bool] = None,
3108
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
3109
- extract: Optional[JsonConfig] = None,
3110
- json_options: Optional[JsonConfig] = None,
3111
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
3112
- agent: Optional[AgentOptions] = None,
3113
- poll_interval: Optional[int] = 2,
3114
- idempotency_key: Optional[str] = None,
3115
- **kwargs
3116
- ) -> BatchScrapeStatusResponse:
3117
- """
3118
- Asynchronously scrape multiple URLs and monitor until completion.
3119
-
3120
- Args:
3121
- urls (List[str]): URLs to scrape
3122
- formats (Optional[List[Literal]]): Content formats to retrieve
3123
- headers (Optional[Dict[str, str]]): Custom HTTP headers
3124
- include_tags (Optional[List[str]]): HTML tags to include
3125
- exclude_tags (Optional[List[str]]): HTML tags to exclude
3126
- only_main_content (Optional[bool]): Extract main content only
3127
- wait_for (Optional[int]): Wait time in milliseconds
3128
- timeout (Optional[int]): Request timeout in milliseconds
3129
- location (Optional[LocationConfig]): Location configuration
3130
- mobile (Optional[bool]): Use mobile user agent
3131
- skip_tls_verification (Optional[bool]): Skip TLS verification
3132
- remove_base64_images (Optional[bool]): Remove base64 encoded images
3133
- block_ads (Optional[bool]): Block advertisements
3134
- proxy (Optional[Literal]): Proxy type to use
3135
- extract (Optional[JsonConfig]): Content extraction config
3136
- json_options (Optional[JsonConfig]): JSON extraction config
3137
- actions (Optional[List[Union]]): Actions to perform
3138
- agent (Optional[AgentOptions]): Agent configuration
3139
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
3140
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3141
- **kwargs: Additional parameters to pass to the API
3142
-
3143
- Returns:
3144
- BatchScrapeStatusResponse with:
3145
- * Scraping status and progress
3146
- * Scraped content for each URL
3147
- * Success/error information
3148
-
3149
- Raises:
3150
- Exception: If batch scrape fails
3151
- """
3152
- # Validate any additional kwargs
3153
- self._validate_kwargs(kwargs, "batch_scrape_urls")
3154
-
3155
- scrape_params = {}
3156
-
3157
- # Add individual parameters
3158
- if formats is not None:
3159
- scrape_params['formats'] = formats
3160
- if headers is not None:
3161
- scrape_params['headers'] = headers
3162
- if include_tags is not None:
3163
- scrape_params['includeTags'] = include_tags
3164
- if exclude_tags is not None:
3165
- scrape_params['excludeTags'] = exclude_tags
3166
- if only_main_content is not None:
3167
- scrape_params['onlyMainContent'] = only_main_content
3168
- if wait_for is not None:
3169
- scrape_params['waitFor'] = wait_for
3170
- if timeout is not None:
3171
- scrape_params['timeout'] = timeout
3172
- if location is not None:
3173
- scrape_params['location'] = location.dict(exclude_none=True)
3174
- if mobile is not None:
3175
- scrape_params['mobile'] = mobile
3176
- if skip_tls_verification is not None:
3177
- scrape_params['skipTlsVerification'] = skip_tls_verification
3178
- if remove_base64_images is not None:
3179
- scrape_params['removeBase64Images'] = remove_base64_images
3180
- if block_ads is not None:
3181
- scrape_params['blockAds'] = block_ads
3182
- if proxy is not None:
3183
- scrape_params['proxy'] = proxy
3184
- if extract is not None:
3185
- extract = self._ensure_schema_dict(extract)
3186
- if isinstance(extract, dict) and "schema" in extract:
3187
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
3188
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3189
- if json_options is not None:
3190
- json_options = self._ensure_schema_dict(json_options)
3191
- if isinstance(json_options, dict) and "schema" in json_options:
3192
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3193
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3194
- if actions is not None:
3195
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3196
- if agent is not None:
3197
- scrape_params['agent'] = agent.dict(exclude_none=True)
3198
-
3199
- # Add any additional kwargs
3200
- scrape_params.update(kwargs)
3201
-
3202
- # Create final params object
3203
- final_params = ScrapeParams(**scrape_params)
3204
- params_dict = final_params.dict(exclude_none=True)
3205
- params_dict['urls'] = urls
3206
- params_dict['origin'] = f"python-sdk@{version}"
3207
-
3208
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3209
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3210
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3211
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3212
-
3213
- # Make request
3214
- headers = self._prepare_headers(idempotency_key)
3215
- response = await self._async_post_request(
3216
- f'{self.api_url}/v1/batch/scrape',
3217
- params_dict,
3218
- headers
3219
- )
3220
-
3221
- if response.get('success'):
3222
- try:
3223
- id = response.get('id')
3224
- except:
3225
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
3226
- return await self._async_monitor_job_status(id, headers, poll_interval)
3227
- else:
3228
- self._handle_error(response, 'start batch scrape job')
3229
-
3230
-
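A sketch of the blocking batch variant, which polls internally until the job settles; the same import and constructor assumptions as above apply, and the URLs are placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed top-level export

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    status = await app.batch_scrape_urls(
        ["https://example.com", "https://example.org"],  # placeholder URLs
        formats=["markdown"],
        poll_interval=5,  # seconds between status checks
    )
    print(status.status, "-", status.completed, "of", status.total, "pages scraped")

asyncio.run(main())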
3231
- async def async_batch_scrape_urls(
3232
- self,
3233
- urls: List[str],
3234
- *,
3235
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3236
- headers: Optional[Dict[str, str]] = None,
3237
- include_tags: Optional[List[str]] = None,
3238
- exclude_tags: Optional[List[str]] = None,
3239
- only_main_content: Optional[bool] = None,
3240
- wait_for: Optional[int] = None,
3241
- timeout: Optional[int] = None,
3242
- location: Optional[LocationConfig] = None,
3243
- mobile: Optional[bool] = None,
3244
- skip_tls_verification: Optional[bool] = None,
3245
- remove_base64_images: Optional[bool] = None,
3246
- block_ads: Optional[bool] = None,
3247
- proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
3248
- extract: Optional[JsonConfig] = None,
3249
- json_options: Optional[JsonConfig] = None,
3250
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
3251
- agent: Optional[AgentOptions] = None,
3252
- zero_data_retention: Optional[bool] = None,
3253
- idempotency_key: Optional[str] = None,
3254
- **kwargs
3255
- ) -> BatchScrapeResponse:
3256
- """
3257
- Initiate a batch scrape job asynchronously.
3258
-
3259
- Args:
3260
- urls (List[str]): URLs to scrape
3261
- formats (Optional[List[Literal]]): Content formats to retrieve
3262
- headers (Optional[Dict[str, str]]): Custom HTTP headers
3263
- include_tags (Optional[List[str]]): HTML tags to include
3264
- exclude_tags (Optional[List[str]]): HTML tags to exclude
3265
- only_main_content (Optional[bool]): Extract main content only
3266
- wait_for (Optional[int]): Wait time in milliseconds
3267
- timeout (Optional[int]): Request timeout in milliseconds
3268
- location (Optional[LocationConfig]): Location configuration
3269
- mobile (Optional[bool]): Use mobile user agent
3270
- skip_tls_verification (Optional[bool]): Skip TLS verification
3271
- remove_base64_images (Optional[bool]): Remove base64 encoded images
3272
- block_ads (Optional[bool]): Block advertisements
3273
- proxy (Optional[Literal]): Proxy type to use
3274
- extract (Optional[JsonConfig]): Content extraction config
3275
- json_options (Optional[JsonConfig]): JSON extraction config
3276
- actions (Optional[List[Union]]): Actions to perform
3277
- agent (Optional[AgentOptions]): Agent configuration
3278
- zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
3279
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3280
- **kwargs: Additional parameters to pass to the API
3281
-
3282
- Returns:
3283
- BatchScrapeResponse with:
3284
- * success - Whether job started successfully
3285
- * id - Unique identifier for the job
3286
- * url - Status check URL
3287
- * error - Error message if start failed
3288
-
3289
- Raises:
3290
- Exception: If job initiation fails
3291
- """
3292
- # Validate any additional kwargs
3293
- self._validate_kwargs(kwargs, "async_batch_scrape_urls")
3294
-
3295
- scrape_params = {}
3296
-
3297
- # Add individual parameters
3298
- if formats is not None:
3299
- scrape_params['formats'] = formats
3300
- if headers is not None:
3301
- scrape_params['headers'] = headers
3302
- if include_tags is not None:
3303
- scrape_params['includeTags'] = include_tags
3304
- if exclude_tags is not None:
3305
- scrape_params['excludeTags'] = exclude_tags
3306
- if only_main_content is not None:
3307
- scrape_params['onlyMainContent'] = only_main_content
3308
- if wait_for is not None:
3309
- scrape_params['waitFor'] = wait_for
3310
- if timeout is not None:
3311
- scrape_params['timeout'] = timeout
3312
- if location is not None:
3313
- scrape_params['location'] = location.dict(exclude_none=True)
3314
- if mobile is not None:
3315
- scrape_params['mobile'] = mobile
3316
- if skip_tls_verification is not None:
3317
- scrape_params['skipTlsVerification'] = skip_tls_verification
3318
- if remove_base64_images is not None:
3319
- scrape_params['removeBase64Images'] = remove_base64_images
3320
- if block_ads is not None:
3321
- scrape_params['blockAds'] = block_ads
3322
- if proxy is not None:
3323
- scrape_params['proxy'] = proxy
3324
- if extract is not None:
3325
- extract = self._ensure_schema_dict(extract)
3326
- if isinstance(extract, dict) and "schema" in extract:
3327
- extract["schema"] = self._ensure_schema_dict(extract["schema"])
3328
- scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
3329
- if json_options is not None:
3330
- json_options = self._ensure_schema_dict(json_options)
3331
- if isinstance(json_options, dict) and "schema" in json_options:
3332
- json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
3333
- scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
3334
- if actions is not None:
3335
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3336
- if agent is not None:
3337
- scrape_params['agent'] = agent.dict(exclude_none=True)
3338
- if zero_data_retention is not None:
3339
- scrape_params['zeroDataRetention'] = zero_data_retention
3340
-
3341
- # Add any additional kwargs
3342
- scrape_params.update(kwargs)
3343
-
3344
- # Create final params object
3345
- final_params = ScrapeParams(**scrape_params)
3346
- params_dict = final_params.dict(exclude_none=True)
3347
- params_dict['urls'] = urls
3348
- params_dict['origin'] = f"python-sdk@{version}"
3349
-
3350
- if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
3351
- params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
3352
- if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
3353
- params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
3354
-
3355
- # Make request
3356
- headers = self._prepare_headers(idempotency_key)
3357
- response = await self._async_post_request(
3358
- f'{self.api_url}/v1/batch/scrape',
3359
- params_dict,
3360
- headers
3361
- )
3362
-
3363
- if response.get('success'):
3364
- try:
3365
- return BatchScrapeResponse(**response)
3366
- except:
3367
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
3368
- else:
3369
- self._handle_error(response, 'start batch scrape job')
3370
-
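For the non-blocking variant, the call only enqueues the job and returns its id; checking progress is left to the caller. A sketch under the same import/constructor assumptions, with placeholder key and URLs:

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed top-level export

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    job = await app.async_batch_scrape_urls(
        ["https://example.com", "https://example.org"],  # placeholder URLs
        formats=["markdown"],
    )
    print("batch job started:", job.id)
    # Poll on your own schedule instead of blocking:
    status = await app.check_batch_scrape_status(job.id)
    print("current status:", status.status)

asyncio.run(main())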
3371
- async def crawl_url(
3372
- self,
3373
- url: str,
3374
- *,
3375
- include_paths: Optional[List[str]] = None,
3376
- exclude_paths: Optional[List[str]] = None,
3377
- max_depth: Optional[int] = None,
3378
- max_discovery_depth: Optional[int] = None,
3379
- limit: Optional[int] = None,
3380
- allow_backward_links: Optional[bool] = None,
3381
- crawl_entire_domain: Optional[bool] = None,
3382
- allow_external_links: Optional[bool] = None,
3383
- ignore_sitemap: Optional[bool] = None,
3384
- scrape_options: Optional[ScrapeOptions] = None,
3385
- webhook: Optional[Union[str, WebhookConfig]] = None,
3386
- deduplicate_similar_urls: Optional[bool] = None,
3387
- ignore_query_parameters: Optional[bool] = None,
3388
- regex_on_full_url: Optional[bool] = None,
3389
- delay: Optional[int] = None,
3390
- allow_subdomains: Optional[bool] = None,
3391
- poll_interval: Optional[int] = 2,
3392
- idempotency_key: Optional[str] = None,
3393
- **kwargs
3394
- ) -> CrawlStatusResponse:
3395
- """
3396
- Crawl a website starting from a URL.
3397
-
3398
- Args:
3399
- url (str): Target URL to start crawling from
3400
- include_paths (Optional[List[str]]): Patterns of URLs to include
3401
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3402
- max_depth (Optional[int]): Maximum crawl depth
3403
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3404
- limit (Optional[int]): Maximum pages to crawl
3405
- allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
3406
- crawl_entire_domain (Optional[bool]): Follow parent directory links
3407
- allow_external_links (Optional[bool]): Follow external domain links
3408
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3409
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3410
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3411
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3412
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
3413
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
3414
- delay (Optional[int]): Delay in seconds between scrapes
3415
- allow_subdomains (Optional[bool]): Follow subdomains
3416
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
3417
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3418
- **kwargs: Additional parameters to pass to the API
3419
-
3420
- Returns:
3421
- CrawlStatusResponse with:
3422
- * Crawling status and progress
3423
- * Crawled page contents
3424
- * Success/error information
3425
-
3426
- Raises:
3427
- Exception: If crawl fails
3428
- """
3429
- # Validate any additional kwargs
3430
- self._validate_kwargs(kwargs, "crawl_url")
3431
-
3432
- crawl_params = {}
3433
-
3434
- # Add individual parameters
3435
- if include_paths is not None:
3436
- crawl_params['includePaths'] = include_paths
3437
- if exclude_paths is not None:
3438
- crawl_params['excludePaths'] = exclude_paths
3439
- if max_depth is not None:
3440
- crawl_params['maxDepth'] = max_depth
3441
- if max_discovery_depth is not None:
3442
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3443
- if limit is not None:
3444
- crawl_params['limit'] = limit
3445
- if crawl_entire_domain is not None:
3446
- crawl_params['crawlEntireDomain'] = crawl_entire_domain
3447
- elif allow_backward_links is not None:
3448
- crawl_params['allowBackwardLinks'] = allow_backward_links
3449
- if allow_external_links is not None:
3450
- crawl_params['allowExternalLinks'] = allow_external_links
3451
- if ignore_sitemap is not None:
3452
- crawl_params['ignoreSitemap'] = ignore_sitemap
3453
- if scrape_options is not None:
3454
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3455
- if webhook is not None:
3456
- crawl_params['webhook'] = webhook
3457
- if deduplicate_similar_urls is not None:
3458
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3459
- if ignore_query_parameters is not None:
3460
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3461
- if regex_on_full_url is not None:
3462
- crawl_params['regexOnFullURL'] = regex_on_full_url
3463
- if delay is not None:
3464
- crawl_params['delay'] = delay
3465
- if allow_subdomains is not None:
3466
- crawl_params['allowSubdomains'] = allow_subdomains
3467
-
3468
- # Add any additional kwargs
3469
- crawl_params.update(kwargs)
3470
-
3471
- # Create final params object
3472
- final_params = CrawlParams(**crawl_params)
3473
- params_dict = final_params.dict(exclude_none=True)
3474
- params_dict['url'] = url
3475
- params_dict['origin'] = f"python-sdk@{version}"
3476
- # Make request
3477
- headers = self._prepare_headers(idempotency_key)
3478
- response = await self._async_post_request(
3479
- f'{self.api_url}/v1/crawl', params_dict, headers)
3480
-
3481
- if response.get('success'):
3482
- try:
3483
- id = response.get('id')
3484
- except:
3485
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
3486
- return await self._async_monitor_job_status(id, headers, poll_interval)
3487
- else:
3488
- self._handle_error(response, 'start crawl job')
3489
-
3490
-
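A sketch of the blocking crawl, which starts the job and polls until it finishes; import and constructor assumptions as above, and the URL, path pattern, and limits are placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed top-level export

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    result = await app.crawl_url(
        "https://docs.example.com",   # placeholder URL
        include_paths=["/docs/.*"],
        limit=25,
        poll_interval=5,
    )
    print(result.status, "-", len(result.data or []), "pages crawled")

asyncio.run(main())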
3491
- async def async_crawl_url(
3492
- self,
3493
- url: str,
3494
- *,
3495
- include_paths: Optional[List[str]] = None,
3496
- exclude_paths: Optional[List[str]] = None,
3497
- max_depth: Optional[int] = None,
3498
- max_discovery_depth: Optional[int] = None,
3499
- limit: Optional[int] = None,
3500
- allow_backward_links: Optional[bool] = None,
3501
- crawl_entire_domain: Optional[bool] = None,
3502
- allow_external_links: Optional[bool] = None,
3503
- ignore_sitemap: Optional[bool] = None,
3504
- scrape_options: Optional[ScrapeOptions] = None,
3505
- webhook: Optional[Union[str, WebhookConfig]] = None,
3506
- deduplicate_similar_urls: Optional[bool] = None,
3507
- ignore_query_parameters: Optional[bool] = None,
3508
- regex_on_full_url: Optional[bool] = None,
3509
- delay: Optional[int] = None,
3510
- allow_subdomains: Optional[bool] = None,
3511
- poll_interval: Optional[int] = 2,
3512
- idempotency_key: Optional[str] = None,
3513
- **kwargs
3514
- ) -> CrawlResponse:
3515
- """
3516
- Start an asynchronous crawl job.
3517
-
3518
- Args:
3519
- url (str): Target URL to start crawling from
3520
- include_paths (Optional[List[str]]): Patterns of URLs to include
3521
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3522
- max_depth (Optional[int]): Maximum crawl depth
3523
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3524
- limit (Optional[int]): Maximum pages to crawl
3525
- allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
3526
- crawl_entire_domain (Optional[bool]): Follow parent directory links
3527
- allow_external_links (Optional[bool]): Follow external domain links
3528
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3529
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3530
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3531
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3532
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
3533
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
3534
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3535
- **kwargs: Additional parameters to pass to the API
3536
-
3537
- Returns:
3538
- CrawlResponse with:
3539
- * success - Whether crawl started successfully
3540
- * id - Unique identifier for the crawl job
3541
- * url - Status check URL for the crawl
3542
- * error - Error message if start failed
3543
-
3544
- Raises:
3545
- Exception: If crawl initiation fails
3546
- """
3547
- crawl_params = {}
3548
-
3549
- # Add individual parameters
3550
- if include_paths is not None:
3551
- crawl_params['includePaths'] = include_paths
3552
- if exclude_paths is not None:
3553
- crawl_params['excludePaths'] = exclude_paths
3554
- if max_depth is not None:
3555
- crawl_params['maxDepth'] = max_depth
3556
- if max_discovery_depth is not None:
3557
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3558
- if limit is not None:
3559
- crawl_params['limit'] = limit
3560
- if crawl_entire_domain is not None:
3561
- crawl_params['crawlEntireDomain'] = crawl_entire_domain
3562
- elif allow_backward_links is not None:
3563
- crawl_params['allowBackwardLinks'] = allow_backward_links
3564
- if allow_external_links is not None:
3565
- crawl_params['allowExternalLinks'] = allow_external_links
3566
- if ignore_sitemap is not None:
3567
- crawl_params['ignoreSitemap'] = ignore_sitemap
3568
- if scrape_options is not None:
3569
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3570
- if webhook is not None:
3571
- crawl_params['webhook'] = webhook
3572
- if deduplicate_similar_urls is not None:
3573
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3574
- if ignore_query_parameters is not None:
3575
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3576
- if regex_on_full_url is not None:
3577
- crawl_params['regexOnFullURL'] = regex_on_full_url
3578
- if delay is not None:
3579
- crawl_params['delay'] = delay
3580
- if allow_subdomains is not None:
3581
- crawl_params['allowSubdomains'] = allow_subdomains
3582
-
3583
- # Add any additional kwargs
3584
- crawl_params.update(kwargs)
3585
-
3586
- # Create final params object
3587
- final_params = CrawlParams(**crawl_params)
3588
- params_dict = final_params.dict(exclude_none=True)
3589
- params_dict['url'] = url
3590
- params_dict['origin'] = f"python-sdk@{version}"
3591
-
3592
- # Make request
3593
- headers = self._prepare_headers(idempotency_key)
3594
- response = await self._async_post_request(
3595
- f'{self.api_url}/v1/crawl',
3596
- params_dict,
3597
- headers
3598
- )
3599
-
3600
- if response.get('success'):
3601
- try:
3602
- return CrawlResponse(**response)
3603
- except:
3604
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
3605
- else:
3606
- self._handle_error(response, 'start crawl job')
3607
-
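The non-blocking counterpart just submits the job; a sketch under the same assumptions, with placeholder key, URL, and patterns:

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed top-level export

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    job = await app.async_crawl_url(
        "https://example.com",        # placeholder URL
        limit=100,
        exclude_paths=["/blog/.*"],
    )
    print("crawl started:", job.id, "status URL:", job.url)

asyncio.run(main())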
3608
- async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
3609
- """
3610
- Check the status and results of an asynchronous crawl job.
3611
-
3612
- Args:
3613
- id (str): Unique identifier for the crawl job
3614
-
3615
- Returns:
3616
- CrawlStatusResponse containing:
3617
- Status Information:
3618
- * status - Current state (scraping/completed/failed/cancelled)
3619
- * completed - Number of pages crawled
3620
- * total - Total pages to crawl
3621
- * creditsUsed - API credits consumed
3622
- * expiresAt - Data expiration timestamp
3623
-
3624
- Results:
3625
- * data - List of crawled documents
3626
- * next - URL for next page of results (if paginated)
3627
- * success - Whether status check succeeded
3628
- * error - Error message if failed
3629
-
3630
- Raises:
3631
- Exception: If status check fails
3632
- """
3633
- headers = self._prepare_headers()
3634
- endpoint = f'/v1/crawl/{id}'
3635
-
3636
- status_data = await self._async_get_request(
3637
- f'{self.api_url}{endpoint}',
3638
- headers
3639
- )
3640
-
3641
- if status_data.get('status') == 'completed':
3642
- if 'data' in status_data:
3643
- data = status_data['data']
3644
- while 'next' in status_data:
3645
- if len(status_data['data']) == 0:
3646
- break
3647
- next_url = status_data.get('next')
3648
- if not next_url:
3649
- logger.warning("Expected 'next' URL is missing.")
3650
- break
3651
- next_data = await self._async_get_request(next_url, headers)
3652
- data.extend(next_data.get('data', []))
3653
- status_data = next_data
3654
- status_data['data'] = data
3655
- # Create CrawlStatusResponse object from status data
3656
- response = CrawlStatusResponse(
3657
- status=status_data.get('status'),
3658
- total=status_data.get('total'),
3659
- completed=status_data.get('completed'),
3660
- creditsUsed=status_data.get('creditsUsed'),
3661
- expiresAt=status_data.get('expiresAt'),
3662
- data=status_data.get('data'),
3663
- success=False if 'error' in status_data else True
3664
- )
3665
-
3666
- if 'error' in status_data:
3667
- response.error = status_data.get('error')
3668
-
3669
- if 'next' in status_data:
3670
- response.next = status_data.get('next')
3671
-
3672
- return response
3673
-
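When pairing async_crawl_url with check_crawl_status, a caller typically polls until the job leaves the in-progress states listed in _async_monitor_job_status below. A sketch of such a loop; the state names are taken from that method, everything else is illustrative.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed top-level export

IN_PROGRESS = {"scraping", "active", "paused", "pending", "queued", "waiting"}

async def wait_for_crawl(app: AsyncFirecrawlApp, job_id: str, poll_interval: int = 5):
    while True:
        status = await app.check_crawl_status(job_id)
        if status.status == "completed":
            return status
        if status.status not in IN_PROGRESS:
            raise RuntimeError(f"crawl ended with status {status.status}: {status.error}")
        await asyncio.sleep(poll_interval)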
3674
- async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
3675
- """
3676
- Monitor the status of an asynchronous job until completion.
3677
-
3678
- Args:
3679
- id (str): The ID of the job to monitor
3680
- headers (Dict[str, str]): Headers to include in status check requests
3681
- poll_interval (int): Seconds between status checks (default: 2)
3682
-
3683
- Returns:
3684
- CrawlStatusResponse: The job results if completed successfully
3685
-
3686
- Raises:
3687
- Exception: If the job fails or an error occurs during status checks
3688
- """
3689
- while True:
3690
- status_data = await self._async_get_request(
3691
- f'{self.api_url}/v1/crawl/{id}',
3692
- headers
3693
- )
3694
-
3695
- if status_data.get('status') == 'completed':
3696
- if 'data' in status_data:
3697
- data = status_data['data']
3698
- while 'next' in status_data:
3699
- if len(status_data['data']) == 0:
3700
- break
3701
- next_url = status_data.get('next')
3702
- if not next_url:
3703
- logger.warning("Expected 'next' URL is missing.")
3704
- break
3705
- next_data = await self._async_get_request(next_url, headers)
3706
- data.extend(next_data.get('data', []))
3707
- status_data = next_data
3708
- status_data['data'] = data
3709
- return CrawlStatusResponse(**status_data)
3710
- else:
3711
- raise Exception('Job completed but no data was returned')
3712
- elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
3713
- await asyncio.sleep(max(poll_interval, 2))
3714
- else:
3715
- raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
3716
-
3717
- async def map_url(
3718
- self,
3719
- url: str,
3720
- *,
3721
- search: Optional[str] = None,
3722
- ignore_sitemap: Optional[bool] = None,
3723
- include_subdomains: Optional[bool] = None,
3724
- sitemap_only: Optional[bool] = None,
3725
- limit: Optional[int] = None,
3726
- timeout: Optional[int] = None,
3727
- params: Optional[MapParams] = None) -> MapResponse:
3728
- """
3729
- Asynchronously map and discover links from a URL.
3730
-
3731
- Args:
3732
- url (str): Target URL to map
3733
- params (Optional[MapParams]): See MapParams model:
3734
- Discovery Options:
3735
- * search - Filter pattern for URLs
3736
- * ignoreSitemap - Skip sitemap.xml
3737
- * includeSubdomains - Include subdomain links
3738
- * sitemapOnly - Only use sitemap.xml
3739
-
3740
- Limits:
3741
- * limit - Max URLs to return
3742
- * timeout - Request timeout (ms)
3743
-
3744
- Returns:
3745
- MapResponse with:
3746
- * Discovered URLs
3747
- * Success/error status
3748
-
3749
- Raises:
3750
- Exception: If mapping fails
3751
- """
3752
- map_params = {}
3753
- if params:
3754
- map_params.update(params.dict(exclude_none=True))
3755
-
3756
- # Add individual parameters
3757
- if search is not None:
3758
- map_params['search'] = search
3759
- if ignore_sitemap is not None:
3760
- map_params['ignoreSitemap'] = ignore_sitemap
3761
- if include_subdomains is not None:
3762
- map_params['includeSubdomains'] = include_subdomains
3763
- if sitemap_only is not None:
3764
- map_params['sitemapOnly'] = sitemap_only
3765
- if limit is not None:
3766
- map_params['limit'] = limit
3767
- if timeout is not None:
3768
- map_params['timeout'] = timeout
3769
-
3770
- # Create final params object
3771
- final_params = MapParams(**map_params)
3772
- params_dict = final_params.dict(exclude_none=True)
3773
- params_dict['url'] = url
3774
- params_dict['origin'] = f"python-sdk@{version}"
3775
-
3776
- # Make request
3777
- endpoint = f'/v1/map'
3778
- response = await self._async_post_request(
3779
- f'{self.api_url}{endpoint}',
3780
- params_dict,
3781
- headers={"Authorization": f"Bearer {self.api_key}"}
3782
- )
3783
-
3784
- if response.get('success') and 'links' in response:
3785
- return MapResponse(**response)
3786
- elif 'error' in response:
3787
- raise Exception(f'Failed to map URL. Error: {response["error"]}')
3788
- else:
3789
- raise Exception(f'Failed to map URL. Error: {response}')
3790
-
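A sketch for link discovery; the links attribute follows from the 'links' check in the response handling above, and the usual import/constructor assumptions and placeholders apply.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed top-level export

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    result = await app.map_url("https://example.com", search="pricing", limit=50)  # placeholders
    for link in result.links or []:
        print(link)

asyncio.run(main())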
3791
- async def extract(
3792
- self,
3793
- urls: Optional[List[str]] = None,
3794
- *,
3795
- prompt: Optional[str] = None,
3796
- schema: Optional[Any] = None,
3797
- system_prompt: Optional[str] = None,
3798
- allow_external_links: Optional[bool] = False,
3799
- enable_web_search: Optional[bool] = False,
3800
- show_sources: Optional[bool] = False,
3801
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3802
-
3803
- """
3804
- Asynchronously extract structured information from URLs.
3805
-
3806
- Args:
3807
- urls (Optional[List[str]]): URLs to extract from
3808
- prompt (Optional[str]): Custom extraction prompt
3809
- schema (Optional[Any]): JSON schema/Pydantic model
3810
- system_prompt (Optional[str]): System context
3811
- allow_external_links (Optional[bool]): Follow external links
3812
- enable_web_search (Optional[bool]): Enable web search
3813
- show_sources (Optional[bool]): Include source URLs
3814
- agent (Optional[Dict[str, Any]]): Agent configuration
3815
-
3816
- Returns:
3817
- ExtractResponse with:
3818
- * Structured data matching schema
3819
- * Source information if requested
3820
- * Success/error status
3821
-
3822
- Raises:
3823
- ValueError: If prompt/schema missing or extraction fails
3824
- """
3825
- headers = self._prepare_headers()
3826
-
3827
- if not prompt and not schema:
3828
- raise ValueError("Either prompt or schema is required")
3829
-
3830
- if not urls and not prompt:
3831
- raise ValueError("Either urls or prompt is required")
3832
-
3833
- if schema:
3834
- schema = self._ensure_schema_dict(schema)
3835
-
3836
- request_data = {
3837
- 'urls': urls or [],
3838
- 'allowExternalLinks': allow_external_links,
3839
- 'enableWebSearch': enable_web_search,
3840
- 'showSources': show_sources,
3841
- 'schema': schema,
3842
- 'origin': f'python-sdk@{get_version()}'
3843
- }
3844
-
3845
- # Only add prompt and systemPrompt if they exist
3846
- if prompt:
3847
- request_data['prompt'] = prompt
3848
- if system_prompt:
3849
- request_data['systemPrompt'] = system_prompt
3850
-
3851
- if agent:
3852
- request_data['agent'] = agent
3853
-
3854
- response = await self._async_post_request(
3855
- f'{self.api_url}/v1/extract',
3856
- request_data,
3857
- headers
3858
- )
3859
-
3860
- if response.get('success'):
3861
- job_id = response.get('id')
3862
- if not job_id:
3863
- raise Exception('Job ID not returned from extract request.')
3864
-
3865
- while True:
3866
- status_data = await self._async_get_request(
3867
- f'{self.api_url}/v1/extract/{job_id}',
3868
- headers
3869
- )
3870
-
3871
- if status_data['status'] == 'completed':
3872
- return ExtractResponse(**status_data)
3873
- elif status_data['status'] in ['failed', 'cancelled']:
3874
- raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
3875
-
3876
- await asyncio.sleep(2)
3877
- else:
3878
- raise Exception(f'Failed to extract. Error: {response.get("error")}')
3879
-
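A sketch of a structured extraction call with a plain JSON schema (a Pydantic model would also be accepted per the docstring); schema, URL, and key are placeholders, and the usual import/constructor assumptions apply.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed top-level export

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    schema = {
        "type": "object",
        "properties": {"title": {"type": "string"}, "price": {"type": "string"}},
        "required": ["title"],
    }
    result = await app.extract(
        ["https://example.com/product"],              # placeholder URL
        prompt="Extract the product title and price.",
        schema=schema,
    )
    print(result.data)

asyncio.run(main())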
3880
- async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
3881
- """
3882
- Check the status of an asynchronous batch scrape job.
3883
-
3884
- Args:
3885
- id (str): The ID of the batch scrape job
3886
-
3887
- Returns:
3888
- BatchScrapeStatusResponse containing:
3889
- Status Information:
3890
- * status - Current state (scraping/completed/failed/cancelled)
3891
- * completed - Number of URLs scraped
3892
- * total - Total URLs to scrape
3893
- * creditsUsed - API credits consumed
3894
- * expiresAt - Data expiration timestamp
3895
-
3896
- Results:
3897
- * data - List of scraped documents
3898
- * next - URL for next page of results (if paginated)
3899
- * success - Whether status check succeeded
3900
- * error - Error message if failed
3901
-
3902
- Raises:
3903
- Exception: If status check fails
3904
- """
3905
- headers = self._prepare_headers()
3906
- endpoint = f'/v1/batch/scrape/{id}'
3907
-
3908
- status_data = await self._async_get_request(
3909
- f'{self.api_url}{endpoint}',
3910
- headers
3911
- )
3912
-
3913
- if status_data['status'] == 'completed':
3914
- if 'data' in status_data:
3915
- data = status_data['data']
3916
- while 'next' in status_data:
3917
- if len(status_data['data']) == 0:
3918
- break
3919
- next_url = status_data.get('next')
3920
- if not next_url:
3921
- logger.warning("Expected 'next' URL is missing.")
3922
- break
3923
- next_data = await self._async_get_request(next_url, headers)
3924
- data.extend(next_data.get('data', []))
3925
- status_data = next_data
3926
- status_data['data'] = data
3927
-
3928
- response = BatchScrapeStatusResponse(
3929
- status=status_data.get('status'),
3930
- total=status_data.get('total'),
3931
- completed=status_data.get('completed'),
3932
- creditsUsed=status_data.get('creditsUsed'),
3933
- expiresAt=status_data.get('expiresAt'),
3934
- data=status_data.get('data'),
- success=False if 'error' in status_data else True
3935
- )
3936
-
3937
- if 'error' in status_data:
3938
- response.error = status_data.get('error')
3939
-
3940
- if 'next' in status_data:
3941
- response.next = status_data.get('next')
3942
-
3943
- return response
3947
-
3948
- async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
3949
- """
3950
- Get information about errors from an asynchronous batch scrape job.
3951
-
3952
- Args:
3953
- id (str): The ID of the batch scrape job
3954
-
3955
- Returns:
3956
- CrawlErrorsResponse containing:
3957
- errors (List[Dict[str, str]]): List of errors with fields:
3958
- * id (str): Error ID
3959
- * timestamp (str): When the error occurred
3960
- * url (str): URL that caused the error
3961
- * error (str): Error message
3962
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3963
-
3964
- Raises:
3965
- Exception: If error check fails
3966
- """
3967
- headers = self._prepare_headers()
3968
- return await self._async_get_request(
3969
- f'{self.api_url}/v1/batch/scrape/{id}/errors',
3970
- headers
3971
- )
3972
-
3973
- async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
3974
- """
3975
- Get information about errors from an asynchronous crawl job.
3976
-
3977
- Args:
3978
- id (str): The ID of the crawl job
3979
-
3980
- Returns:
3981
- CrawlErrorsResponse containing:
3982
- * errors (List[Dict[str, str]]): List of errors with fields:
3983
- - id (str): Error ID
3984
- - timestamp (str): When the error occurred
3985
- - url (str): URL that caused the error
3986
- - error (str): Error message
3987
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3988
-
3989
- Raises:
3990
- Exception: If error check fails
3991
- """
3992
- headers = self._prepare_headers()
3993
- return await self._async_get_request(
3994
- f'{self.api_url}/v1/crawl/{id}/errors',
3995
- headers
3996
- )
3997
-
3998
- async def cancel_crawl(self, id: str) -> Dict[str, Any]:
3999
- """
4000
- Cancel an asynchronous crawl job.
4001
-
4002
- Args:
4003
- id (str): The ID of the crawl job to cancel
4004
-
4005
- Returns:
4006
- Dict[str, Any] containing:
4007
- * success (bool): Whether cancellation was successful
4008
- * error (str, optional): Error message if cancellation failed
4009
-
4010
- Raises:
4011
- Exception: If cancellation fails
4012
- """
4013
- headers = self._prepare_headers()
4014
- async with aiohttp.ClientSession() as session:
4015
- async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
4016
- return await response.json()
4017
-
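Cancellation pairs naturally with async_crawl_url; a sketch under the same assumptions, reading the returned payload as the success/error dictionary described in the docstring. Key and URL are placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed top-level export

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    job = await app.async_crawl_url("https://example.com", limit=500)  # placeholder URL
    # ... later, if the crawl is no longer needed ...
    result = await app.cancel_crawl(job.id)
    print("cancelled" if result.get("success") else result.get("error"))

asyncio.run(main())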
4018
- async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
4019
- """
4020
- Check the status of an asynchronous extraction job.
4021
-
4022
- Args:
4023
- job_id (str): The ID of the extraction job
4024
-
4025
- Returns:
4026
- ExtractResponse[Any] with:
4027
- * success (bool): Whether request succeeded
4028
- * data (Optional[Any]): Extracted data matching schema
4029
- * error (Optional[str]): Error message if any
4030
- * warning (Optional[str]): Warning message if any
4031
- * sources (Optional[List[str]]): Source URLs if requested
4032
-
4033
- Raises:
4034
- ValueError: If status check fails
4035
- """
4036
- headers = self._prepare_headers()
4037
- try:
4038
- return await self._async_get_request(
4039
- f'{self.api_url}/v1/extract/{job_id}',
4040
- headers
4041
- )
4042
- except Exception as e:
4043
- raise ValueError(str(e))
4044
-
4045
- async def async_extract(
4046
- self,
4047
- urls: Optional[List[str]] = None,
4048
- *,
4049
- prompt: Optional[str] = None,
4050
- schema: Optional[Any] = None,
4051
- system_prompt: Optional[str] = None,
4052
- allow_external_links: Optional[bool] = False,
4053
- enable_web_search: Optional[bool] = False,
4054
- show_sources: Optional[bool] = False,
4055
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
4056
- """
4057
- Initiate an asynchronous extraction job without waiting for completion.
4058
-
4059
- Args:
4060
- urls (Optional[List[str]]): URLs to extract from
4061
- prompt (Optional[str]): Custom extraction prompt
4062
- schema (Optional[Any]): JSON schema/Pydantic model
4063
- system_prompt (Optional[str]): System context
4064
- allow_external_links (Optional[bool]): Follow external links
4065
- enable_web_search (Optional[bool]): Enable web search
4066
- show_sources (Optional[bool]): Include source URLs
4067
- agent (Optional[Dict[str, Any]]): Agent configuration
4069
-
4070
- Returns:
4071
- ExtractResponse[Any] with:
4072
- * success (bool): Whether request succeeded
4073
- * data (Optional[Any]): Extracted data matching schema
4074
- * error (Optional[str]): Error message if any
4075
-
4076
- Raises:
4077
- ValueError: If job initiation fails
4078
- """
4079
- headers = self._prepare_headers()
4080
-
4081
- if not prompt and not schema:
4082
- raise ValueError("Either prompt or schema is required")
4083
-
4084
- if not urls and not prompt:
4085
- raise ValueError("Either urls or prompt is required")
4086
-
4087
- if schema:
4088
- schema = self._ensure_schema_dict(schema)
4089
-
4090
- request_data = {
- 'urls': urls or [],
- 'allowExternalLinks': allow_external_links,
- 'enableWebSearch': enable_web_search,
- 'showSources': show_sources,
- 'schema': schema,
- 'origin': f'python-sdk@{version}'
- }
4098
-
4099
- if prompt:
4100
- request_data['prompt'] = prompt
4101
- if system_prompt:
4102
- request_data['systemPrompt'] = system_prompt
4103
- if agent:
4104
- request_data['agent'] = agent
4105
-
4106
- try:
4107
- return await self._async_post_request(
4108
- f'{self.api_url}/v1/extract',
4109
- request_data,
4110
- headers
4111
- )
4112
- except Exception as e:
4113
- raise ValueError(str(e))
4114
-
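As implemented, async_extract and get_extract_status return the parsed JSON payload from the API rather than instantiated ExtractResponse models, so a caller reads them as dictionaries. A sketch, with the usual import/constructor assumptions and placeholder key and URL:

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed top-level export

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    started = await app.async_extract(
        ["https://example.com"],                       # placeholder URL
        prompt="Summarize what this company sells.",
    )
    job_id = started.get("id")
    status = await app.get_extract_status(job_id)
    print(status.get("status"), status.get("data"))

asyncio.run(main())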
4115
- async def generate_llms_text(
4116
- self,
4117
- url: str,
4118
- *,
4119
- max_urls: Optional[int] = None,
4120
- show_full_text: Optional[bool] = None,
- cache: Optional[bool] = None,
4121
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
4122
- """
4123
- Generate LLMs.txt for a given URL and monitor until completion.
4124
-
4125
- Args:
4126
- url (str): Target URL to generate LLMs.txt from
4127
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
4128
- show_full_text (Optional[bool]): Include full text in output (default: False)
- cache (Optional[bool]): Whether to use cached content if available (default: True)
4129
- experimental_stream (Optional[bool]): Enable experimental streaming
4130
-
4131
- Returns:
4132
- GenerateLLMsTextStatusResponse containing:
4133
- * success (bool): Whether generation completed successfully
4134
- * status (str): Status of generation (processing/completed/failed)
4135
- * data (Dict[str, str], optional): Generated text with fields:
4136
- - llmstxt (str): Generated LLMs.txt content
4137
- - llmsfulltxt (str, optional): Full version if requested
4138
- * error (str, optional): Error message if generation failed
4139
- * expiresAt (str): When the generated data expires
4140
-
4141
- Raises:
4142
- Exception: If generation fails
4143
- """
4144
- params = {}
4145
- if max_urls is not None:
4146
- params['maxUrls'] = max_urls
4147
- if show_full_text is not None:
4148
- params['showFullText'] = show_full_text
4149
- if experimental_stream is not None:
4150
- params['__experimental_stream'] = experimental_stream
4151
-
4152
- response = await self.async_generate_llms_text(
4153
- url,
4154
- max_urls=max_urls,
4155
- show_full_text=show_full_text,
4156
- cache=cache,
4157
- experimental_stream=experimental_stream
4158
- )
4159
- if not response.get('success') or 'id' not in response:
4160
- return response
4161
-
4162
- job_id = response['id']
4163
- while True:
4164
- status = await self.check_generate_llms_text_status(job_id)
4165
-
4166
- if status['status'] == 'completed':
4167
- return status
4168
- elif status['status'] == 'failed':
4169
- raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
4170
- elif status['status'] != 'processing':
4171
- break
4172
-
4173
- await asyncio.sleep(2)
4174
-
4175
- return GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly')
4176
-
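A sketch of the blocking LLMs.txt generator; as with the extract helpers, the implementation returns the parsed status payload, so it is read here as a dictionary. Import, constructor, key, and URL are the usual assumptions and placeholders.

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed top-level export

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    result = await app.generate_llms_text(
        "https://example.com",   # placeholder URL
        max_urls=10,
        show_full_text=False,
    )
    if result.get("status") == "completed":
        print(result.get("data", {}).get("llmstxt", "")[:500])
    else:
        print("generation did not complete:", result.get("error"))

asyncio.run(main())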
4177
- async def async_generate_llms_text(
4178
- self,
4179
- url: str,
4180
- *,
4181
- max_urls: Optional[int] = None,
4182
- show_full_text: Optional[bool] = None,
4183
- cache: Optional[bool] = None,
4184
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
4185
- """
4186
- Initiate an asynchronous LLMs.txt generation job without waiting for completion.
4187
-
4188
- Args:
4189
- url (str): Target URL to generate LLMs.txt from
4190
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
4191
- show_full_text (Optional[bool]): Include full text in output (default: False)
4192
- cache (Optional[bool]): Whether to use cached content if available (default: True)
4193
- experimental_stream (Optional[bool]): Enable experimental streaming
4194
-
4195
- Returns:
4196
- GenerateLLMsTextResponse containing:
4197
- * success (bool): Whether job started successfully
4198
- * id (str): Unique identifier for the job
4199
- * error (str, optional): Error message if start failed
4200
-
4201
- Raises:
4202
- ValueError: If job initiation fails
4203
- """
4204
-
4212
- params = GenerateLLMsTextParams(
4213
- maxUrls=max_urls,
4214
- showFullText=show_full_text,
4215
- cache=cache,
4216
- __experimental_stream=experimental_stream
4217
- )
4218
-
4219
- headers = self._prepare_headers()
4220
- json_data = {'url': url, **params.dict(exclude_none=True)}
4221
- json_data['origin'] = f"python-sdk@{version}"
4222
-
4223
- try:
4224
- return await self._async_post_request(
4225
- f'{self.api_url}/v1/llmstxt',
4226
- json_data,
4227
- headers
4228
- )
4229
- except Exception as e:
4230
- raise ValueError(str(e))
4231
-
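A short sketch of the fire-and-forget pattern with async_generate_llms_text; the helper name and the dict-style response access are illustrative assumptions, not SDK surface.

    async def start_llms_job(app, url: str):
        # Hypothetical helper: kick off generation and return the job id for later polling.
        job = await app.async_generate_llms_text(url, max_urls=10, cache=True)
        if not job.get("success"):
            raise RuntimeError(job.get("error", "failed to start LLMs.txt generation"))
        return job["id"]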
4232
- async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
4233
- """
4234
- Check the status of an asynchronous LLMs.txt generation job.
4235
-
4236
- Args:
4237
- id (str): The ID of the generation job
4238
-
4239
- Returns:
4240
- GenerateLLMsTextStatusResponse containing:
4241
- * success (bool): Whether generation completed successfully
4242
- * status (str): Status of generation (processing/completed/failed)
4243
- * data (Dict[str, str], optional): Generated text with fields:
4244
- - llmstxt (str): Generated LLMs.txt content
4245
- - llmsfulltxt (str, optional): Full version if requested
4246
- * error (str, optional): Error message if generation failed
4247
- * expiresAt (str): When the generated data expires
4248
-
4249
- Raises:
4250
- ValueError: If status check fails
4251
- """
4252
- headers = self._prepare_headers()
4253
- try:
4254
- return await self._async_get_request(
4255
- f'{self.api_url}/v1/llmstxt/{id}',
4256
- headers
4257
- )
4258
- except Exception as e:
4259
- raise ValueError(str(e))
4260
-
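A polling sketch built on check_generate_llms_text_status; wait_for_llmstxt and its interval argument are illustrative, mirroring the 2-second loop in the waiter above.

    import asyncio

    async def wait_for_llmstxt(app, job_id: str, interval: float = 2.0):
        # Poll until the job leaves the 'processing' state.
        while True:
            status = await app.check_generate_llms_text_status(job_id)
            if status.get("status") in ("completed", "failed"):
                return status
            await asyncio.sleep(interval)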
4261
- async def deep_research(
4262
- self,
4263
- query: str,
4264
- *,
4265
- max_depth: Optional[int] = None,
4266
- time_limit: Optional[int] = None,
4267
- max_urls: Optional[int] = None,
4268
- analysis_prompt: Optional[str] = None,
4269
- system_prompt: Optional[str] = None,
4270
- __experimental_stream_steps: Optional[bool] = None,
4271
- on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
4272
- on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
4273
- """
4274
- Initiates a deep research operation on a given query and polls until completion.
4275
-
4276
- Args:
4277
- query (str): Research query or topic to investigate
4278
- max_depth (Optional[int]): Maximum depth of research exploration
4279
- time_limit (Optional[int]): Time limit in seconds for research
4280
- max_urls (Optional[int]): Maximum number of URLs to process
4281
- analysis_prompt (Optional[str]): Custom prompt for analysis
4282
- system_prompt (Optional[str]): Custom system prompt
4283
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4284
- on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
4285
- on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
4286
-
4287
- Returns:
4288
- DeepResearchStatusResponse containing:
4289
- * success (bool): Whether research completed successfully
4290
- * status (str): Current state (processing/completed/failed)
4291
- * error (Optional[str]): Error message if failed
4292
- * id (str): Unique identifier for the research job
4293
- * data (Any): Research findings and analysis
4294
- * sources (List[Dict]): List of discovered sources
4295
- * activities (List[Dict]): Research progress log
4296
- * summaries (List[str]): Generated research summaries
4297
-
4298
- Raises:
4299
- Exception: If research fails
4300
- """
4301
-
4316
- response = await self.async_deep_research(
4317
- query,
4318
- max_depth=max_depth,
4319
- time_limit=time_limit,
4320
- max_urls=max_urls,
4321
- analysis_prompt=analysis_prompt,
4322
- system_prompt=system_prompt
4323
- )
4324
- if not response.get('success') or 'id' not in response:
4325
- return response
4326
-
4327
- job_id = response['id']
4328
- last_activity_count = 0
4329
- last_source_count = 0
4330
-
4331
- while True:
4332
- status = await self.check_deep_research_status(job_id)
4333
-
4334
- if on_activity and 'activities' in status:
4335
- new_activities = status['activities'][last_activity_count:]
4336
- for activity in new_activities:
4337
- on_activity(activity)
4338
- last_activity_count = len(status['activities'])
4339
-
4340
- if on_source and 'sources' in status:
4341
- new_sources = status['sources'][last_source_count:]
4342
- for source in new_sources:
4343
- on_source(source)
4344
- last_source_count = len(status['sources'])
4345
-
4346
- if status['status'] == 'completed':
4347
- return status
4348
- elif status['status'] == 'failed':
4349
- raise Exception(f'Deep research failed. Error: {status.get("error")}')
4350
- elif status['status'] != 'processing':
4351
- break
4352
-
4353
- await asyncio.sleep(2)
4354
-
4355
- return DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')
4356
-
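A callback-driven sketch for deep_research; the callbacks receive plain dicts as documented above, and the helper name and query values are illustrative.

    async def run_research(app, topic: str):
        # Print progress and discovered sources while the research job runs.
        def on_activity(activity):
            print(f"[{activity.get('type')}] {activity.get('message')}")

        def on_source(source):
            print(f"source: {source.get('url')}")

        result = await app.deep_research(
            topic,
            max_depth=3,
            time_limit=120,
            on_activity=on_activity,
            on_source=on_source,
        )
        return result.get("data")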
4357
- async def async_deep_research(
4358
- self,
4359
- query: str,
4360
- *,
4361
- max_depth: Optional[int] = None,
4362
- time_limit: Optional[int] = None,
4363
- max_urls: Optional[int] = None,
4364
- analysis_prompt: Optional[str] = None,
4365
- system_prompt: Optional[str] = None,
4366
- __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
4367
- """
4368
- Initiates an asynchronous deep research operation.
4369
-
4370
- Args:
4371
- query (str): Research query or topic to investigate
4372
- max_depth (Optional[int]): Maximum depth of research exploration
4373
- time_limit (Optional[int]): Time limit in seconds for research
4374
- max_urls (Optional[int]): Maximum number of URLs to process
4375
- analysis_prompt (Optional[str]): Custom prompt for analysis
4376
- system_prompt (Optional[str]): Custom system prompt
4377
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4378
-
4379
- Returns:
4380
- Dict[str, Any]: A response containing:
4381
- * success (bool): Whether the research initiation was successful
4382
- * id (str): The unique identifier for the research job
4383
- * error (str, optional): Error message if initiation failed
4384
-
4385
- Raises:
4386
- ValueError: If the research initiation fails.
4387
- """
4388
- research_params = {}
4389
- if max_depth is not None:
4390
- research_params['maxDepth'] = max_depth
4391
- if time_limit is not None:
4392
- research_params['timeLimit'] = time_limit
4393
- if max_urls is not None:
4394
- research_params['maxUrls'] = max_urls
4395
- if analysis_prompt is not None:
4396
- research_params['analysisPrompt'] = analysis_prompt
4397
- if system_prompt is not None:
4398
- research_params['systemPrompt'] = system_prompt
4399
- if __experimental_stream_steps is not None:
4400
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
4401
- research_params = DeepResearchParams(**research_params)
4402
-
4403
- headers = self._prepare_headers()
4404
-
4405
- json_data = {'query': query, **research_params.dict(exclude_none=True)}
4406
- json_data['origin'] = f"python-sdk@{version}"
4407
-
4408
- try:
4409
- return await self._async_post_request(
4410
- f'{self.api_url}/v1/deep-research',
4411
- json_data,
4412
- headers
4413
- )
4414
- except Exception as e:
4415
- raise ValueError(str(e))
4416
-
4417
- async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
4418
- """
4419
- Check the status of a deep research operation.
4420
-
4421
- Args:
4422
- id (str): The ID of the deep research operation.
4423
-
4424
- Returns:
4425
- DeepResearchStatusResponse containing:
4426
-
4427
- Status:
4428
- * success - Whether research completed successfully
4429
- * status - Current state (processing/completed/failed)
4430
- * error - Error message if failed
4431
-
4432
- Results:
4433
- * id - Unique identifier for the research job
4434
- * data - Research findings and analysis
4435
- * sources - List of discovered sources
4436
- * activities - Research progress log
4437
- * summaries - Generated research summaries
4438
-
4439
- Raises:
4440
- ValueError: If the status check fails.
4441
- """
4442
- headers = self._prepare_headers()
4443
- try:
4444
- return await self._async_get_request(
4445
- f'{self.api_url}/v1/deep-research/{id}',
4446
- headers
4447
- )
4448
- except Exception as e:
4449
- raise ValueError(str(e))
4450
-
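The manual equivalent of the waiter, combining async_deep_research with check_deep_research_status; the helper name and the 2-second interval are assumptions mirroring the built-in loop.

    import asyncio

    async def research_manually(app, topic: str):
        # Start the job, then poll its status until it is no longer 'processing'.
        job = await app.async_deep_research(topic, max_urls=20)
        if not job.get("success"):
            return job
        while True:
            status = await app.check_deep_research_status(job["id"])
            if status.get("status") != "processing":
                return status
            await asyncio.sleep(2)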
4451
- async def search(
4452
- self,
4453
- query: str,
4454
- *,
4455
- limit: Optional[int] = None,
4456
- tbs: Optional[str] = None,
4457
- filter: Optional[str] = None,
4458
- lang: Optional[str] = None,
4459
- country: Optional[str] = None,
4460
- location: Optional[str] = None,
4461
- timeout: Optional[int] = None,
4462
- scrape_options: Optional[ScrapeOptions] = None,
4463
- params: Optional[Union[Dict[str, Any], SearchParams]] = None,
4464
- **kwargs) -> SearchResponse:
4465
- """
4466
- Asynchronously search for content using Firecrawl.
4467
-
4468
- Args:
4469
- query (str): Search query string
4470
- limit (Optional[int]): Max results (default: 5)
4471
- tbs (Optional[str]): Time filter (e.g. "qdr:d")
4472
- filter (Optional[str]): Custom result filter
4473
- lang (Optional[str]): Language code (default: "en")
4474
- country (Optional[str]): Country code (default: "us")
4475
- location (Optional[str]): Geo-targeting
4476
- timeout (Optional[int]): Request timeout in milliseconds
4477
- scrape_options (Optional[ScrapeOptions]): Result scraping configuration
4478
- params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
4479
- **kwargs: Additional keyword arguments for future compatibility
4480
-
4481
- Returns:
4482
- SearchResponse: Response containing:
4483
- * success (bool): Whether request succeeded
4484
- * data (List[FirecrawlDocument]): Search results
4485
- * warning (Optional[str]): Warning message if any
4486
- * error (Optional[str]): Error message if any
4487
-
4488
- Raises:
4489
- Exception: If search fails or response cannot be parsed
4490
- """
4491
- # Build search parameters
4492
- search_params = {}
4493
- if params:
4494
- if isinstance(params, dict):
4495
- search_params.update(params)
4496
- else:
4497
- search_params.update(params.dict(exclude_none=True))
4498
-
4499
- # Add individual parameters
4500
- if limit is not None:
4501
- search_params['limit'] = limit
4502
- if tbs is not None:
4503
- search_params['tbs'] = tbs
4504
- if filter is not None:
4505
- search_params['filter'] = filter
4506
- if lang is not None:
4507
- search_params['lang'] = lang
4508
- if country is not None:
4509
- search_params['country'] = country
4510
- if location is not None:
4511
- search_params['location'] = location
4512
- if timeout is not None:
4513
- search_params['timeout'] = timeout
4514
- if scrape_options is not None:
4515
- search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
4516
-
4517
- # Add any additional kwargs
4518
- search_params.update(kwargs)
4519
-
4520
- # Create final params object
4521
- final_params = SearchParams(query=query, **search_params)
4522
- params_dict = final_params.dict(exclude_none=True)
4523
- params_dict['origin'] = f"python-sdk@{version}"
4524
-
4525
- return await self._async_post_request(
4526
- f"{self.api_url}/v1/search",
4527
- params_dict,
4528
- {"Authorization": f"Bearer {self.api_key}"}
4529
- )
4530
-
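A small sketch of the async search call; the helper name is illustrative, and the dict-style result access assumes the raw response shape returned by _async_post_request.

    async def top_results(app, query: str, n: int = 3):
        # Search and print the first few result URLs and titles.
        results = await app.search(query, limit=n, tbs="qdr:w")  # past-week time filter
        for doc in results.get("data", []):
            print(doc.get("url"), "-", doc.get("title"))
        return results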
4531
- class AsyncCrawlWatcher(CrawlWatcher):
4532
- """
4533
- Async version of CrawlWatcher that properly handles async operations.
4534
- """
4535
- def __init__(self, id: str, app: AsyncFirecrawlApp):
4536
- super().__init__(id, app)
4537
-
4538
- async def connect(self) -> None:
4539
- """
4540
- Establishes async WebSocket connection and starts listening for messages.
4541
- """
4542
- async with websockets.connect(
4543
- self.ws_url,
4544
- additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
4545
- ) as websocket:
4546
- await self._listen(websocket)
4547
-
4548
- async def _listen(self, websocket) -> None:
4549
- """
4550
- Listens for incoming WebSocket messages and handles them asynchronously.
4551
-
4552
- Args:
4553
- websocket: The WebSocket connection object
4554
- """
4555
- async for message in websocket:
4556
- msg = json.loads(message)
4557
- await self._handle_message(msg)
4558
-
4559
- async def _handle_message(self, msg: Dict[str, Any]) -> None:
4560
- """
4561
- Handles incoming WebSocket messages based on their type asynchronously.
4562
-
4563
- Args:
4564
- msg (Dict[str, Any]): The message to handle
4565
- """
4566
- if msg['type'] == 'done':
4567
- self.status = 'completed'
4568
- self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
4569
- elif msg['type'] == 'error':
4570
- self.status = 'failed'
4571
- self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
4572
- elif msg['type'] == 'catchup':
4573
- self.status = msg['data']['status']
4574
- self.data.extend(msg['data'].get('data', []))
4575
- for doc in self.data:
4576
- self.dispatch_event('document', {'data': doc, 'id': self.id})
4577
- elif msg['type'] == 'document':
4578
- self.data.append(msg['data'])
4579
- self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
4580
-
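A sketch of watching a crawl with AsyncCrawlWatcher. It assumes the crawl id comes from a crawl started elsewhere on the app, and that event registration uses the add_event_listener method inherited from CrawlWatcher; both are assumptions about SDK surface not shown in this file.

    async def watch_crawl(app, crawl_id: str):
        # Build a watcher for an existing crawl job and stream events until done.
        watcher = AsyncCrawlWatcher(crawl_id, app)

        def on_document(event):
            print("document:", event["data"].get("metadata", {}).get("sourceURL"))

        # add_event_listener is assumed to be provided by the parent CrawlWatcher class.
        watcher.add_event_listener("document", on_document)
        watcher.add_event_listener("done", lambda event: print("crawl finished"))

        await watcher.connect()  # returns after the 'done' or 'error' message arrives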
4581
- async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
4582
- """
4583
- Handle errors from async API responses.
4584
- """
4585
- try:
4586
- error_data = await response.json()
4587
- error_message = error_data.get('error', 'No error message provided.')
4588
- error_details = error_data.get('details', 'No additional error details provided.')
4589
- except Exception:
4590
- raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
4591
-
4592
- # Use the app's method to get the error message
4593
- message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
4594
-
4595
- raise aiohttp.ClientError(message)
4596
-
4597
- async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
4598
- """
4599
- Generate a standardized error message based on HTTP status code for async operations.
4600
-
4601
- Args:
4602
- status_code (int): The HTTP status code from the response
4603
- action (str): Description of the action that was being performed
4604
- error_message (str): The error message from the API response
4605
- error_details (str): Additional error details from the API response
4606
-
4607
- Returns:
4608
- str: A formatted error message
4609
- """
4610
- return self._get_error_message(status_code, action, error_message, error_details)
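Errors from these async helpers surface either as ValueError (re-raised in the wrappers above) or as aiohttp.ClientError carrying the message built by _get_async_error_message; a minimal handling sketch, with the wrapper name as an assumption.

    import aiohttp

    async def safe_deep_research(app, topic: str):
        try:
            return await app.async_deep_research(topic)
        except (ValueError, aiohttp.ClientError) as err:
            # The message includes the status-code-specific text from _get_error_message.
            print(f"Firecrawl request failed: {err}")
            return None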